DataDog · vlad-scherbich · May 17, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
@@ -183,6 +183,15 @@ class Sample
     // This is useful when the Sample object is embedded and will be destroyed later
     bool export_sample();
 
+    // Flip the sign on the heap_space value so this sample is emitted as a
+    // tombstone (negative delta) by a subsequent export_sample() call.
+    // NOT idempotent: each call toggles the sign, so calling twice returns the
+    // original positive value. Heap-tracker callers must apply this exactly
+    // once per REMOVE event (see the tombstone_applied guard in
+    // _memalloc_heap.cpp) so retries on transient libdatadog rejection emit a
+    // stable negative value instead of toggling sign on every attempt.
+    void negate_heap_space();
+
     static ProfileBorrow profile_borrow();
     static void postfork_child();
     static void cleanup();

@@ -431,6 +431,20 @@ Datadog::Sample::export_sample()
     return ProfilerState::get().profile_state.collect(sample, endtime_ns);
 }
 
+void
+Datadog::Sample::negate_heap_space()
+{
+    // Flip the sign on heap_space so a subsequent export_sample() emits this
+    // sample as a tombstone. libdatadog accepts negative i64 values
+    // (sample.rs Value is &[i64] with no sign validation).
+    if (0U != (type_mask & SampleType::Heap)) {
+        const size_t heap_space_idx = ProfilerState::get().profile_state.val().heap_space;
+        if (heap_space_idx < values.size()) {
+            values[heap_space_idx] = -values[heap_space_idx];
+        }
+    }
+}
+
 bool
 Datadog::Sample::push_cputime(int64_t cputime, int64_t count)
 {

@@ -128,6 +128,35 @@ class heap_tracker_t
     void postfork_child();
 
   private:
+    /* Delta-export change event. ADD events reference the live traceback in
+     * allocs_m via raw pointer; REMOVE events own the extracted unique_ptr.
+     *
+     * The raw pointer in ADD is valid until export because the traceback is
+     * owned by either allocs_m or a subsequent REMOVE event in pending_changes
+     * (both are cleared together at export time). REMOVE events emit a
+     * negative tombstone — heap_space is negated lazily at emit time so an
+     * earlier ADD that aliases the same traceback (same ptr, same snapshot
+     * interval) reads the original positive value. `tombstone_applied` guards
+     * against re-negation when an emit fails and we retry. */
+    struct change_event
+    {
+        enum Kind : uint8_t
+        {
+            ADD,
+            REMOVE
+        };
+        void* ptr;
+        traceback_t* tb;
+        std::unique_ptr<traceback_t> owner; // set for REMOVE, null for ADD
+        Kind kind;
+        bool tombstone_applied = false; // REMOVE only; true after heap_space is negated
+        uint8_t failed_attempts = 0;    // export retries counter; events exceeding MAX are dropped
+    };
+
+    /* Cap retries on a single event before dropping it, so a persistent
+     * libdatadog rejection cannot grow pending_changes without bound. */
+    static constexpr uint8_t MAX_EXPORT_RETRIES = 2;
+
     uint32_t next_sample_size_no_cpython(uint32_t sample_size);
 
     /* This function is called from heap_tracker_t::postfork_child() as part of
@@ -161,6 +190,31 @@ class heap_tracker_t
 
     /* Initial capacity of the allocations map */
     static constexpr size_t INITAL_ALLOC_MAP_CAPACITY = 512;
+
+    /* Delta export state. pending_changes accumulates ADD/REMOVE events between
+     * snapshots; export_heap_no_cpython drains it and emits the deltas.
+     *
+     * Pre-reserved at construction so steady-state push_back is allocation-free.
+     * On pathological bursts the vector may reallocate. The reallocation goes
+     * through ::operator new (system malloc), NOT PyMem_Malloc, so it does not
+     * reenter the Python allocator hooks where untrack_no_cpython runs from.
+     * (Reentry would also be caught by memalloc_reentrant_guard_t earlier in
+     * the call.) Dropping events at the push site instead would risk
+     * use-after-clear on a queued ADD whose traceback gets pool_put'd by a
+     * matching REMOVE that overflowed — letting the vector grow is the safe
+     * option. */
+    static constexpr size_t PENDING_CHANGES_RESERVE = 4096;
+    std::vector<change_event> pending_changes;
+
+    /* Pair-collapse side map: ptr -> index of its pending ADD event in
+     * pending_changes. When untrack arrives for a ptr that has a still-pending
+     * ADD (i.e., the allocation was tracked and freed within the same snapshot
+     * interval), both events are canceled before reaching libdatadog — the ADD
+     * is marked `collapsed`, no REMOVE is queued, and the traceback returns to
+     * the pool immediately. Saves ~2 Profile_add2 calls per churn pair, which
+     * dominates CPU on alloc-then-free hot loops (see PR #18125 review). */
+    static constexpr size_t PENDING_ADD_IDX_RESERVE = 1024;
+    HeapMapType<void*, size_t> pending_add_idx;
 };
 
 // Pool implementation
@@ -223,6 +277,8 @@ heap_tracker_t::heap_tracker_t(uint32_t sample_size_val)
     pool.reserve(POOL_CAPACITY);
     // Pre-allocate map capacity to avoid rehashing during ramp-up.
     allocs_m.reserve(INITAL_ALLOC_MAP_CAPACITY);
+    pending_changes.reserve(PENDING_CHANGES_RESERVE);
+    pending_add_idx.reserve(PENDING_ADD_IDX_RESERVE);
 }
 
 void
@@ -231,9 +287,48 @@ heap_tracker_t::untrack_no_cpython(void* ptr)
     memalloc_gil_debug_guard_t guard(gil_guard);
 
     auto node = allocs_m.extract(ptr);
-    if (!node.empty()) {
-        pool_put_no_cpython(std::move(node.mapped()));
+    if (node.empty()) {
+        return;
+    }
+    auto owner = std::move(node.mapped());
+
+    /* Pair-collapse: if a still-pending ADD targets this same ptr, the
+     * allocation was tracked AND freed within this snapshot interval — both
+     * events would aggregate to a zero bucket on the backend anyway, so skip
+     * both and reclaim the traceback immediately. Eliminates the dominant
+     * libdatadog cost in alloc-then-free hot loops.
+     *
+     * Swap-and-pop keeps pending_changes compact instead of leaving the ADD
+     * marked dead in place: the last element moves into the freed slot, and
+     * if it's also a tracked ADD, its index entry in pending_add_idx is
+     * rewritten to point at the new position. Avoids paying the export-time
+     * iteration cost over a buffer that fills up with skipped entries under
+     * sustained alloc/free churn. */
+    auto it = pending_add_idx.find(ptr);
+    if (it != pending_add_idx.end()) {
+        const size_t add_idx = it->second;
+        const size_t last_idx = pending_changes.size() - 1;
+        if (add_idx != last_idx) {
+            pending_changes[add_idx] = std::move(pending_changes[last_idx]);
+            // The moved element kept its own pending_add_idx entry pointing at
+            // last_idx; rewrite it if the moved element was an indexed ADD.
+            if (pending_changes[add_idx].kind == change_event::ADD) {
+                pending_add_idx[pending_changes[add_idx].ptr] = add_idx;
+            }
+        }
+        pending_changes.pop_back();
+        pending_add_idx.erase(it);
+        pool_put_no_cpython(std::move(owner));
+        return;
     }
+
+    /* No pending ADD — the allocation was tracked in an earlier snapshot
+     * interval and the backend already integrated its positive contribution.
+     * Queue a REMOVE so the next export emits a negative tombstone. heap_space
+     * negation is deferred to emit time so any same-snapshot raw-ptr aliasing
+     * sees the original positive value first. */
+    auto* raw = owner.get();
+    pending_changes.push_back({ ptr, raw, std::move(owner), change_event::REMOVE });
 }
 
 bool
@@ -266,11 +361,18 @@ heap_tracker_t::add_sample_no_cpython(void* ptr, std::unique_ptr<traceback_t> tb
     memalloc_gil_debug_guard_t guard(gil_guard);
 
     auto [it, inserted] = allocs_m.insert_or_assign(ptr, std::move(tb));
-    (void)it; // Unused, but needed for structured binding
 
     /* This should always be a new insertion. If not, we failed to properly untrack a previous allocation. */
     assert(inserted && "add_sample: found existing entry for key that should have been removed");
 
+    /* Record ADD event referencing the live traceback. The raw pointer remains
+     * valid until export: either the entry stays in allocs_m, or a subsequent
+     * untrack moves ownership into a REMOVE event in pending_changes — both
+     * are cleared in the same export pass. Index this event in pending_add_idx
+     * so a same-snapshot-interval free can find and collapse it. */
+    pending_changes.push_back({ ptr, it->second.get(), nullptr, change_event::ADD });
+    pending_add_idx[ptr] = pending_changes.size() - 1;
+
     // Get ready for the next sample
     reset_sampling_state_no_cpython();
 }
@@ -280,10 +382,74 @@ heap_tracker_t::export_heap_no_cpython()
 {
     memalloc_gil_debug_guard_t guard(gil_guard);
 
-    /* Iterate over live samples and export them */
-    for (const auto& [ptr, tb] : allocs_m) {
-        (void)ptr; // Suppress unused variable warning
-        tb->sample.export_sample();
+    /* Delta emission: ADDs reference live tracebacks via raw pointer (kept
+     * alive by either allocs_m or a later REMOVE event in this buffer).
+     * REMOVEs own the extracted unique_ptr and emit a negative tombstone;
+     * heap_space is negated lazily here (guarded by tombstone_applied) so an
+     * earlier ADD aliasing the same traceback emits the original positive
+     * value before the REMOVE flips the sign, and so retries on libdatadog
+     * rejection don't toggle the sign each attempt.
+     *
+     * On libdatadog rejection, retain the event for retry on the next snapshot
+     * — without this, an ADD or REMOVE that Profile_add2 refuses is silently
+     * dropped, causing permanent drift in the backend's running heap state.
+     * Bounded by MAX_EXPORT_RETRIES so a persistent rejection cannot grow
+     * pending_changes without bound.
+     *
+     * No periodic resync is emitted today: a full-snapshot would not carry a
+     * wire-format marker telling the backend to reset accumulated state, so
+     * the backend would double-count instead of recovering from drift. If a
+     * delta upload is dropped, the backend's state diverges from live until
+     * the next profiler restart (which clears allocs_m and pending_changes).
+     * Adding a resync requires either backend-side reset signaling or
+     * emitting compensating negative+positive pairs for every live entry —
+     * both are out of scope for v1. */
+
+    /* In-place compaction with a write index. Moving into a fresh vector and
+     * swapping would discard the constructor-side reserve (PENDING_CHANGES_RESERVE)
+     * on every export, forcing the hook path to re-grow the buffer on the next
+     * churn burst. Keeping the same storage preserves capacity. */
+    size_t write_idx = 0;
+    for (size_t read_idx = 0; read_idx < pending_changes.size(); ++read_idx) {
+        auto& evt = pending_changes[read_idx];
+        if (evt.kind == change_event::REMOVE && !evt.tombstone_applied) {
+            evt.tb->sample.negate_heap_space();
+            evt.tombstone_applied = true;
+        }
+        if (evt.tb->sample.export_sample()) {
+            // Success: recycle the REMOVE traceback; ADDs leave their
+            // tracebacks in allocs_m where they were.
+            if (evt.kind == change_event::REMOVE) {
+                pool_put_no_cpython(std::move(evt.owner));
+            }
+            continue;
+        }
+        // libdatadog rejected: retry next snapshot, up to MAX_EXPORT_RETRIES.
+        if (evt.failed_attempts >= MAX_EXPORT_RETRIES) {
+            if (evt.kind == change_event::REMOVE) {
+                pool_put_no_cpython(std::move(evt.owner));
+            }
+            continue;
+        }
+        ++evt.failed_attempts;
+        if (write_idx != read_idx) {
+            pending_changes[write_idx] = std::move(evt);
+        }
+        ++write_idx;
+    }
+    pending_changes.resize(write_idx);
+    /* Rebuild the side map for the compacted pending_changes vector. The old
+     * map's indices referenced positions before compaction; without rebuilding,
+     * retained ADDs would lose their pair-collapse capability on the next
+     * snapshot (a REMOVE arriving for one of these ptrs would queue a tombstone
+     * instead of canceling the still-unsuccessful ADD). The cost is O(write_idx),
+     * which is bounded — retained events only appear on libdatadog rejection
+     * and are capped at MAX_EXPORT_RETRIES per event. */
+    pending_add_idx.clear();
+    for (size_t i = 0; i < pending_changes.size(); ++i) {
+        if (pending_changes[i].kind == change_event::ADD) {
+            pending_add_idx[pending_changes[i].ptr] = i;
+        }
     }
 
     Datadog::Sample::profile_borrow().stats().set_heap_tracker_size(allocs_m.size());
@@ -310,6 +476,12 @@ heap_tracker_t::postfork_child()
     // Profile::postfork_child()
     pool.clear();
 
+    // Pending delta events may contain raw pointers into allocs_m and owned
+    // tracebacks; clear before dropping allocs_m so the raw pointers in any
+    // ADD events do not get stranded. Side map indices alias these events.
+    pending_changes.clear();
+    pending_add_idx.clear();
+
     // Allocations map may contain data from the parent process, and also
     // traceback_t objects may reference invalid Profile state.
     allocs_m.clear();