diff --git a/.gitignore b/.gitignore
index 0f356338..dba32f05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@
 #
 # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
 
+
+.DS_Store
+
 ## User settings
 xcuserdata/
 
@@ -57,6 +60,7 @@ Package.resolved
 .swiftpm
 .DS_Store
 .build/
+.build-*/
 
 # CocoaPods
 #
diff --git a/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift b/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift
new file mode 100644
index 00000000..eb0ad5f0
--- /dev/null
+++ b/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift
@@ -0,0 +1,202 @@
+//
+// Copyright (c) 2026 Ordo One AB
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Regression benchmarks for the malloc interposer. Each benchmark performs
+// a known, fixed number of allocations per iteration so the reported
+// per-iteration counts (mallocCountTotal / freeCountTotal / etc.) line up
+// with the expected values noted in the benchmark name. Drift between the
+// jemalloc and interposer code paths — or between branches — shows up
+// immediately as a count mismatch.
+//
+// Counts are scaled per iteration: with .kilo scaling, one malloc inside
+// the body produces "1" in the count column, not "1000".
+
+import Benchmark
+
+#if canImport(Darwin)
+import Darwin
+#elseif canImport(Glibc)
+import Glibc
+#elseif canImport(Musl)
+import Musl
+#else
+#error("Unsupported Platform")
+#endif
+
+let mallocMetrics: [BenchmarkMetric] = [
+    .wallClock,
+    .mallocCountSmall,
+    .mallocCountLarge,
+    .mallocCountTotal,
+    .freeCountTotal,
+    .mallocBytesCount,
+    .mallocFreeDelta,
+    .memoryLeakedBytes,
+]
+
+let benchmarks: @Sendable () -> Void = {
+    Benchmark.defaultConfiguration = .init(
+        metrics: mallocMetrics,
+        warmupIterations: 1,
+        scalingFactor: .kilo,
+        maxDuration: .seconds(1),
+        maxIterations: 100
+    )
+
+    // Sanity floor: an empty body should report (close to) zero allocations.
+    // Whatever the framework's per-iteration overhead is, it shows up here
+    // and is the reference for what "no allocations" looks like.
+    Benchmark("Noop") { benchmark in
+        for _ in benchmark.scaledIterations {
+            blackHole(0)
+        }
+    }
+
+    // Bread-and-butter malloc/free pair, sub-page size — should land in
+    // mallocCountSmall, not mallocCountLarge.
+    //   Expected per iter: malloc=1 (small=1, large=0), free=1, leaked=0.
+    Benchmark("Malloc 64B + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = malloc(64)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // Larger-than-page allocation — should land in mallocCountLarge.
+    //   Expected per iter: malloc=1 (small=0, large=1), free=1.
+    Benchmark("Malloc 2 MiB + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = malloc(2 * 1_024 * 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // calloc must be counted exactly like malloc + memset.
+    //   Expected per iter: malloc=1, free=1.
+    Benchmark("Calloc 8x8 + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = calloc(8, 8)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // realloc(grow) on success: implicit free of old + alloc of new.
+    //   Expected per iter: malloc=2, free=2.
+    Benchmark("Realloc grow 64→256 + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let original = malloc(64)
+            let grown = realloc(original, 256)
+            blackHole(grown)
+            free(grown)
+        }
+    }
+
+    // realloc(NULL, size) is a pure malloc — no implicit free.
+    //   Expected per iter: malloc=1, free=1.
+    Benchmark("Realloc(NULL, 128) + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = realloc(nil, 128)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // realloc(p, 0) is implementation-defined: glibc frees p and returns NULL
+    // (per iter: malloc=1, free=1); Darwin frees p but returns a new minimal block.
+    Benchmark("Malloc + realloc(p, 0)") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let resized = realloc(malloc(64), 0)
+            blackHole(resized) // nil on glibc
+            if let resized { free(resized) } // don't leak Darwin's replacement block
+        }
+    }
+
+    // posix_memalign — separate code path that's easy to forget to count.
+    //   Expected per iter: malloc=1, free=1.
+    Benchmark("posix_memalign(64, 1024) + free") { benchmark in
+        var ptr: UnsafeMutableRawPointer?
+        for _ in benchmark.scaledIterations {
+            _ = posix_memalign(&ptr, 64, 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // C11 aligned_alloc — currently only intercepted on Linux. On Darwin the
+    // symbol isn't in the DYLD_INTERPOSE list, so the malloc count would drop;
+    // the benchmark is compiled out there by the #if below.
+    //   Expected per iter (Linux): malloc=1, free=1.
+    //   Darwin (if enabled): malloc=0 (not interposed), free=1.
+    #if !canImport(Darwin)
+    Benchmark("aligned_alloc(64, 1024) + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = aligned_alloc(64, 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+    #endif
+
+    // Batched mallocs in a single iteration — verifies the counter scales
+    // linearly and isn't accidentally collapsed/de-duplicated.
+    //   Expected per iter: malloc=16, free=16.
+    Benchmark("Malloc x16 + free x16") { benchmark in
+        let count = 16
+        let buf = UnsafeMutablePointer<UnsafeMutableRawPointer?>.allocate(capacity: count)
+        defer { buf.deallocate() }
+        buf.update(repeating: nil, count: count)
+
+        for _ in benchmark.scaledIterations {
+            for i in 0..<count {
+                buf[i] = malloc(48)
+            }
+            for i in 0..<count {
+                free(buf[i])
+            }
+        }
+    }
+
+    // Deliberate leak: malloc without free. Confirms mallocFreeDelta /
+    // memoryLeakedBytes track unbalanced flow correctly.
+    //   Expected per iter: malloc=1, free=0, leaked=1, leakedBytes≈128.
+    // The accumulated leak across the run is bounded:
+    //   <= maxIterations * scalingFactor * 128 = 100 * 1000 * 128 = 12.8 MB (~12.2 MiB).
+    Benchmark("Leak: malloc 128B (no free)") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = malloc(128)
+            blackHole(ptr)
+        }
+    }
+
+    // Swift stdlib path: Array(repeating:count:) goes through swift_allocObject
+    // which (on supported platforms) lowers to malloc. The exact count per
+    // iter depends on stdlib internals, but it must be > 0 and stable
+    // between runs.
+    Benchmark("Swift Array<Int>(repeating:0, count:128)") { benchmark in
+        for _ in benchmark.scaledIterations {
+            var arr = [Int](repeating: 0, count: 128)
+            arr.withUnsafeMutableBufferPointer { buf in
+                blackHole(buf.baseAddress)
+            }
+        }
+    }
+
+    // Heap-allocated String (must exceed the small-string inline limit of
+    // 15 bytes). Same caveat as Array — count is stdlib-dependent but must
+    // be stable.
+    Benchmark("Swift String (long, heap)") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let str = String(repeating: "x", count: 256)
+            blackHole(str)
+        }
+    }
+}
diff --git a/Benchmarks/Package.resolved b/Benchmarks/Package.resolved
index 86e322d3..2c543468 100644
--- a/Benchmarks/Package.resolved
+++ b/Benchmarks/Package.resolved
@@ -1,5 +1,5 @@
 {
-  "originHash" : "f1d359a544b71b52c6788ad2e4cd2952f7f166b62ddb07316768f66be7ba4099",
+  "originHash" : "beddb8cb97cf892b8a2c00081488d118648e1609b6467ece2ea9cd075a22b282",
   "pins" : [
     {
       "identity" : "hdrhistogram-swift",
@@ -11,21 +11,21 @@
       }
     },
     {
-      "identity" : "package-datetime",
+      "identity" : "malloc-interposer",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/ordo-one/package-datetime",
+      "location" : "https://github.com/ordo-one/malloc-interposer.git",
       "state" : {
-        "revision" : "d1242188c9f48aad297e6ca9b717776f8660bc31",
-        "version" : "1.0.2"
+        "revision" : "d9ca5ad6d85622fb2bd5b3d3387ba064dbcab1c2",
+        "version" : "1.0.0"
       }
     },
     {
-      "identity" : "package-jemalloc",
+      "identity" : "package-datetime",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/ordo-one/package-jemalloc",
+      "location" : "https://github.com/ordo-one/package-datetime",
       "state" : {
-        "revision" : "e8a5db026963f5bfeac842d9d3f2cc8cde323b49",
-        "version" : "1.0.0"
+        "revision" : "d1242188c9f48aad297e6ca9b717776f8660bc31",
+        "version" : "1.0.2"
       }
     },
     {
@@ -33,8 +33,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-argument-parser",
       "state" : {
-        "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
-        "version" : "1.3.0"
+        "revision" : "6a52f3251125d74daf04fcbd5e6f08a75d074382",
+        "version" : "1.8.2"
       }
     },
     {
diff --git a/Benchmarks/Package.swift b/Benchmarks/Package.swift
index 7f75a1fd..ec44ccb0 100644
--- a/Benchmarks/Package.swift
+++ b/Benchmarks/Package.swift
@@ -77,3 +77,20 @@ package.targets += [
         ]
     )
 ]
+
+// Regression coverage for the malloc interposer: predictable allocation
+// patterns (counts known per iteration) so any drift between jemalloc and
+// interposer code paths is immediately visible in mallocCountTotal /
+// freeCountTotal / mallocFreeDelta / memoryLeakedBytes.
+package.targets += [
+    .executableTarget(
+        name: "MallocInterposerBenchmarks",
+        dependencies: [
+            .product(name: "Benchmark", package: "benchmark")
+        ],
+        path: "Benchmarks/MallocInterposer",
+        plugins: [
+            .plugin(name: "BenchmarkPlugin", package: "benchmark")
+        ]
+    )
+]
diff --git a/Package.swift b/Package.swift
index 0a48e622..71c4c543 100644
--- a/Package.swift
+++ b/Package.swift
@@ -1,19 +1,23 @@
-// swift-tools-version: 6.1
+// swift-tools-version: 6.3
 
 import PackageDescription
 
 import class Foundation.ProcessInfo
 
-// If the environment variable BENCHMARK_DISABLE_JEMALLOC is set disable Jemalloc trait (backward compatibility)
-let disableJemalloc = ProcessInfo.processInfo.environment["BENCHMARK_DISABLE_JEMALLOC"] != nil
-
-let defaultTraits: Set<String>
-
-if disableJemalloc {
-    defaultTraits = []
-} else {
-    defaultTraits = ["Jemalloc"]
-}
+// When MALLOC_INTERPOSER_LOCAL_PATH is set, use a local checkout of the
+// malloc-interposer package instead of the published GitHub URL. Useful
+// when iterating on the interposer alongside this package.
+let mallocInterposerDependency: Package.Dependency = {
+    // An unset variable and an empty one are treated alike: both fall through to the remote URL.
+    let localCheckout = ProcessInfo.processInfo.environment["MALLOC_INTERPOSER_LOCAL_PATH"] ?? ""
+    guard localCheckout.isEmpty else {
+        return .package(path: localCheckout)
+    }
+
+    // Published release, any 1.x version.
+    let remoteURL = "https://github.com/ordo-one/malloc-interposer.git"
+    return .package(url: remoteURL, .upToNextMajor(from: "1.0.0"))
+}()
 
 var packageDependencies: [Package.Dependency] = [
     .package(url: "https://github.com/apple/swift-system.git", .upToNextMajor(from: "1.1.0")),
@@ -21,12 +25,12 @@ var packageDependencies: [Package.Dependency] = [
     .package(url: "https://github.com/ordo-one/TextTable.git", .upToNextMajor(from: "0.0.1")),
     .package(url: "https://github.com/HdrHistogram/hdrhistogram-swift.git", .upToNextMajor(from: "0.1.4")),
     .package(url: "https://github.com/apple/swift-atomics.git", .upToNextMajor(from: "1.0.0")),
-    .package(url: "https://github.com/ordo-one/package-jemalloc.git", .upToNextMajor(from: "1.0.0")),
+    mallocInterposerDependency,
 ]
 
 #if os(Linux) && compiler(>=6.3)
 packageDependencies += [
-    .package(url: "https://github.com/ordo-one/swift-runtime-interposer.git", .upToNextMajor(from: "1.0.0")),
+    .package(url: "https://github.com/ordo-one/swift-runtime-interposer.git", .upToNextMajor(from: "1.0.0"))
 ]
 #endif
 
@@ -39,13 +43,21 @@ var benchmarkDependencies: [Target.Dependency] = [
     .product(name: "Atomics", package: "swift-atomics"),
     "SwiftRuntimeHooks",
     "BenchmarkShared",
-    .product(name: "jemalloc", package: "package-jemalloc", condition: .when(platforms: [.macOS, .linux], traits: ["Jemalloc"])),
+    .product(name: "MallocInterposerSwift", package: "malloc-interposer"),
 ]
 
 #if os(Linux) && compiler(>=6.3)
 benchmarkDependencies += [
-    .product(name: "SwiftRuntimeInterposerC", package: "swift-runtime-interposer", condition: .when(platforms: [.linux])),
-    .product(name: "SwiftRuntimeInterposerSwift", package: "swift-runtime-interposer", condition: .when(platforms: [.linux])),
+    .product(
+        name: "SwiftRuntimeInterposerC",
+        package: "swift-runtime-interposer",
+        condition: .when(platforms: [.linux])
+    ),
+    .product(
+        name: "SwiftRuntimeInterposerSwift",
+        package: "swift-runtime-interposer",
+        condition: .when(platforms: [.linux])
+    ),
 ]
 #endif
 
@@ -63,10 +75,6 @@ let package = Package(
             targets: ["Benchmark"]
         ),
     ],
-    traits: [
-        .trait(name: "Jemalloc"),
-        .default(enabledTraits: defaultTraits),
-    ],
     dependencies: packageDependencies,
     targets: [
         .target(
diff --git a/Package@swift-6.2.swift b/Package@swift-6.2.swift
new file mode 100644
index 00000000..2f8c1f0a
--- /dev/null
+++ b/Package@swift-6.2.swift
@@ -0,0 +1,148 @@
+// swift-tools-version: 6.1
+
+import PackageDescription
+
+import class Foundation.ProcessInfo
+
+// If the environment variable BENCHMARK_DISABLE_JEMALLOC is set disable Jemalloc trait (backward compatibility)
+let disableJemalloc = ProcessInfo.processInfo.environment["BENCHMARK_DISABLE_JEMALLOC"] != nil
+
+let defaultTraits: Set<String>
+
+if disableJemalloc {
+    defaultTraits = []
+} else {
+    defaultTraits = ["Jemalloc"]
+}
+
+let package = Package(
+    name: "Benchmark",
+    platforms: [
+        .macOS(.v13),
+        .iOS(.v16),
+    ],
+    products: [
+        .plugin(name: "BenchmarkCommandPlugin", targets: ["BenchmarkCommandPlugin"]),
+        .plugin(name: "BenchmarkPlugin", targets: ["BenchmarkPlugin"]),
+        .library(
+            name: "Benchmark",
+            targets: ["Benchmark"]
+        ),
+    ],
+    traits: [
+        .trait(name: "Jemalloc"),
+        .default(enabledTraits: defaultTraits),
+    ],
+    dependencies: [
+        .package(url: "https://github.com/apple/swift-system.git", .upToNextMajor(from: "1.1.0")),
+        .package(url: "https://github.com/apple/swift-argument-parser.git", "1.1.0"..<"1.6.0"),
+        .package(url: "https://github.com/ordo-one/TextTable.git", .upToNextMajor(from: "0.0.1")),
+        .package(url: "https://github.com/HdrHistogram/hdrhistogram-swift.git", .upToNextMajor(from: "0.1.4")),
+        .package(url: "https://github.com/apple/swift-atomics.git", .upToNextMajor(from: "1.0.0")),
+        .package(url: "https://github.com/ordo-one/package-jemalloc.git", .upToNextMajor(from: "1.0.0")),
+    ],
+    targets: [
+        .target(
+            name: "Benchmark",
+            dependencies: [
+                .product(name: "Histogram", package: "hdrhistogram-swift"),
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                .product(name: "SystemPackage", package: "swift-system"),
+                .byNameItem(name: "CDarwinOperatingSystemStats", condition: .when(platforms: [.macOS, .iOS])),
+                .byNameItem(name: "CLinuxOperatingSystemStats", condition: .when(platforms: [.linux])),
+                .product(name: "Atomics", package: "swift-atomics"),
+                "SwiftRuntimeHooks",
+                "BenchmarkShared",
+                .product(
+                    name: "jemalloc", package: "package-jemalloc", condition: .when(platforms: [.macOS, .linux], traits: ["Jemalloc"])),
+            ],
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+        // Plugins used by users of the package
+
+        // The actual 'benchmark' command plugin
+        .plugin(
+            name: "BenchmarkCommandPlugin",
+            capability: .command(
+                intent: .custom(
+                    verb: "benchmark",
+                    description: "Run the Benchmark performance test suite."
+                )
+            ),
+            dependencies: [
+                "BenchmarkTool"
+            ],
+            path: "Plugins/BenchmarkCommandPlugin"
+        ),
+
+        // Plugin that generates the boilerplate needed to interface with the Benchmark infrastructure
+        .plugin(
+            name: "BenchmarkPlugin",
+            capability: .buildTool(),
+            dependencies: [
+                "BenchmarkBoilerplateGenerator"
+            ],
+            path: "Plugins/BenchmarkPlugin"
+        ),
+
+        // Tool that the plugin executes to perform the actual work, the real benchmark driver
+        .executableTarget(
+            name: "BenchmarkTool",
+            dependencies: [
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                .product(name: "SystemPackage", package: "swift-system"),
+                .product(name: "TextTable", package: "TextTable"),
+                "Benchmark",
+                "BenchmarkShared",
+            ],
+            path: "Plugins/BenchmarkTool",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+
+        // Tool that generates the boilerplate
+        .executableTarget(
+            name: "BenchmarkBoilerplateGenerator",
+            dependencies: [
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                .product(name: "SystemPackage", package: "swift-system"),
+            ],
+            path: "Plugins/BenchmarkBoilerplateGenerator"
+        ),
+
+        // Tool that simply generates the man page for the BenchmarkPlugin as we can't use SAP in it... :-/
+        .executableTarget(
+            name: "BenchmarkHelpGenerator",
+            dependencies: [
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                "BenchmarkShared",
+            ],
+            path: "Plugins/BenchmarkHelpGenerator"
+        ),
+
+        // Getting OS specific information
+        .target(
+            name: "CDarwinOperatingSystemStats",
+            dependencies: [],
+            path: "Platform/CDarwinOperatingSystemStats"
+        ),
+
+        // Getting OS specific information
+        .target(
+            name: "CLinuxOperatingSystemStats",
+            dependencies: [],
+            path: "Platform/CLinuxOperatingSystemStats"
+        ),
+
+        // Hooks for ARC
+        .target(name: "SwiftRuntimeHooks"),
+
+        // Shared definitions
+        .target(name: "BenchmarkShared"),
+
+        .testTarget(
+            name: "BenchmarkTests",
+            dependencies: ["Benchmark"],
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+    ]
+)
diff --git a/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift b/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift
index 22650d62..ff7f7ab2 100644
--- a/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift
+++ b/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift
@@ -11,6 +11,7 @@
 // 'Benchmark' plugin that is responsible for gathering command line arguments and then
 // Running the `BenchmarkTool` for each benchmark target.
 
+@preconcurrency import Foundation
 import PackagePlugin
 @preconcurrency import Foundation
 
@@ -174,6 +175,7 @@ import PackagePlugin
         let packageBenchmarkIdentifiers: Set<String> = ["benchmark", "package-benchmark"]
         let benchmarkToolName = "BenchmarkTool"
         let benchmarkTool: PackagePlugin.Path // = try context.tool(named: benchmarkToolName)
+        let interposerLib: String
 
         // Resolve which identifier this consumer actually has the benchmark package under,
         // so generated boilerplate matches what SPM sees (depends on whether they pinned
@@ -419,10 +421,7 @@ import PackagePlugin
         }
 
         // Build the BenchmarkTool manually in release mode to work around https://github.com/apple/swift-package-manager/issues/7210
-        guard
-            let benchmarkToolModule = benchmarkToolModuleTargets.first(where: {
-                $0.kind == .executable && $0.name == benchmarkToolName
-            })
+        guard let benchmarkToolModule = benchmarkToolModuleTargets.first(where: { $0.kind == .executable && $0.name == benchmarkToolName })
         else {
             print("Benchmark failed to find the BenchmarkTool target.")
             throw MyError.buildFailed
@@ -457,6 +456,7 @@ import PackagePlugin
         }
 
         benchmarkTool = tool.path
+        interposerLib = tool.path.removingLastComponent().appending(subpath: "libMallocInterposerSwift.so").string
         #if os(Linux) && compiler(>=6.3)
         let swiftRuntimeInterposerLib = tool.path.removingLastComponent()
             .appending(subpath: "libSwiftRuntimeInterposerC.so").string
@@ -542,6 +542,8 @@ import PackagePlugin
                 return
             }
 
+            // On Linux we need to set LD_PRELOAD to get the malloc interposer working
+            // while on Darwin this is done with DYLD interpose mechanism
             #if os(Linux) && compiler(>=6.3)
             if shouldEmitRuntimeInterposerWarning(outputFormat: outputFormat, exportPath: exportPath) {
                 writeToStderr(
@@ -551,9 +553,9 @@ import PackagePlugin
 
             var environment = ProcessInfo.processInfo.environment
             if let existingPreload = environment["LD_PRELOAD"], existingPreload.isEmpty == false {
-                environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(existingPreload)"
+                environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(interposerLib):\(existingPreload)"
             } else {
-                environment["LD_PRELOAD"] = swiftRuntimeInterposerLib
+                environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(interposerLib)"
             }
 
             let envp = environment.map { "\($0.key)=\($0.value)" }.compactMap { $0.withCString(strdup) } + [nil]
diff --git a/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift b/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift
index 8c91fac0..f09c4c69 100644
--- a/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift
+++ b/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift
@@ -53,8 +53,9 @@ let help =
                           Benchmark targets matching the regexp filter that should be skipped
     --format <format>       The output format to use, default is 'text' (values: text, markdown, influx, jmh, jsonSmallerIsBetter, jsonBiggerIsBetter, histogramEncoded, histogram, histogramSamples, histogramPercentiles, metricP90AbsoluteThresholds)
     --metric <metric>       Specifies that the benchmark run should use one or more specific metrics instead of the ones defined by the benchmarks. (values: cpuUser, cpuSystem, cpuTotal, wallClock, throughput,
-                          peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, allocatedResidentMemory, memoryLeaked, syscalls, contextSwitches, threads,
-                          threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom)
+                          peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, freeCountTotal, mallocBytesCount, mallocFreeDelta,
+                          allocatedResidentMemory, memoryLeaked, memoryLeakedBytes, syscalls, contextSwitches, threads, threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical,
+                          readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom)
     --path <path>           The path to operate on for data export or threshold operations, default is the current directory (".") for exports and the ("./Thresholds") directory for thresholds.
     --quiet                 Specifies that output should be suppressed (useful for if you just want to check return code)
     --scale                 Specifies that some of the text output should be scaled using the scalingFactor (denoted by '*' in output)
diff --git a/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift b/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift
index 979daee2..5cc9920b 100644
--- a/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift
+++ b/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift
@@ -26,8 +26,12 @@ let availableMetrics = [
     "mallocCountSmall",
     "mallocCountLarge",
     "mallocCountTotal",
+    "freeCountTotal",
+    "mallocBytesCount",
+    "mallocFreeDelta",
     "allocatedResidentMemory",
     "memoryLeaked",
+    "memoryLeakedBytes",
     "syscalls",
     "contextSwitches",
     "threads",
diff --git a/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift b/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift
index 5529d71f..79e3f678 100644
--- a/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift
+++ b/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift
@@ -31,7 +31,7 @@ extension BenchmarkTool {
             let benchmarkReply = try read()
 
             switch benchmarkReply {
-            case let .list(benchmark):
+            case .list(let benchmark):
                 benchmark.executablePath = benchmarkPath
                 benchmark.target = FilePath(benchmarkPath).lastComponent!.description
                 if metrics.isEmpty == false {
@@ -40,7 +40,7 @@ extension BenchmarkTool {
                 benchmarks.append(benchmark)
             case .end:
                 break outerloop
-            case let .error(description):
+            case .error(let description):
                 failBenchmark(description)
                 break outerloop
             default:
@@ -57,12 +57,12 @@ extension BenchmarkTool {
             let benchmarkReply = try read()
 
             switch benchmarkReply {
-            case let .result(benchmark: benchmark, results: results):
+            case .result(benchmark: let benchmark, results: let results):
                 let filteredResults = results.filter { benchmark.configuration.metrics.contains($0.metric) }
                 benchmarkResults[BenchmarkIdentifier(target: target, name: benchmark.name)] = filteredResults
             case .end:
                 break outerloop
-            case let .error(description):
+            case .error(let description):
                 failBenchmark(description, exitCode: .benchmarkJobFailed, "\(target)/\(benchmark.name)")
 
                 benchmarkResults[BenchmarkIdentifier(target: target, name: benchmark.name)] = []
diff --git a/Sources/Benchmark/BenchmarkExecutor+Extensions.swift b/Sources/Benchmark/BenchmarkExecutor+Extensions.swift
index 559741f2..1a9d54fe 100644
--- a/Sources/Benchmark/BenchmarkExecutor+Extensions.swift
+++ b/Sources/Benchmark/BenchmarkExecutor+Extensions.swift
@@ -23,22 +23,66 @@ extension BenchmarkExecutor {
 extension BenchmarkExecutor {
     func mallocStatsProducerNeeded(_ metric: BenchmarkMetric) -> Bool {
         switch metric {
-        case .mallocCountLarge:
-            return true
         case .memoryLeaked:
+            #if canImport(MallocInterposerSwift)
+            return false
+            #else
             return true
-        case .mallocCountSmall:
+            #endif
+        case .memoryLeakedBytes:
             return true
+        case .mallocFreeDelta:
+            #if canImport(MallocInterposerSwift)
+            return true
+            #else
+            return false
+            #endif
         case .mallocCountTotal:
             return true
+        case .mallocCountSmall:
+            return true
+        case .mallocCountLarge:
+            return true
+        case .mallocBytesCount:
+            return true
         case .allocatedResidentMemory:
             return true
+        case .freeCountTotal:
+            return true
         default:
             return false
         }
     }
 }
 
+extension BenchmarkExecutor {
+    /// Turns one measured window's interposer counter deltas into `(metric, value)` samples.
+    ///
+    /// Side-effect free so the arithmetic is unit-testable without a live interposer. The net
+    /// metrics (`mallocFreeDelta`, `memoryLeakedBytes`) are floored at `0`: more frees than mallocs
+    /// is not a leak, and a `0` sample is recorded rather than dropped by `Statistics.add`.
+    static func mallocStatistics( // swiftlint:disable:this function_parameter_count
+        mallocCountDelta: Int,
+        mallocBytesDelta: Int,
+        mallocSmallDelta: Int,
+        mallocLargeDelta: Int,
+        freeCountDelta: Int,
+        freeBytesDelta: Int
+    ) -> [(metric: BenchmarkMetric, value: Int)] {
+        let netAllocations = mallocCountDelta - freeCountDelta
+        let netBytes = mallocBytesDelta - freeBytesDelta
+        return [
+            (metric: .mallocCountTotal, value: mallocCountDelta),
+            (metric: .mallocBytesCount, value: mallocBytesDelta),
+            (metric: .mallocCountSmall, value: mallocSmallDelta),
+            (metric: .mallocCountLarge, value: mallocLargeDelta),
+            (metric: .freeCountTotal, value: freeCountDelta),
+            (metric: .mallocFreeDelta, value: netAllocations > 0 ? netAllocations : 0),
+            (metric: .memoryLeakedBytes, value: netBytes > 0 ? netBytes : 0),
+        ]
+    }
+}
+
 extension BenchmarkExecutor {
     func operatingSystemsStatsProducerNeeded(_ metric: BenchmarkMetric) -> Bool {
         switch metric {
diff --git a/Sources/Benchmark/BenchmarkExecutor.swift b/Sources/Benchmark/BenchmarkExecutor.swift
index 99e07086..ead3edc1 100644
--- a/Sources/Benchmark/BenchmarkExecutor.swift
+++ b/Sources/Benchmark/BenchmarkExecutor.swift
@@ -8,6 +8,10 @@
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 
+#if canImport(MallocInterposerSwift)
+import MallocInterposerSwift
+#endif
+
 #if canImport(OSLog)
 import OSLog
 #endif
@@ -25,8 +29,13 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
     // swiftlint:disable cyclomatic_complexity function_body_length
     func run(_ benchmark: Benchmark) -> [BenchmarkResult] {
         var wallClockDuration: Duration = .zero
+        #if canImport(MallocInterposerSwift)
+        var startMallocStats = MallocInterposerSwift.Statistics()
+        var stopMallocStats = MallocInterposerSwift.Statistics()
+        #else
         var startMallocStats = MallocStats()
         var stopMallocStats = MallocStats()
+        #endif
         var startOperatingSystemStats = OperatingSystemStats()
         var stopOperatingSystemStats = OperatingSystemStats()
         var startPerformanceCounters = PerformanceCounters()
@@ -106,9 +115,6 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
         var iterations = 0
         let initialStartTime = BenchmarkClock.now
 
-        // 'Warmup' to remove initial mallocs from stats in p100
-        _ = MallocStatsProducer.makeMallocStats() // baselineMallocStats
-
         // Calculate typical sys call check overhead and deduct that to get 'clean' stats for the actual benchmark
         var operatingSystemStatsOverhead = OperatingSystemStats()
         var baselinePeakMemoryResidentDelta = 0
@@ -154,7 +160,11 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
             #endif
 
             if mallocStatsRequested {
+                #if canImport(MallocInterposerSwift)
+                startMallocStats = MallocInterposerSwift.getStatistics()
+                #else
                 startMallocStats = MallocStatsProducer.makeMallocStats()
+                #endif
             }
 
             if arcStatsRequested {
@@ -191,7 +201,11 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
             }
 
             if mallocStatsRequested {
+                #if canImport(MallocInterposerSwift)
+                stopMallocStats = MallocInterposerSwift.getStatistics()
+                #else
                 stopMallocStats = MallocStatsProducer.makeMallocStats()
+                #endif
             }
 
             #if canImport(OSLog)
@@ -239,21 +253,43 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
                 }
 
                 if mallocStatsRequested {
-                    delta = stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal
-                    statistics[BenchmarkMetric.mallocCountTotal.index].add(Int(delta))
+                    #if canImport(MallocInterposerSwift)
+                    // allocatedResidentMemory and the legacy memoryLeaked metric are intentionally
+                    // not populated on the interposer path: the interposer cannot measure the
+                    // allocator's resident set. Use mallocBytesCount / memoryLeakedBytes for
+                    // requested-byte accounting, mallocFreeDelta for allocation-count delta, or
+                    // peakMemoryResident for OS-sampled resident memory. The leak/scaling arithmetic
+                    // lives in BenchmarkExecutor.mallocStatistics(...) so it can be unit-tested
+                    // without a live interposer.
+                    let mallocMetrics = BenchmarkExecutor.mallocStatistics(
+                        mallocCountDelta: stopMallocStats.mallocCount - startMallocStats.mallocCount,
+                        mallocBytesDelta: stopMallocStats.mallocBytesCount - startMallocStats.mallocBytesCount,
+                        mallocSmallDelta: stopMallocStats.mallocSmallCount - startMallocStats.mallocSmallCount,
+                        mallocLargeDelta: stopMallocStats.mallocLargeCount - startMallocStats.mallocLargeCount,
+                        freeCountDelta: stopMallocStats.freeCount - startMallocStats.freeCount,
+                        freeBytesDelta: stopMallocStats.freeBytesCount - startMallocStats.freeBytesCount
+                    )
+                    for (metric, value) in mallocMetrics {
+                        statistics[metric.index].add(value)
+                    }
+                    #else
+                    let mallocCountTotal = stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal
+                    statistics[BenchmarkMetric.mallocCountTotal.index].add(mallocCountTotal)
 
-                    delta = stopMallocStats.mallocCountSmall - startMallocStats.mallocCountSmall
-                    statistics[BenchmarkMetric.mallocCountSmall.index].add(Int(delta))
+                    let allocatedResidentMemory = stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory
+                    statistics[BenchmarkMetric.allocatedResidentMemory.index].add(allocatedResidentMemory)
 
-                    delta = stopMallocStats.mallocCountLarge - startMallocStats.mallocCountLarge
-                    statistics[BenchmarkMetric.mallocCountLarge.index].add(Int(delta))
+                    // jemalloc has no free counter, so memoryLeaked keeps the legacy resident-byte
+                    // growth definition. The interposer backend uses mallocFreeDelta for
+                    // malloc-minus-free count and memoryLeakedBytes for requested-byte delta.
+                    statistics[BenchmarkMetric.memoryLeaked.index].add(max(0, allocatedResidentMemory))
 
-                    delta = stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory
-                    statistics[BenchmarkMetric.memoryLeaked.index].add(Int(delta))
+                    let mallocSmallCount = stopMallocStats.mallocCountSmall - startMallocStats.mallocCountSmall
+                    statistics[BenchmarkMetric.mallocCountSmall.index].add(mallocSmallCount)
 
-                    //                delta = stopMallocStats.allocatedResidentMemory - baselineMallocStats.allocatedResidentMemory // baselineMallocStats!
-                    statistics[BenchmarkMetric.allocatedResidentMemory.index]
-                        .add(Int(stopMallocStats.allocatedResidentMemory))
+                    let mallocLargeCount = stopMallocStats.mallocCountLarge - startMallocStats.mallocCountLarge
+                    statistics[BenchmarkMetric.mallocCountLarge.index].add(mallocLargeCount)
+                    #endif
                 }
 
                 if operatingSystemStatsRequested {
@@ -335,6 +371,12 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
             ARCStatsProducer.hook()
         }
 
+        if mallocStatsRequested {
+            #if canImport(MallocInterposerSwift)
+            MallocInterposerSwift.hook()
+            #endif
+        }
+
         if benchmark.configuration.metrics.contains(.threads)
             || benchmark.configuration.metrics.contains(.threadsRunning)
             || benchmark.configuration.metrics.contains(.peakMemoryResident)
@@ -425,6 +467,12 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length
             ARCStatsProducer.unhook()
         }
 
+        if mallocStatsRequested {
+            #if canImport(MallocInterposerSwift)
+            MallocInterposerSwift.unhook()
+            #endif
+        }
+
         #if canImport(OSLog)
         signPost.endInterval("Benchmark", benchmarkInterval, "\(iterations)")
         #endif
diff --git a/Sources/Benchmark/BenchmarkMetric+Defaults.swift b/Sources/Benchmark/BenchmarkMetric+Defaults.swift
index 6ec46b88..8828adbc 100644
--- a/Sources/Benchmark/BenchmarkMetric+Defaults.swift
+++ b/Sources/Benchmark/BenchmarkMetric+Defaults.swift
@@ -30,43 +30,84 @@ public extension BenchmarkMetric {
     /// There is also an convenience extension on Array defined such that you can write just `.default` rather than `BenchmarkMetric.default`
     ///
     static var `default`: [BenchmarkMetric] {
-        [
+        var metrics: [BenchmarkMetric] = [
             .wallClock,
             .cpuTotal,
             .mallocCountTotal,
+        ]
+        #if canImport(MallocInterposerSwift)
+        metrics += [
+            .freeCountTotal,
+            .mallocBytesCount,
+            .mallocFreeDelta,
+            .memoryLeakedBytes,
+        ]
+        #else
+        metrics += [
+            .memoryLeaked,
+        ]
+        #endif
+        metrics += [
             .throughput,
             .instructions,
             .peakMemoryResident,
         ]
+        return metrics
     }
 
     /// A collection of extended system benchmarks.
     static var extended: [BenchmarkMetric] {
-        [
+        var metrics: [BenchmarkMetric] = [
             .wallClock,
             .cpuUser,
             .cpuTotal,
             .mallocCountTotal,
+        ]
+        #if canImport(MallocInterposerSwift)
+        metrics += [
+            .freeCountTotal,
+            .mallocBytesCount,
+            .mallocFreeDelta,
+            .memoryLeakedBytes,
+        ]
+        #else
+        metrics += [
+            .memoryLeaked,
+        ]
+        #endif
+        metrics += [
             .throughput,
             .peakMemoryResident,
-            .memoryLeaked,
             .syscalls,
             .instructions,
         ]
+        return metrics
     }
 
     /// A collection of memory benchmarks.
     static var memory: [BenchmarkMetric] {
-        [
+        var metrics: [BenchmarkMetric] = [
             .peakMemoryResident,
             .peakMemoryResidentDelta,
             .peakMemoryVirtual,
             .mallocCountSmall,
             .mallocCountLarge,
             .mallocCountTotal,
+        ]
+        #if canImport(MallocInterposerSwift)
+        metrics += [
+            .mallocBytesCount,
+            .freeCountTotal,
+            .mallocFreeDelta,
+            .memoryLeakedBytes,
+        ]
+        #else
+        metrics += [
             .memoryLeaked,
             .allocatedResidentMemory,
         ]
+        #endif
+        return metrics
     }
 
     /// A collection of ARC metrics
@@ -117,7 +158,11 @@ public extension BenchmarkMetric {
             .mallocCountSmall,
             .mallocCountLarge,
             .mallocCountTotal,
+            .freeCountTotal,
+            .mallocBytesCount,
+            .mallocFreeDelta,
             .memoryLeaked,
+            .memoryLeakedBytes,
             .syscalls,
             .contextSwitches,
             .threads,
diff --git a/Sources/Benchmark/BenchmarkMetric.swift b/Sources/Benchmark/BenchmarkMetric.swift
index b5d06096..13ea5128 100644
--- a/Sources/Benchmark/BenchmarkMetric.swift
+++ b/Sources/Benchmark/BenchmarkMetric.swift
@@ -32,16 +32,43 @@ public enum BenchmarkMetric: Hashable, Equatable, Codable, CustomStringConvertib
     /// Measure virtual memory usage - sampled during runtime
     case peakMemoryVirtual
     /// Number of small malloc calls
+    ///
+    /// The small/large split is backend-dependent: the jemalloc backend (Swift ≤6.2) splits on
+    /// jemalloc's size classes, while the 6.3+ interposer backend splits on a coarser
+    /// `requested size > page size` threshold.
     case mallocCountSmall
     /// Number of large malloc calls
+    ///
+    /// The backend-specific counterpart to ``mallocCountSmall``.
     case mallocCountLarge
-    /// Number of small+large mallocs
+    /// Total number of malloc calls
     case mallocCountTotal
+    /// Total number of free calls
+    case freeCountTotal
+    /// The amount of memory allocated in bytes through malloc calls
+    case mallocBytesCount
+    /// Net unfreed allocation count within the measured region.
+    ///
+    /// Reports `malloc` count minus `free` count from the interposer backend. Because counting is
+    /// process-global, this metric is only reliable for single-threaded benchmarks with quiescent
+    /// background allocation.
+    case mallocFreeDelta
     /// The amount of allocated resident memory according to the memory allocator
-    /// by the application (does not include metadata overhead etc)
+    /// by the application (does not include metadata overhead etc).
+    ///
+    /// > Deprecated: Only produced by the jemalloc backend (Swift ≤6.2). The 6.3+
+    /// > interposer backend does not measure resident memory — use ``mallocBytesCount``
+    /// > for gross allocated bytes, or ``peakMemoryResident`` for OS-sampled resident memory.
+    @available(*, deprecated, message: "Only produced by the jemalloc backend; use mallocBytesCount or peakMemoryResident")
     case allocatedResidentMemory
-    /// Number of small+large mallocs - small+large frees in resident memory
+    /// Legacy jemalloc resident-byte growth within the measured region.
+    ///
+    /// Only produced by the jemalloc backend (Swift ≤6.2). The 6.3+ interposer backend does not
+    /// produce this metric; use ``mallocFreeDelta`` for allocation-count delta or
+    /// ``memoryLeakedBytes`` for requested-byte delta.
     case memoryLeaked
+    /// Net unfreed requested bytes within the measured region.
+    case memoryLeakedBytes
     /// Measure number of syscalls made during the test
     case syscalls
     /// Measure number of context switches made during the test
@@ -120,7 +147,8 @@ public extension BenchmarkMetric {
         switch self {
         case .cpuSystem, .cpuTotal, .cpuUser, .wallClock:
             return true
-        case .mallocCountLarge, .mallocCountSmall, .mallocCountTotal, .memoryLeaked:
+        case .mallocCountSmall, .mallocCountLarge, .mallocCountTotal, .freeCountTotal, .mallocFreeDelta,
+             .mallocBytesCount, .memoryLeaked, .memoryLeakedBytes:
             return true
         case .syscalls:
             return true
@@ -132,7 +160,7 @@ public extension BenchmarkMetric {
             return true
         case .objectAllocCount, .retainCount, .releaseCount, .retainReleaseDelta:
             return true
-        case let .custom(_, _, useScaleFactor):
+        case .custom(_, _, let useScaleFactor):
             return useScaleFactor
         default:
             return false
@@ -144,7 +172,7 @@ public extension BenchmarkMetric {
         switch self {
         case .throughput:
             return .prefersLarger
-        case let .custom(_, polarity, _):
+        case .custom(_, let polarity, _):
             return polarity
         default:
             return .prefersSmaller
@@ -175,10 +203,16 @@ public extension BenchmarkMetric {
             return "Malloc (large)"
         case .mallocCountTotal:
             return "Malloc (total)"
+        case .mallocBytesCount:
+            return "Malloc (bytes total)"
+        case .mallocFreeDelta:
+            return "Malloc / free Δ"
         case .allocatedResidentMemory:
             return "Memory (allocated resident)"
         case .memoryLeaked:
-            return "Malloc / free Δ"
+            return "Memory leaked (resident)"
+        case .memoryLeakedBytes:
+            return "Malloc / free Δ (bytes)"
         case .syscalls:
             return "Syscalls (total)"
         case .contextSwitches:
@@ -213,8 +247,10 @@ public extension BenchmarkMetric {
             return "Δ"
         case .deltaPercentage:
             return "Δ %"
-        case let .custom(name, _, _):
+        case .custom(let name, _, _):
             return name
+        case .freeCountTotal:
+            return "Free (total)"
         }
     }
 
@@ -244,47 +280,55 @@ public extension BenchmarkMetric {
             return 10
         case .mallocCountTotal:
             return 11
-        case .allocatedResidentMemory:
+        case .freeCountTotal:
             return 12
-        case .memoryLeaked:
+        case .mallocBytesCount:
             return 13
-        case .syscalls:
+        case .allocatedResidentMemory:
             return 14
-        case .contextSwitches:
+        case .memoryLeaked:
             return 15
-        case .threads:
+        case .memoryLeakedBytes:
             return 16
-        case .threadsRunning:
+        case .syscalls:
             return 17
-        case .readSyscalls:
+        case .contextSwitches:
             return 18
-        case .writeSyscalls:
+        case .threads:
             return 19
-        case .readBytesLogical:
+        case .threadsRunning:
             return 20
-        case .writeBytesLogical:
+        case .readSyscalls:
             return 21
-        case .readBytesPhysical:
+        case .writeSyscalls:
             return 22
-        case .writeBytesPhysical:
+        case .readBytesLogical:
             return 23
-        case .objectAllocCount:
+        case .writeBytesLogical:
             return 24
-        case .retainCount:
+        case .readBytesPhysical:
             return 25
-        case .releaseCount:
+        case .writeBytesPhysical:
             return 26
-        case .retainReleaseDelta:
+        case .objectAllocCount:
             return 27
-        case .instructions:
+        case .retainCount:
             return 28
+        case .releaseCount:
+            return 29
+        case .retainReleaseDelta:
+            return 30
+        case .instructions:
+            return 31
+        case .mallocFreeDelta:
+            return 32
         default:
             return 0 // custom payloads must be stored in dictionary
         }
     }
 
     @_documentation(visibility: internal)
-    static var maxIndex: Int { 28 } //
+    static var maxIndex: Int { 32 } //
 
     // Used by the Benchmark Executor for efficient indexing into results
     @_documentation(visibility: internal)
@@ -313,39 +357,47 @@ public extension BenchmarkMetric {
         case 11:
             return .mallocCountTotal
         case 12:
-            return .allocatedResidentMemory
+            return .freeCountTotal
         case 13:
-            return .memoryLeaked
+            return .mallocBytesCount
         case 14:
-            return .syscalls
+            return .allocatedResidentMemory
         case 15:
-            return .contextSwitches
+            return .memoryLeaked
         case 16:
-            return .threads
+            return .memoryLeakedBytes
         case 17:
-            return .threadsRunning
+            return .syscalls
         case 18:
-            return .readSyscalls
+            return .contextSwitches
         case 19:
-            return .writeSyscalls
+            return .threads
         case 20:
-            return .readBytesLogical
+            return .threadsRunning
         case 21:
-            return .writeBytesLogical
+            return .readSyscalls
         case 22:
-            return .readBytesPhysical
+            return .writeSyscalls
         case 23:
-            return .writeBytesPhysical
+            return .readBytesLogical
         case 24:
-            return .objectAllocCount
+            return .writeBytesLogical
         case 25:
-            return .retainCount
+            return .readBytesPhysical
         case 26:
-            return .releaseCount
+            return .writeBytesPhysical
         case 27:
-            return .retainReleaseDelta
+            return .objectAllocCount
         case 28:
+            return .retainCount
+        case 29:
+            return .releaseCount
+        case 30:
+            return .retainReleaseDelta
+        case 31:
             return .instructions
+        case 32:
+            return .mallocFreeDelta
         default:
             break
         }
@@ -379,10 +431,18 @@ public extension BenchmarkMetric {
             return "mallocCountLarge"
         case .mallocCountTotal:
             return "mallocCountTotal"
+        case .freeCountTotal:
+            return "freeCountTotal"
+        case .mallocBytesCount:
+            return "mallocBytesCount"
+        case .mallocFreeDelta:
+            return "mallocFreeDelta"
         case .allocatedResidentMemory:
             return "allocatedResidentMemory"
         case .memoryLeaked:
             return "memoryLeaked"
+        case .memoryLeakedBytes:
+            return "memoryLeakedBytes"
         case .syscalls:
             return "syscalls"
         case .contextSwitches:
@@ -417,7 +477,7 @@ public extension BenchmarkMetric {
             return "Δ"
         case .deltaPercentage:
             return "Δ %"
-        case let .custom(name, _, _):
+        case .custom(let name, _, _):
             return name
         }
     }
@@ -451,10 +511,18 @@ public extension BenchmarkMetric {
             self = BenchmarkMetric.mallocCountLarge
         case "mallocCountTotal":
             self = BenchmarkMetric.mallocCountTotal
+        case "freeCountTotal":
+            self = BenchmarkMetric.freeCountTotal
+        case "mallocBytesCount":
+            self = BenchmarkMetric.mallocBytesCount
+        case "mallocFreeDelta":
+            self = BenchmarkMetric.mallocFreeDelta
         case "allocatedResidentMemory":
             self = BenchmarkMetric.allocatedResidentMemory
         case "memoryLeaked":
             self = BenchmarkMetric.memoryLeaked
+        case "memoryLeakedBytes":
+            self = BenchmarkMetric.memoryLeakedBytes
         case "syscalls":
             self = BenchmarkMetric.syscalls
         case "contextSwitches":
diff --git a/Sources/Benchmark/BenchmarkRunner.swift b/Sources/Benchmark/BenchmarkRunner.swift
index f4b7c2b0..99d1171c 100644
--- a/Sources/Benchmark/BenchmarkRunner.swift
+++ b/Sources/Benchmark/BenchmarkRunner.swift
@@ -10,6 +10,9 @@
 
 import ArgumentParser
 import BenchmarkShared
+#if canImport(MallocInterposerSwift)
+import MallocInterposerSwift
+#endif
 #if os(Linux) && compiler(>=6.3) && canImport(SwiftRuntimeInterposerSwift)
 import SwiftRuntimeInterposerSwift
 #endif
@@ -117,6 +120,9 @@ public struct BenchmarkRunner: AsyncParsableCommand, BenchmarkRunnerReadWrite {
 
         var debugIterator = Benchmark.benchmarks.makeIterator()
         var benchmarkCommand: BenchmarkCommandRequest
+        #if canImport(MallocInterposerSwift)
+        MallocInterposerSwift.initialize()
+        #endif
         #if os(Linux) && compiler(>=6.3) && canImport(SwiftRuntimeInterposerSwift)
         SwiftRuntimeInterposerSwift.initialize()
         #endif
@@ -155,7 +161,7 @@ public struct BenchmarkRunner: AsyncParsableCommand, BenchmarkRunnerReadWrite {
                 }
 
                 try channel.write(.end)
-            case let .run(benchmarkToRun):
+            case .run(let benchmarkToRun):
                 benchmark = Benchmark.benchmarks.first { $0.name == benchmarkToRun.name }
 
                 if let benchmark {
diff --git a/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md b/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md
index ad9a7ab2..5413bfde 100644
--- a/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md
+++ b/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md
@@ -27,6 +27,10 @@
 - ``BenchmarkMetric/wallClock``
 - ``BenchmarkMetric/cpuTotal``
 - ``BenchmarkMetric/mallocCountTotal``
+- ``BenchmarkMetric/freeCountTotal``
+- ``BenchmarkMetric/mallocBytesCount``
+- ``BenchmarkMetric/mallocFreeDelta``
+- ``BenchmarkMetric/memoryLeakedBytes``
 - ``BenchmarkMetric/throughput``
 - ``BenchmarkMetric/peakMemoryResident``
 - ``BenchmarkMetric/memoryLeaked``
@@ -41,7 +45,11 @@
 - ``BenchmarkMetric/mallocCountSmall``
 - ``BenchmarkMetric/mallocCountLarge``
 - ``BenchmarkMetric/mallocCountTotal``
+- ``BenchmarkMetric/freeCountTotal``
+- ``BenchmarkMetric/mallocBytesCount``
+- ``BenchmarkMetric/mallocFreeDelta``
 - ``BenchmarkMetric/memoryLeaked``
+- ``BenchmarkMetric/memoryLeakedBytes``
 - ``BenchmarkMetric/allocatedResidentMemory``
 
 ### Reference Counting (retain/release)
diff --git a/Sources/Benchmark/Documentation.docc/Metrics.md b/Sources/Benchmark/Documentation.docc/Metrics.md
index a5b790bf..71ce4ad1 100644
--- a/Sources/Benchmark/Documentation.docc/Metrics.md
+++ b/Sources/Benchmark/Documentation.docc/Metrics.md
@@ -18,11 +18,15 @@ Currently supported metrics are:
 - term `peakMemoryResident`: The resident memory usage - sampled during runtime
 - term `peakMemoryResidentDelta`: The resident memory usage - sampled during runtime (excluding start of benchmark baseline) 
 - term `peakMemoryVirtual`:  The virtual memory usage - sampled during runtime
-- term `mallocCountSmall`: The number of small malloc calls according to jemalloc
-- term `mallocCountLarge`: The number of large malloc calls according to jemalloc
-- term `mallocCountTotal`: The total number of mallocs according to jemalloc
+- term `mallocCountSmall`: The number of small malloc calls according to the active malloc backend
+- term `mallocCountLarge`: The number of large malloc calls according to the active malloc backend
+- term `mallocCountTotal`: The total number of malloc calls according to the active malloc backend
+- term `freeCountTotal`: The total number of free calls according to the interposer backend
+- term `mallocBytesCount`: The total requested bytes allocated through malloc calls according to the interposer backend
+- term `mallocFreeDelta`: The number of malloc calls minus free calls according to the interposer backend
 - term `allocatedResidentMemory`: The amount of allocated resident memory by the application (not including allocator metadata overhead etc) according to jemalloc
-- term `memoryLeaked`: The number of small+large mallocs - small+large frees in resident memory (just a possible leak)
+- term `memoryLeaked`: Legacy jemalloc resident-byte growth within the measured region
+- term `memoryLeakedBytes`: The requested bytes allocated minus requested bytes freed according to the interposer backend
 - term `syscalls`: The number of syscalls made during the test -- macOS only
 - term `contextSwitches`: The number of context switches made during the test -- macOS only
 - term `threads`: The maximum number of threads in the process under the test (not exact, sampled)
diff --git a/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md b/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md
index cb48bffc..45482407 100644
--- a/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md
+++ b/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md
@@ -91,8 +91,9 @@ OPTIONS:
 Benchmark targets matching the regexp filter that should be skipped
 --format <format>       The output format to use, default is 'text' (values: text, markdown, influx, jmh, histogramEncoded, histogram, histogramSamples, histogramPercentiles, metricP90AbsoluteThresholds)
 --metric <metric>       Specifies that the benchmark run should use one or more specific metrics instead of the ones defined by the benchmarks. (values: cpuUser, cpuSystem, cpuTotal, wallClock, throughput,
-peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, allocatedResidentMemory, memoryLeaked, syscalls, contextSwitches, threads,
-threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom)
+peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, freeCountTotal, mallocBytesCount, mallocFreeDelta, allocatedResidentMemory,
+memoryLeaked, memoryLeakedBytes, syscalls, contextSwitches, threads, threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions,
+retainCount, releaseCount, retainReleaseDelta, custom)
 --path <path>           The path to operate on for data export or threshold operations, default is the current directory (".") for exports and the ("./Thresholds") directory for thresholds. 
 --quiet                 Specifies that output should be suppressed (useful for if you just want to check return code)
 --scale                 Specifies that some of the text output should be scaled using the scalingFactor (denoted by '*' in output)
diff --git a/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift b/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift
index 38e34761..ed2dd8de 100644
--- a/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift
+++ b/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift
@@ -15,6 +15,8 @@
 // let optionString = "J"
 // malloc_stats_print(nil, nil, optionString)
 
+#if canImport(jemalloc)
+
 // MARK: - Pokedex
 
 struct Pokedex: Codable {
@@ -361,3 +363,5 @@ struct MergedLextent: Codable {
 }
 
 // swiftlint:enable all
+
+#endif
diff --git a/Tests/BenchmarkTests/BenchmarkMetricsTests.swift b/Tests/BenchmarkTests/BenchmarkMetricsTests.swift
index e3822010..e5f82a8f 100644
--- a/Tests/BenchmarkTests/BenchmarkMetricsTests.swift
+++ b/Tests/BenchmarkTests/BenchmarkMetricsTests.swift
@@ -22,11 +22,13 @@ final class BenchmarkMetricsTests: XCTestCase {
         .peakMemoryResident,
         .peakMemoryResidentDelta,
         .peakMemoryVirtual,
-        .mallocCountSmall,
-        .mallocCountLarge,
         .mallocCountTotal,
+        .mallocBytesCount,
+        .freeCountTotal,
+        .mallocFreeDelta,
         .allocatedResidentMemory,
         .memoryLeaked,
+        .memoryLeakedBytes,
         .syscalls,
         .contextSwitches,
         .threads,
@@ -55,11 +57,13 @@ final class BenchmarkMetricsTests: XCTestCase {
         "peakMemoryResident",
         "peakMemoryResidentDelta",
         "peakMemoryVirtual",
-        "mallocCountSmall",
-        "mallocCountLarge",
         "mallocCountTotal",
+        "mallocBytesCount",
+        "freeCountTotal",
+        "mallocFreeDelta",
         "allocatedResidentMemory",
         "memoryLeaked",
+        "memoryLeakedBytes",
         "syscalls",
         "contextSwitches",
         "threads",
diff --git a/Tests/BenchmarkTests/MallocStatisticsTests.swift b/Tests/BenchmarkTests/MallocStatisticsTests.swift
new file mode 100644
index 00000000..bd7718e9
--- /dev/null
+++ b/Tests/BenchmarkTests/MallocStatisticsTests.swift
@@ -0,0 +1,116 @@
+//
+// Copyright (c) 2026 Ordo One AB.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+//
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+
+import XCTest
+
+@testable import Benchmark
+
+/// Unit coverage for the interposer malloc-metric arithmetic and the malloc-metric scaling
+/// configuration. These exercise `BenchmarkExecutor.mallocStatistics(...)` directly with
+/// synthetic counter deltas, so no live interposer / allocation is required.
+final class MallocStatisticsTests: XCTestCase {
+    /// Looks up `wanted` in a `mallocStatistics` result; `nil` means that metric was not emitted.
+    private func value(_ metrics: [(metric: BenchmarkMetric, value: Int)], _ wanted: BenchmarkMetric) -> Int? {
+        metrics.first { $0.metric == wanted }?.value
+    }
+
+    /// Equal malloc and free deltas must report zero for both leak metrics.
+    func testBalancedAllocFreeReportsNoLeak() {
+        let metrics = BenchmarkExecutor.mallocStatistics(
+            mallocCountDelta: 10, mallocBytesDelta: 1_024,
+            mallocSmallDelta: 8, mallocLargeDelta: 2,
+            freeCountDelta: 10, freeBytesDelta: 1_024
+        )
+        XCTAssertEqual(value(metrics, .mallocCountTotal), 10)
+        XCTAssertEqual(value(metrics, .freeCountTotal), 10)
+        XCTAssertEqual(value(metrics, .mallocBytesCount), 1_024)
+        XCTAssertEqual(value(metrics, .mallocFreeDelta), 0)
+        XCTAssertEqual(value(metrics, .memoryLeakedBytes), 0)
+        XCTAssertNil(value(metrics, .memoryLeaked), "interposer stats must not emit the legacy jemalloc memoryLeaked metric")
+    }
+
+    func testUnbalancedAllocReportsLeak() {
+        let metrics = BenchmarkExecutor.mallocStatistics(
+            mallocCountDelta: 10, mallocBytesDelta: 2_048,
+            mallocSmallDelta: 7, mallocLargeDelta: 3,
+            freeCountDelta: 6, freeBytesDelta: 1_024
+        )
+        XCTAssertEqual(value(metrics, .mallocFreeDelta), 4) // 10 mallocs - 6 frees
+        XCTAssertEqual(value(metrics, .memoryLeakedBytes), 1_024) // 2048 - 1024
+    }
+
+    /// A window that frees more than it allocates (e.g. freeing a warmup survivor or cross-thread
+    /// frees) must clamp the leak to 0 — not go negative (which `Statistics.add` would silently
+    /// drop, desyncing the sample count and biasing the average upward).
+    func testNetFreeWindowClampsLeakToZero() {
+        let metrics = BenchmarkExecutor.mallocStatistics(
+            mallocCountDelta: 3, mallocBytesDelta: 256,
+            mallocSmallDelta: 3, mallocLargeDelta: 0,
+            freeCountDelta: 5, freeBytesDelta: 4_096
+        )
+        XCTAssertEqual(value(metrics, .mallocFreeDelta), 0)
+        XCTAssertEqual(value(metrics, .memoryLeakedBytes), 0)
+    }
+
+    /// `mallocStatistics` is a pure mapping: each counter delta must land in its own metric slot
+    /// unchanged, so a mis-routing of any single delta fails distinctly. (The `small + large == total`
+    /// invariant is a property of the interposer's counters, not of this function, so it cannot be
+    /// asserted at this layer.)
+    func testDeltasRouteToCorrectMetricSlots() {
+        let metrics = BenchmarkExecutor.mallocStatistics(
+            mallocCountDelta: 10, mallocBytesDelta: 100,
+            mallocSmallDelta: 6, mallocLargeDelta: 4,
+            freeCountDelta: 3, freeBytesDelta: 48
+        )
+        XCTAssertEqual(value(metrics, .mallocCountTotal), 10)
+        XCTAssertEqual(value(metrics, .mallocCountSmall), 6)
+        XCTAssertEqual(value(metrics, .mallocCountLarge), 4)
+        XCTAssertEqual(value(metrics, .mallocBytesCount), 100)
+        XCTAssertEqual(value(metrics, .freeCountTotal), 3)
+        XCTAssertEqual(value(metrics, .mallocFreeDelta), 7) // 10 mallocs - 3 frees
+        XCTAssertEqual(value(metrics, .memoryLeakedBytes), 52) // 100 - 48; the only slot fed by freeBytesDelta
+    }
+
+    /// The whole per-iteration malloc count/byte family must scale together, otherwise the scaled
+    /// output is internally inconsistent (e.g. `small + large != total`, or bytes not comparable
+    /// to free) under a non-unit `scalingFactor`.
+    func testMallocFamilyScalesConsistently() {
+        let scaledFamily: [BenchmarkMetric] = [
+            .mallocCountSmall, .mallocCountLarge, .mallocCountTotal,
+            .freeCountTotal, .mallocBytesCount, .mallocFreeDelta, .memoryLeakedBytes,
+        ]
+        for metric in scaledFamily {
+            XCTAssertTrue(
+                metric.useScalingFactor,
+                "\(metric.rawDescription) must scale with the rest of the malloc family"
+            )
+        }
+    }
+
+    func testDefaultMetricsUseBackendSpecificLeakMetrics() {
+        #if canImport(MallocInterposerSwift)
+        XCTAssertTrue(BenchmarkMetric.default.contains(.mallocFreeDelta))
+        XCTAssertTrue(BenchmarkMetric.default.contains(.memoryLeakedBytes))
+        XCTAssertFalse(
+            BenchmarkMetric.default.contains(.memoryLeaked),
+            "interposer defaults must not emit legacy jemalloc memoryLeaked"
+        )
+        #else
+        XCTAssertTrue(BenchmarkMetric.default.contains(.memoryLeaked))
+        XCTAssertFalse(BenchmarkMetric.default.contains(.mallocFreeDelta))
+        #endif
+    }
+
+    /// Metric array slots must be unique so two metrics never collide on the same `statistics` slot.
+    func testMetricIndicesAreUnique() {
+        let indices = BenchmarkMetric.all.map(\.index)
+        XCTAssertEqual(Set(indices).count, indices.count, "metric indices must be unique")
+    }
+}
diff --git a/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift b/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift
index c5249d37..63667f7c 100644
--- a/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift
+++ b/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift
@@ -62,24 +62,6 @@ final class OperatingSystemAndMallocTests: XCTestCase {
         blackHole(operatingSystemStatsProducer.metricSupported(.throughput))
     }
 
-    #if canImport(jemalloc)
-    func testMallocProducerLeaks() throws {
-        let startMallocStats = MallocStatsProducer.makeMallocStats()
-
-        for outerloop in 1...100 {
-            blackHole(malloc(outerloop * 1_024))
-        }
-
-        let stopMallocStats = MallocStatsProducer.makeMallocStats()
-
-        XCTAssertGreaterThanOrEqual(stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal, 100)
-        XCTAssertGreaterThanOrEqual(
-            stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory,
-            100 * 1_024
-        )
-    }
-    #endif
-
     func testARCStatsProducer() throws {
         let array = [3]
         ARCStatsProducer.hook()
diff --git a/scripts/bench_malloc.c b/scripts/bench_malloc.c
new file mode 100644
index 00000000..0af0de62
--- /dev/null
+++ b/scripts/bench_malloc.c
@@ -0,0 +1,128 @@
+// bench_malloc.c — standalone wallclock benchmark for malloc/free patterns.
+//
+// Build once, run twice (with and without jemalloc injected via
+// DYLD_INSERT_LIBRARIES on macOS / LD_PRELOAD on Linux). See
+// scripts/bench_malloc.sh.
+//
+// Each benchmark runs an inner loop N times; we run K trials of that and
+// report min / median / max ns per op so noise is visible.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define WARMUP_ITERS 1000
+#define TRIALS 9   // odd → median is a single sample
+
+// Volatile sink keeps alloc/free pairs alive; the pointer itself (not its pointee) must be volatile.
+static void *volatile sink;
+
+static double now_ns(void) {  // CLOCK_MONOTONIC reading expressed in nanoseconds
+    struct timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    return 1e9 * (double)now.tv_sec + (double)now.tv_nsec;
+}
+
+static int cmp_double(const void *a, const void *b) {  // qsort comparator: ascending doubles
+    const double lhs = *(const double *)a, rhs = *(const double *)b;
+    return (lhs < rhs) ? -1 : (lhs > rhs);
+}
+
+// ---- benchmark bodies ----
+// DEFINE_BENCH(NAME, BODY) expands to `static void bench_NAME(int iters)`, which runs BODY `iters` times.
+#define DEFINE_BENCH(NAME, BODY)                  \
+    static void bench_##NAME(int iters) {         \
+        for (int _i = 0; _i < iters; _i++) {      \
+            BODY                                  \
+        }                                         \
+    }
+
+DEFINE_BENCH(malloc_64, {  // one 64-byte malloc/free pair
+    void *p = malloc(64); sink = p; free(p);
+})
+
+DEFINE_BENCH(malloc_2mb, {  // one 2 MiB malloc/free pair
+    void *p = malloc(2 * 1024 * 1024); sink = p; free(p);
+})
+
+DEFINE_BENCH(calloc_8x8, {  // calloc of 8 elements x 8 bytes, then free
+    void *p = calloc(8, 8); sink = p; free(p);
+})
+
+DEFINE_BENCH(realloc_grow, {  // malloc 64 bytes, grow to 256 via realloc, then free
+    void *p = malloc(64);
+    p = realloc(p, 256);
+    sink = p;
+    free(p);
+})
+
+DEFINE_BENCH(realloc_null, {  // realloc(NULL, ...) used as an allocation, then free
+    void *p = realloc(NULL, 128); sink = p; free(p);
+})
+
+DEFINE_BENCH(posix_memalign_1k, {  // 1024 bytes at 64-byte alignment, then free
+    void *p = NULL;
+    (void)posix_memalign(&p, 64, 1024);  // result ignored: p stays NULL on failure and free(NULL) is a no-op
+    sink = p;
+    free(p);
+})
+
+DEFINE_BENCH(malloc_x16, {  // 16 live 48-byte allocations, all freed afterwards
+    void *ptrs[16];
+    for (int i = 0; i < 16; i++) ptrs[i] = malloc(48);
+    sink = ptrs[0]; // defeat clang's malloc/free elision at -O2
+    for (int i = 0; i < 16; i++) free(ptrs[i]);
+})
+
+// ---- runner ----
+
+typedef void (*bench_fn)(int);
+
+typedef struct {
+    const char *name;
+    bench_fn    fn;
+    int         inner;   // iterations inside one trial
+} bench_t;
+
+#define B(NAME, INNER) { #NAME, bench_##NAME, INNER }
+
+static const bench_t benchmarks[] = {
+    B(malloc_64,           1000000),
+    B(calloc_8x8,          1000000),
+    B(realloc_null,        1000000),
+    B(realloc_grow,         500000),
+    B(posix_memalign_1k,   1000000),
+    B(malloc_x16,           200000),
+    B(malloc_2mb,            10000),
+};
+
+int main(void) {
+    // BENCH_LABEL names this run in the banner; fall back to a placeholder when it is unset.
+    const char *env_label = getenv("BENCH_LABEL");
+    const char *label = env_label ? env_label : "(no label)";
+    printf("== %s ==\n", label);
+    printf("%-22s %12s %12s %12s\n", "benchmark", "min ns/op", "median ns/op", "max ns/op");
+    printf("%-22s %12s %12s %12s\n", "---------", "---------", "------------", "---------");
+
+    const bench_t *const end = benchmarks + sizeof(benchmarks) / sizeof(benchmarks[0]);
+    for (const bench_t *bench = benchmarks; bench != end; ++bench) {
+        // Untimed warmup pass before sampling.
+        bench->fn(WARMUP_ITERS);
+
+        // Each sample is the average ns/op over one trial of `inner` iterations.
+        double ns_per_op[TRIALS];
+        for (int trial = 0; trial < TRIALS; ++trial) {
+            const double started = now_ns();
+            bench->fn(bench->inner);
+            const double elapsed = now_ns() - started;
+            ns_per_op[trial] = elapsed / (double)bench->inner;
+        }
+
+        // Ascending sort puts the min first, the median in the middle and the max last.
+        qsort(ns_per_op, TRIALS, sizeof ns_per_op[0], cmp_double);
+        printf("%-22s %12.2f %12.2f %12.2f\n",
+               bench->name, ns_per_op[0], ns_per_op[TRIALS / 2], ns_per_op[TRIALS - 1]);
+    }
+    return 0;
+}
diff --git a/scripts/bench_malloc.sh b/scripts/bench_malloc.sh
new file mode 100755
index 00000000..d8a8f58a
--- /dev/null
+++ b/scripts/bench_malloc.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+#
+# bench_malloc.sh — build scripts/bench_malloc.c once and run it twice:
+# under the system allocator and under jemalloc. Uses runtime injection
+# (DYLD_INSERT_LIBRARIES on macOS, LD_PRELOAD on Linux), so there's no
+# link-time difference between the two runs.
+#
+# Pre-requisites:
+#   - macOS: `brew install jemalloc` (or override JEMALLOC_LIB)
+#   - Linux: jemalloc installed (e.g. `apt install libjemalloc2`)
+#
+# Usage:
+#   ./scripts/bench_malloc.sh
+#
+# Env overrides:
+#   JEMALLOC_LIB   path to libjemalloc.{dylib,so}; auto-detected if unset.
+#   CC             compiler; defaults to the first of clang, gcc, cc found on PATH.
+#   CFLAGS         extra cflags; defaults to "-O2 -Wall -Wextra".
+
+set -euo pipefail
+
+# Use clang explicitly — `cc` is aliased to other things in many shells.
+CC="${CC:-$(command -v clang || command -v gcc || echo cc)}"
+CFLAGS="${CFLAGS:--O2 -Wall -Wextra}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SRC="${SCRIPT_DIR}/bench_malloc.c"
+BIN="$(mktemp -t bench_malloc.XXXXXX)"
+trap 'rm -f "$BIN"' EXIT
+
+step() { printf '\n\033[1;36m== %s\033[0m\n' "$*"; }
+fail() { printf '\033[31m## %s\033[0m\n' "$*" >&2; exit 1; }
+
+# --- locate jemalloc ---
+if [[ -z "${JEMALLOC_LIB:-}" ]]; then
+    case "$(uname -s)" in
+        Darwin)
+            for cand in \
+                /opt/homebrew/opt/jemalloc/lib/libjemalloc.2.dylib \
+                /opt/homebrew/opt/jemalloc/lib/libjemalloc.dylib \
+                /usr/local/opt/jemalloc/lib/libjemalloc.2.dylib \
+                /usr/local/opt/jemalloc/lib/libjemalloc.dylib; do
+                if [[ -f "$cand" ]]; then JEMALLOC_LIB="$cand"; break; fi
+            done
+            ;;
+        Linux)
+            for cand in \
+                /usr/lib/x86_64-linux-gnu/libjemalloc.so.2 \
+                /usr/lib/aarch64-linux-gnu/libjemalloc.so.2 \
+                /usr/lib64/libjemalloc.so.2 \
+                /usr/lib/libjemalloc.so.2 \
+                /usr/lib/x86_64-linux-gnu/libjemalloc.so \
+                /usr/lib/libjemalloc.so; do
+                if [[ -f "$cand" ]]; then JEMALLOC_LIB="$cand"; break; fi
+            done
+            ;;
+    esac
+fi
+[[ -n "${JEMALLOC_LIB:-}" && -f "$JEMALLOC_LIB" ]] \
+    || fail "jemalloc dylib not found — set JEMALLOC_LIB=/path/to/libjemalloc.{dylib,so}"
+
+# --- build ---
+step "Compiling $SRC"
+# shellcheck disable=SC2086
+"$CC" $CFLAGS -o "$BIN" "$SRC"
+
+# --- run system allocator ---
+step "Run 1 — system allocator"
+BENCH_LABEL="system" "$BIN"
+
+# --- run with jemalloc injected ---
+step "Run 2 — jemalloc (injected: $JEMALLOC_LIB)"
+case "$(uname -s)" in
+    Darwin)
+        BENCH_LABEL="jemalloc" \
+            DYLD_INSERT_LIBRARIES="$JEMALLOC_LIB" \
+            DYLD_FORCE_FLAT_NAMESPACE=1 \
+            "$BIN"
+        ;;
+    Linux)
+        BENCH_LABEL="jemalloc" \
+            LD_PRELOAD="$JEMALLOC_LIB" \
+            "$BIN"
+        ;;
+    *)
+        fail "Unsupported platform: $(uname -s)"
+        ;;
+esac
diff --git a/scripts/compare-malloc-local.sh b/scripts/compare-malloc-local.sh
new file mode 100755
index 00000000..4b3ab492
--- /dev/null
+++ b/scripts/compare-malloc-local.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+#
+# compare-malloc-local.sh — compare malloc counts between the legacy jemalloc
+# path (Swift 6.2 → Package@swift-6.2.swift) and the new custom interposer
+# (Swift 6.3 → Package.swift) using THIS repo's local
+# `MallocInterposerBenchmarks` target.
+#
+# These benchmarks have predictable per-iteration allocation counts, so any
+# drift between the two code paths is a regression. For "real workload"
+# comparison against swift-nio, see compare-malloc.sh instead.
+#
+# Mechanism:
+#   1. Runs `swift package benchmark baseline update <name>` once per
+#      toolchain via swiftly. SwiftPM picks the right Package*.swift
+#      manifest for each toolchain automatically.
+#   2. Calls `baseline compare` for the two recorded baselines.
+#
+# Pre-requisites:
+#   - swiftly with both toolchains installed.
+#
+# Usage:
+#   ./scripts/compare-malloc-local.sh [filter ...]
+#
+# Each positional arg becomes a `--filter` regex. With no args every
+# benchmark in the target runs.
+#
+# Env overrides:
+#   TOOLCHAIN_OLD   default 6.2.4
+#   TOOLCHAIN_NEW   default 6.3
+#   FRESH=1         use timestamp-suffixed scratch dirs (fresh build, no
+#                   cache reuse). Use this when a previous hung/zombie
+#                   process is holding a SwiftPM lock on .build-X and you
+#                   can't kill it. Trade-off: full rebuild each run.
+#   KEEP_FRESH=1    when FRESH=1, don't auto-delete the scratch dirs at
+#                   exit (default is to clean up on success).
+
+set -euo pipefail
+
+PB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+BENCH_DIR="${PB_DIR}/Benchmarks"
+TARGET="MallocInterposerBenchmarks"
+TOOLCHAIN_OLD="${TOOLCHAIN_OLD:-6.2.4}"
+TOOLCHAIN_NEW="${TOOLCHAIN_NEW:-6.3}"
+BASELINE_OLD="jemalloc-${TOOLCHAIN_OLD}"
+BASELINE_NEW="interposer-${TOOLCHAIN_NEW}"
+
+# Per-toolchain scratch paths so each toolchain has its own .build cache.
+# Without this, switching toolchains hits "module compiled with Swift X
+# cannot be imported by Y" errors on the cached Benchmark.swiftmodule.
+#
+# If FRESH=1 is set, append a timestamp suffix so this run can't collide
+# with a SwiftPM lock held by a previous (possibly hung) process. Trade-off:
+# no cache reuse — every run rebuilds from scratch.
+SCRATCH_SUFFIX=""
+if [[ "${FRESH:-0}" == "1" ]]; then
+  SCRATCH_SUFFIX="-fresh-$(date +%s)"
+fi
+SCRATCH_OLD="${BENCH_DIR}/.build-${TOOLCHAIN_OLD}${SCRATCH_SUFFIX}"
+SCRATCH_NEW="${BENCH_DIR}/.build-${TOOLCHAIN_NEW}${SCRATCH_SUFFIX}"
+
+step() { printf '\n\033[1;36m== %s\033[0m\n' "$*"; }
+warn() { printf '\033[33m!! %s\033[0m\n' "$*" >&2; }
+fail() {
+  printf '\033[31m## %s\033[0m\n' "$*" >&2
+  exit 1
+}
+
+[[ -d "$BENCH_DIR/Benchmarks/MallocInterposer" ]] ||
+  fail "MallocInterposer benchmark dir missing — expected $BENCH_DIR/Benchmarks/MallocInterposer"
+command -v swiftly >/dev/null || fail "swiftly required"
+
+# When FRESH=1, clean the throwaway scratch dirs on successful exit so they
+# don't accumulate. KEEP_FRESH=1 disables this if the user wants to inspect.
+if [[ "${FRESH:-0}" == "1" && "${KEEP_FRESH:-0}" != "1" ]]; then
+  cleanup_fresh() {
+    local rc=$?
+    if ((rc == 0)); then
+      rm -rf "$SCRATCH_OLD" "$SCRATCH_NEW" 2>/dev/null || true
+    else
+      warn "Run failed (exit $rc); leaving fresh scratch dirs for inspection:"
+      warn "  $SCRATCH_OLD"
+      warn "  $SCRATCH_NEW"
+    fi
+  }
+  trap cleanup_fresh EXIT
+fi
+
+cd "$BENCH_DIR"
+
+# Forward any positional args as --filter regexes.
+declare -a FILTER_ARGS=()
+for f in "$@"; do
+  FILTER_ARGS+=(--filter "$f")
+done
+
+run_jemalloc() {
+  # Records the jemalloc baseline. ${arr[@]+...} guard: bash < 4.4 treats an empty array as unbound under `set -u`.
+  step "Run 1: Swift $TOOLCHAIN_OLD (jemalloc) → baseline '$BASELINE_OLD'  [scratch: $SCRATCH_OLD]"
+  swiftly run +"$TOOLCHAIN_OLD" \
+    swift package --scratch-path "$SCRATCH_OLD" \
+    --allow-writing-to-package-directory benchmark \
+    baseline update "$BASELINE_OLD" \
+    --target "$TARGET" \
+    --quiet --no-progress \
+    ${FILTER_ARGS[@]+"${FILTER_ARGS[@]}"}
+}
+
+run_interposer() {
+  # Records the interposer baseline. ${arr[@]+...} guard: bash < 4.4 treats an empty array as unbound under `set -u`.
+  step "Run 2: Swift $TOOLCHAIN_NEW (interposer) → baseline '$BASELINE_NEW'  [scratch: $SCRATCH_NEW]"
+  swiftly run +"$TOOLCHAIN_NEW" \
+    swift package --scratch-path "$SCRATCH_NEW" \
+    --allow-writing-to-package-directory benchmark \
+    baseline update "$BASELINE_NEW" \
+    --target "$TARGET" \
+    --quiet --no-progress \
+    ${FILTER_ARGS[@]+"${FILTER_ARGS[@]}"}
+}
+
+run_jemalloc
+run_interposer
+
+step "Comparison: $BASELINE_OLD  vs  $BASELINE_NEW"
+swiftly run +"$TOOLCHAIN_NEW" \
+  swift package \
+  --scratch-path "$SCRATCH_NEW" \
+  benchmark baseline compare "$BASELINE_OLD" "$BASELINE_NEW" \
+  --target "$TARGET"
diff --git a/scripts/wrapper_overhead.c b/scripts/wrapper_overhead.c
new file mode 100644
index 00000000..008f22a7
--- /dev/null
+++ b/scripts/wrapper_overhead.c
@@ -0,0 +1,94 @@
+// wrapper_overhead.c — measure the cost of "being a wrapper" in isolation,
+// and (optionally) the additional cost of the real interposer's bookkeeping.
+//
+// Run the same malloc/free hot loop two or three times:
+//   1. With nothing preloaded → user code → libc allocator.
+//   2. With wrapper_overhead_passthrough.dylib preloaded → user code → our
+//      one-instruction tail-call wrapper → libc allocator.
+//      Delta from #1 = wrapper layer cost (no bookkeeping at all).
+//   3. (Optional) With the real malloc-interposer preloaded and counting
+//      enabled. Delta from #2 = bookkeeping cost (header + magic check +
+//      enable check + TLS pointer load + counter writes).
+//
+// To enable run #3, set INTERPOSER_DYLIB in the environment to the path of
+// the full interposer dylib/so. The harness will dlsym
+// `malloc_interposer_enable` and call it at startup so counting is on for
+// every measured iteration.
+//
+// Build + drive: see wrapper_overhead.sh in the same directory.
+#define _GNU_SOURCE  // glibc exposes RTLD_DEFAULT only under GNU extensions; must precede all includes
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define WARMUP_ITERS   10000
+#define INNER_ITERS  2000000
+#define TRIALS             9
+
+static void *volatile sink;  // the pointer itself is volatile, so every store in the hot loop is emitted
+
+static double now_ns(void) {  // monotonic clock reading, in nanoseconds
+    struct timespec stamp;
+    clock_gettime(CLOCK_MONOTONIC, &stamp);
+    return (double)stamp.tv_nsec + (double)stamp.tv_sec * 1e9;
+}
+
+static int cmp_double(const void *a, const void *b) {  // qsort comparator: ascending doubles
+    const double *lhs = a, *rhs = b;
+    return (*lhs > *rhs) - (*lhs < *rhs);
+}
+
+static void measure_pair(const char *name, size_t size) {
+    // Untimed warmup: primes tcache and lets dyld bind any lazy stubs.
+    for (int warm = 0; warm < WARMUP_ITERS; ++warm) {
+        void *block = malloc(size);
+        sink = block;
+        free(block);
+    }
+
+    double ns_per_pair[TRIALS];
+    for (int trial = 0; trial < TRIALS; ++trial) {
+        const double started = now_ns();
+        for (int iter = 0; iter < INNER_ITERS; ++iter) {
+            void *block = malloc(size);
+            sink = block;
+            free(block);
+        }
+        ns_per_pair[trial] = (now_ns() - started) / (double)INNER_ITERS;
+    }
+    // Sorted ascending, so [0] is the min, the middle slot the median and the last slot the max.
+    qsort(ns_per_pair, TRIALS, sizeof ns_per_pair[0], cmp_double);
+    printf("%-18s %10.2f %10.2f %10.2f\n",
+           name, ns_per_pair[0], ns_per_pair[TRIALS / 2], ns_per_pair[TRIALS - 1]);
+}
+
+int main(void) {
+    typedef void (*hook_fn)(void);
+    const char *env_label = getenv("BENCH_LABEL");
+    const char *label = env_label ? env_label : "(no label)";
+
+    // Look up the interposer's control hooks in the global symbol scope. When the enable hook
+    // exists, reset the counters (if that hook is exported too) and switch counting on, so every
+    // measured iteration pays the full bookkeeping cost. Both lookups yield NULL for the
+    // pass-through wrapper and the plain libc run, which then proceed untouched.
+    hook_fn enable_hook = (hook_fn)dlsym(RTLD_DEFAULT, "malloc_interposer_enable");
+    hook_fn reset_hook = (hook_fn)dlsym(RTLD_DEFAULT, "malloc_interposer_reset");
+    if (enable_hook != NULL) {
+        if (reset_hook != NULL) reset_hook();
+        enable_hook();
+        fprintf(stderr, "[%s] interposer counting enabled\n", label);
+    }
+
+    printf("== %s ==\n", label);
+    printf("%-18s %10s %10s %10s\n", "size", "min ns", "median", "max ns");
+    printf("%-18s %10s %10s %10s\n", "----", "------", "------", "------");
+
+    // One table row per allocation size.
+    measure_pair("malloc(64)+free", 64);
+    measure_pair("malloc(256)+free", 256);
+    measure_pair("malloc(1024)+free", 1024);
+    measure_pair("malloc(4096)+free", 4096);
+    return 0;
+}
diff --git a/scripts/wrapper_overhead.sh b/scripts/wrapper_overhead.sh
new file mode 100755
index 00000000..a01ea412
--- /dev/null
+++ b/scripts/wrapper_overhead.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+#
+# wrapper_overhead.sh — show the irreducible cost of "being a wrapper" in
+# isolation, with no header / no counters / no enable check / nothing.
+#
+# Builds two artifacts in a throwaway temp dir (removed on exit):
+#   - wrapper_overhead                     the hot-loop bench
+#   - libwrapper_passthrough.{dylib,so}    a do-nothing tail-call interposer
+#
+# Runs the bench twice, plus an optional third run when INTERPOSER_DYLIB is set:
+#   1. Plain  — user code → libc malloc.
+#   2. Wrapped — user code → tail-call wrapper → libc malloc.
+#
+# Whatever ns delta you see is the price of the extra function-call layer
+# alone. Anything you'd build on top (header, counters, enable check)
+# stacks on top of that.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="$(mktemp -d -t wrapper_overhead.XXXXXX)"
+trap 'rm -rf "$BUILD_DIR"' EXIT
+
+CC="${CC:-$(command -v clang || command -v gcc || echo cc)}"
+CFLAGS="${CFLAGS:--O2 -Wall -Wextra}"
+
+step() { printf '\n\033[1;36m== %s\033[0m\n' "$*"; }
+fail() { printf '\033[31m## %s\033[0m\n' "$*" >&2; exit 1; }
+
+# --- Build --- (the harness calls dlsym, which lives in libdl on glibc < 2.34, hence -ldl on Linux)
+step "Building bench harness + pass-through wrapper"
+"$CC" $CFLAGS -o "$BUILD_DIR/wrapper_overhead" "$SCRIPT_DIR/wrapper_overhead.c" $([[ "$(uname -s)" == Linux ]] && echo -ldl)
+
+# Collect injection env vars in a bash array so they pass cleanly to `env`.
+declare -a INJECT_ENV=()
+case "$(uname -s)" in
+    Darwin)
+        WRAPPER_LIB="$BUILD_DIR/libwrapper_passthrough.dylib"
+        "$CC" $CFLAGS -dynamiclib -o "$WRAPPER_LIB" \
+            "$SCRIPT_DIR/wrapper_overhead_passthrough.c"
+        INJECT_ENV+=("DYLD_INSERT_LIBRARIES=$WRAPPER_LIB" "DYLD_FORCE_FLAT_NAMESPACE=1")
+        ;;
+    Linux)
+        WRAPPER_LIB="$BUILD_DIR/libwrapper_passthrough.so"
+        "$CC" $CFLAGS -fPIC -shared -o "$WRAPPER_LIB" \
+            "$SCRIPT_DIR/wrapper_overhead_passthrough.c" -ldl
+        INJECT_ENV+=("LD_PRELOAD=$WRAPPER_LIB")
+        ;;
+    *)
+        fail "Unsupported platform: $(uname -s)"
+        ;;
+esac
+
+# --- Run plain ---
+step "Run 1 — plain (no wrapper)"
+BENCH_LABEL="plain" "$BUILD_DIR/wrapper_overhead"
+
+# --- Run wrapped ---
+step "Run 2 — pass-through wrapper preloaded ($(basename "$WRAPPER_LIB"))"
+env BENCH_LABEL="wrapped" "${INJECT_ENV[@]}" "$BUILD_DIR/wrapper_overhead"
+
+# --- Run full interposer (optional) ---
+# If the caller points us at the real malloc-interposer dylib, do a third run
+# with counting enabled. Delta from run #2 is the real bookkeeping cost.
+if [[ -n "${INTERPOSER_DYLIB:-}" ]]; then
+    if [[ ! -f "$INTERPOSER_DYLIB" ]]; then
+        fail "INTERPOSER_DYLIB=$INTERPOSER_DYLIB does not exist"
+    fi
+
+    declare -a FULL_INJECT=()
+    case "$(uname -s)" in
+        Darwin)
+            FULL_INJECT+=("DYLD_INSERT_LIBRARIES=$INTERPOSER_DYLIB"
+                          "DYLD_FORCE_FLAT_NAMESPACE=1")
+            ;;
+        Linux)
+            FULL_INJECT+=("LD_PRELOAD=$INTERPOSER_DYLIB")
+            ;;
+    esac
+
+    step "Run 3 — full malloc-interposer preloaded, counting ON"
+    env BENCH_LABEL="full-interposer" "${FULL_INJECT[@]}" "$BUILD_DIR/wrapper_overhead"
+fi
+
+cat <<'EOF'
+
+Reading the output:
+  delta(plain → wrapped)     = cost of the wrapper layer alone (no logic).
+  delta(wrapped → full)      = cost of header + magic check + enable check
+                               + TLS pointer + counter writes (the
+                               "bookkeeping" on top of the wrapper).
+  delta(plain → full)        = total interposer overhead vs. raw libc.
+
+If only runs 1 and 2 appear, set INTERPOSER_DYLIB=<path> to enable run 3.
+EOF
diff --git a/scripts/wrapper_overhead_passthrough.c b/scripts/wrapper_overhead_passthrough.c
new file mode 100644
index 00000000..146dae84
--- /dev/null
+++ b/scripts/wrapper_overhead_passthrough.c
@@ -0,0 +1,152 @@
+// wrapper_overhead_passthrough.c — a bare malloc/free interposer that does
+// NOTHING beyond what an empty wrapper does. No header, no counters, no
+// enable check, no TLS, no atomics. Each replacement_* is a single-
+// instruction tail call to libc.
+//
+// Used by wrapper_overhead.sh to isolate the cost of the wrapper layer
+// itself — independent of any bookkeeping you might layer on top.
+//
+// macOS path: DYLD_INTERPOSE entries route malloc/free through us via
+// the standard __DATA,__interpose section. Internal calls to malloc/free
+// inside this dylib resolve directly to libsystem.
+//
+// Linux path: defining `malloc` / `free` in an LD_PRELOAD'd shared object
+// overrides the global symbol resolution. We forward to the real libc
+// entries via dlsym(RTLD_NEXT, …). The resolve dance is a small one-time
+// cost amortised away after warmup, so it doesn't pollute the measurement.
+
+#define _GNU_SOURCE  // must precede every libc include, or glibc's <dlfcn.h> hides RTLD_NEXT
+#include <stdlib.h>
+#if defined(__APPLE__)
+// DYLD_INTERPOSE(new, old): emits a {replacement, replacee} pointer pair into __DATA,__interpose.
+#define DYLD_INTERPOSE(_replacement, _replacee)                                 \
+    __attribute__((used)) static struct {                                       \
+        const void *replacement;                                                \
+        const void *replacee;                                                   \
+    } _interpose_##_replacee __attribute__((section("__DATA,__interpose"))) = { \
+        (const void *)&_replacement, (const void *)&_replacee                   \
+    };
+
+void *replacement_malloc(size_t size)                  { return malloc(size);          }
+void  replacement_free(void *p)                        { free(p);                       }
+void *replacement_calloc(size_t n, size_t s)           { return calloc(n, s);           }
+void *replacement_realloc(void *p, size_t s)           { return realloc(p, s);          }
+void *replacement_reallocf(void *p, size_t s)          { return reallocf(p, s);         }
+void *replacement_valloc(size_t s)                     { return valloc(s);              }
+int   replacement_posix_memalign(void **m, size_t a, size_t s) {
+    return posix_memalign(m, a, s);
+}
+
+DYLD_INTERPOSE(replacement_malloc,        malloc)
+DYLD_INTERPOSE(replacement_free,          free)
+DYLD_INTERPOSE(replacement_calloc,        calloc)
+DYLD_INTERPOSE(replacement_realloc,       realloc)
+DYLD_INTERPOSE(replacement_reallocf,      reallocf)
+DYLD_INTERPOSE(replacement_valloc,        valloc)
+DYLD_INTERPOSE(replacement_posix_memalign, posix_memalign)
+
+#else  /* Linux */
+
+// On Linux we resolve the real libc functions via dlsym(RTLD_NEXT, …) and
+// cache the function pointers. The wrinkle: dlsym itself can call calloc
+// internally during symbol resolution, which would recurse back into our
+// hooks. We guard against that with a thread-local "in dlsym" flag and a
+// small static bootstrap buffer that absorbs any allocations made while
+// resolving.
+//
+// After resolution completes (which happens during the constructor, before
+// the bench's hot loop runs), the steady-state hot path is just:
+//     ldr  x_real_fn
+//     blr  x_real_fn
+// — one load, one indirect call. Same shape as glibc's own PLT stub, so
+// the wrapper-layer cost is just the extra branch.
+
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <string.h>
+
+static void *(*real_malloc)(size_t)         = NULL;
+static void  (*real_free)(void *)           = NULL;
+static void *(*real_calloc)(size_t, size_t) = NULL;
+static void *(*real_realloc)(void *, size_t)= NULL;
+
+// TLS guard: set while we're inside dlsym so any reentrant malloc/calloc/
+// realloc/free calls go to the bootstrap path instead of recursing.
+static __thread int g_in_resolve = 0;
+
+// Small static buffer for allocations made during dlsym resolution.
+// 64 KiB is more than enough — dlsym typically allocates only a handful of
+// small objects during the first call.
+static char   g_boot_mem[64 * 1024];
+static size_t g_boot_off = 0;
+
+static int boot_owns(const void *p) {  // non-zero iff p points into the bootstrap arena
+    const char *addr = (const char *)p;
+    return addr >= g_boot_mem && addr < g_boot_mem + sizeof(g_boot_mem);
+}
+
+static void *boot_alloc(size_t n) {  // bump allocator over g_boot_mem; returns NULL when the request does not fit
+    size_t aligned = (n + 15) & ~(size_t)15;  // round up to 16 bytes (wraps for n near SIZE_MAX)
+    if (aligned < n || aligned > sizeof(g_boot_mem) - g_boot_off) return NULL;  // overflow-safe bounds check
+    void *p = g_boot_mem + g_boot_off;
+    g_boot_off += aligned;
+    return p;
+}
+
+static void resolve_real(void) {  // caches the RTLD_NEXT definitions of the four hooked functions
+    g_in_resolve = 1;  // hooks reentered on this thread now take their bootstrap path
+    real_malloc  = dlsym(RTLD_NEXT, "malloc");
+    real_free    = dlsym(RTLD_NEXT, "free");
+    real_calloc  = dlsym(RTLD_NEXT, "calloc");
+    real_realloc = dlsym(RTLD_NEXT, "realloc");
+    g_in_resolve = 0;  // resolution finished; a pointer left NULL means that lookup failed
+}
+
+__attribute__((constructor)) static void preresolve(void) {
+    resolve_real();
+}
+
+void *malloc(size_t s) {
+    if (__builtin_expect(real_malloc != NULL, 1)) return real_malloc(s);  // steady state: forward
+    if (g_in_resolve) return boot_alloc(s);  // reentered while resolving: serve from the static arena
+    resolve_real();  // not resolved yet: resolve on demand
+    return real_malloc ? real_malloc(s) : boot_alloc(s);  // the arena is the fallback if lookup failed
+}
+
+void free(void *p) {
+    if (!p) return;
+    if (boot_owns(p)) return;       // bootstrap blocks have no underlying chunk
+    if (__builtin_expect(real_free != NULL, 1)) { real_free(p); return; }  // steady state: forward
+    if (g_in_resolve) return;  // mid-resolve with no real free available yet: drop the request
+    resolve_real();  // not resolved yet: resolve on demand
+    if (real_free) real_free(p);  // silently dropped if the lookup failed
+}
+
+void *calloc(size_t n, size_t s) {
+    if (__builtin_expect(real_calloc != NULL, 1)) return real_calloc(n, s);
+    if (!g_in_resolve) {
+        resolve_real();  // not resolved yet: resolve on demand, then retry the real calloc
+        if (real_calloc) return real_calloc(n, s);
+    }
+    // Bootstrap path (mid-resolve, or the lookup failed): serve zeroed memory from the static
+    // arena, refusing requests whose n * s product would wrap around size_t.
+    if (s != 0 && n > (size_t)-1 / s) return NULL;
+    void *p = boot_alloc(n * s);
+    if (p) memset(p, 0, n * s);
+    return p;
+}
+
+void *realloc(void *p, size_t s) {
+    if (boot_owns(p)) {
+        // A bootstrap block can't grow in place and its size is unknown: copy out, clamped to the arena end.
+        size_t room = (size_t)(g_boot_mem + sizeof(g_boot_mem) - (const char *)p);
+        void *np = malloc(s);
+        if (np) memcpy(np, p, s < room ? s : room);
+        return np;
+    }
+    if (__builtin_expect(real_realloc != NULL, 1)) return real_realloc(p, s);
+    if (!g_in_resolve) resolve_real();  // not resolved yet: resolve on demand
+    return real_realloc ? real_realloc(p, s) : boot_alloc(s);  // arena fallback mid-resolve or on failed lookup
+}
+
+#endif