diff --git a/.gitignore b/.gitignore index 0f356338..dba32f05 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +.DS_Store + ## User settings xcuserdata/ @@ -57,6 +60,7 @@ Package.resolved .swiftpm .DS_Store .build/ +.build-*/ # CocoaPods # diff --git a/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift b/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift new file mode 100644 index 00000000..eb0ad5f0 --- /dev/null +++ b/Benchmarks/Benchmarks/MallocInterposer/MallocInterposer.swift @@ -0,0 +1,202 @@ +// +// Copyright (c) 2026 Ordo One AB +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Regression benchmarks for the malloc interposer. Each benchmark performs +// a known, fixed number of allocations per iteration so the reported +// per-iteration counts (mallocCountTotal / freeCountTotal / etc.) line up +// with the expected values noted in the benchmark name. Drift between the +// jemalloc and interposer code paths — or between branches — shows up +// immediately as a count mismatch. +// +// Counts are scaled per iteration: with .kilo scaling, one malloc inside +// the body produces "1" in the count column, not "1000". 
+
+import Benchmark
+
+#if canImport(Darwin)
+import Darwin
+#elseif canImport(Glibc)
+import Glibc
+#elseif canImport(Musl)
+import Musl
+#else
+#error("Unsupported Platform")
+#endif
+
+let mallocMetrics: [BenchmarkMetric] = [
+    .wallClock,
+    .mallocCountSmall,
+    .mallocCountLarge,
+    .mallocCountTotal,
+    .freeCountTotal,
+    .mallocBytesCount,
+    .mallocFreeDelta,
+    .memoryLeakedBytes,
+]
+
+let benchmarks: @Sendable () -> Void = {
+    Benchmark.defaultConfiguration = .init(
+        metrics: mallocMetrics,
+        warmupIterations: 1,
+        scalingFactor: .kilo,
+        maxDuration: .seconds(1),
+        maxIterations: 100
+    )
+
+    // Sanity floor: the body allocates nothing, so every count should be (close to) zero.
+    // Any per-iteration overhead added by the framework itself shows up here,
+    // making this the reference for what "no allocations" looks like.
+    Benchmark("Noop") { benchmark in
+        for _ in benchmark.scaledIterations {
+            blackHole(0)
+        }
+    }
+
+    // Plain malloc/free pair with a sub-page request (64 bytes) — should land in
+    // mallocCountSmall, not mallocCountLarge.
+    // Expected per iter: malloc=1 (small=1, large=0), free=1, leaked=0.
+    Benchmark("Malloc 64B + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = malloc(64)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // Larger-than-page allocation (2 MiB) — should land in mallocCountLarge.
+    // Expected per iter: malloc=1 (small=0, large=1), free=1.
+    Benchmark("Malloc 2 MiB + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = malloc(2 * 1_024 * 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // calloc(8, 8) must count as a single allocation, exactly like malloc + memset.
+    // Expected per iter: malloc=1, free=1.
+    Benchmark("Calloc 8x8 + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = calloc(8, 8)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // realloc(grow) on success: counted as an implicit free of the old block + an alloc of the new one.
+    // Expected per iter: malloc=2, free=2 (including the explicit malloc and free).
+    Benchmark("Realloc grow 64→256 + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let original = malloc(64)
+            let grown = realloc(original, 256)
+            blackHole(grown)
+            free(grown ?? original) // a failed realloc returns nil and leaves `original` allocated
+        }
+    }
+
+    // realloc(NULL, size) is a pure malloc — no implicit free.
+    // Expected per iter: malloc=1, free=1.
+    Benchmark("Realloc(NULL, 128) + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = realloc(nil, 128)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // realloc(p, 0) is implementation-defined: glibc frees p and returns NULL, Darwin frees p but returns a minimum-sized block.
+    // Expected per iter (glibc): malloc=1, free=1. Any block handed back is freed below so nothing leaks.
+    Benchmark("Malloc + realloc(p, 0)") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let resized = realloc(malloc(64), 0)
+            blackHole(resized) // nil on glibc
+            if let resized { free(resized) } // non-nil where realloc(p, 0) allocates (Darwin)
+        }
+    }
+
+    // posix_memalign — separate code path that's easy to forget to count.
+    // Expected per iter: malloc=1, free=1.
+    Benchmark("posix_memalign(64, 1024) + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            var ptr: UnsafeMutableRawPointer? // fresh nil each iteration: a failed call may leave the out-parameter untouched
+            _ = posix_memalign(&ptr, 64, 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+
+    // C11 aligned_alloc — currently only intercepted on Linux. On Darwin the
+    // symbol isn't in the DYLD_INTERPOSE list, so the malloc count would drop;
+    // the benchmark is therefore compiled out there by the #if below.
+    // Expected per iter (Linux): malloc=1, free=1.
+    // For reference (Darwin, if enabled): malloc=0 (not interposed), free=1.
+    #if !canImport(Darwin)
+    Benchmark("aligned_alloc(64, 1024) + free") { benchmark in
+        for _ in benchmark.scaledIterations {
+            let ptr = aligned_alloc(64, 1_024)
+            blackHole(ptr)
+            free(ptr)
+        }
+    }
+    #endif
+
+    // Batched mallocs in a single iteration — verifies the counter scales
+    // linearly and isn't accidentally collapsed/de-duplicated.
+    // Expected per iter: malloc=16, free=16.
+ Benchmark("Malloc x16 + free x16") { benchmark in + let count = 16 + let buf = UnsafeMutablePointer.allocate(capacity: count) + defer { buf.deallocate() } + buf.update(repeating: nil, count: count) + + for _ in benchmark.scaledIterations { + for i in 0.. 0 and stable + // between runs. + Benchmark("Swift Array(repeating:0, count:128)") { benchmark in + for _ in benchmark.scaledIterations { + var arr = [Int](repeating: 0, count: 128) + arr.withUnsafeMutableBufferPointer { buf in + blackHole(buf.baseAddress) + } + } + } + + // Heap-allocated String (must exceed the small-string inline limit of + // 15 bytes). Same caveat as Array — count is stdlib-dependent but must + // be stable. + Benchmark("Swift String (long, heap)") { benchmark in + for _ in benchmark.scaledIterations { + let str = String(repeating: "x", count: 256) + blackHole(str) + } + } +} diff --git a/Benchmarks/Package.resolved b/Benchmarks/Package.resolved index 86e322d3..2c543468 100644 --- a/Benchmarks/Package.resolved +++ b/Benchmarks/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "f1d359a544b71b52c6788ad2e4cd2952f7f166b62ddb07316768f66be7ba4099", + "originHash" : "beddb8cb97cf892b8a2c00081488d118648e1609b6467ece2ea9cd075a22b282", "pins" : [ { "identity" : "hdrhistogram-swift", @@ -11,21 +11,21 @@ } }, { - "identity" : "package-datetime", + "identity" : "malloc-interposer", "kind" : "remoteSourceControl", - "location" : "https://github.com/ordo-one/package-datetime", + "location" : "https://github.com/ordo-one/malloc-interposer.git", "state" : { - "revision" : "d1242188c9f48aad297e6ca9b717776f8660bc31", - "version" : "1.0.2" + "revision" : "d9ca5ad6d85622fb2bd5b3d3387ba064dbcab1c2", + "version" : "1.0.0" } }, { - "identity" : "package-jemalloc", + "identity" : "package-datetime", "kind" : "remoteSourceControl", - "location" : "https://github.com/ordo-one/package-jemalloc", + "location" : "https://github.com/ordo-one/package-datetime", "state" : { - "revision" : 
"e8a5db026963f5bfeac842d9d3f2cc8cde323b49", - "version" : "1.0.0" + "revision" : "d1242188c9f48aad297e6ca9b717776f8660bc31", + "version" : "1.0.2" } }, { @@ -33,8 +33,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-argument-parser", "state" : { - "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", - "version" : "1.3.0" + "revision" : "6a52f3251125d74daf04fcbd5e6f08a75d074382", + "version" : "1.8.2" } }, { diff --git a/Benchmarks/Package.swift b/Benchmarks/Package.swift index 7f75a1fd..ec44ccb0 100644 --- a/Benchmarks/Package.swift +++ b/Benchmarks/Package.swift @@ -77,3 +77,20 @@ package.targets += [ ] ) ] + +// Regression coverage for the malloc interposer: predictable allocation +// patterns (counts known per iteration) so any drift between jemalloc and +// interposer code paths is immediately visible in mallocCountTotal / +// freeCountTotal / mallocFreeDelta / memoryLeakedBytes. +package.targets += [ + .executableTarget( + name: "MallocInterposerBenchmarks", + dependencies: [ + .product(name: "Benchmark", package: "benchmark") + ], + path: "Benchmarks/MallocInterposer", + plugins: [ + .plugin(name: "BenchmarkPlugin", package: "benchmark") + ] + ) +] diff --git a/Package.swift b/Package.swift index 0a48e622..71c4c543 100644 --- a/Package.swift +++ b/Package.swift @@ -1,19 +1,23 @@ -// swift-tools-version: 6.1 +// swift-tools-version: 6.3 import PackageDescription import class Foundation.ProcessInfo -// If the environment variable BENCHMARK_DISABLE_JEMALLOC is set disable Jemalloc trait (backward compatibility) -let disableJemalloc = ProcessInfo.processInfo.environment["BENCHMARK_DISABLE_JEMALLOC"] != nil - -let defaultTraits: Set - -if disableJemalloc { - defaultTraits = [] -} else { - defaultTraits = ["Jemalloc"] -} +// When MALLOC_INTERPOSER_LOCAL_PATH is set, use a local checkout of the +// malloc-interposer package instead of the published GitHub URL. Useful +// when iterating on the interposer alongside this package. 
+let mallocInterposerDependency: Package.Dependency = { + if let localPath = ProcessInfo.processInfo.environment["MALLOC_INTERPOSER_LOCAL_PATH"], + localPath.isEmpty == false + { + return .package(path: localPath) + } + return .package( + url: "https://github.com/ordo-one/malloc-interposer.git", + .upToNextMajor(from: "1.0.0") + ) +}() var packageDependencies: [Package.Dependency] = [ .package(url: "https://github.com/apple/swift-system.git", .upToNextMajor(from: "1.1.0")), @@ -21,12 +25,12 @@ var packageDependencies: [Package.Dependency] = [ .package(url: "https://github.com/ordo-one/TextTable.git", .upToNextMajor(from: "0.0.1")), .package(url: "https://github.com/HdrHistogram/hdrhistogram-swift.git", .upToNextMajor(from: "0.1.4")), .package(url: "https://github.com/apple/swift-atomics.git", .upToNextMajor(from: "1.0.0")), - .package(url: "https://github.com/ordo-one/package-jemalloc.git", .upToNextMajor(from: "1.0.0")), + mallocInterposerDependency, ] #if os(Linux) && compiler(>=6.3) packageDependencies += [ - .package(url: "https://github.com/ordo-one/swift-runtime-interposer.git", .upToNextMajor(from: "1.0.0")), + .package(url: "https://github.com/ordo-one/swift-runtime-interposer.git", .upToNextMajor(from: "1.0.0")) ] #endif @@ -39,13 +43,21 @@ var benchmarkDependencies: [Target.Dependency] = [ .product(name: "Atomics", package: "swift-atomics"), "SwiftRuntimeHooks", "BenchmarkShared", - .product(name: "jemalloc", package: "package-jemalloc", condition: .when(platforms: [.macOS, .linux], traits: ["Jemalloc"])), + .product(name: "MallocInterposerSwift", package: "malloc-interposer"), ] #if os(Linux) && compiler(>=6.3) benchmarkDependencies += [ - .product(name: "SwiftRuntimeInterposerC", package: "swift-runtime-interposer", condition: .when(platforms: [.linux])), - .product(name: "SwiftRuntimeInterposerSwift", package: "swift-runtime-interposer", condition: .when(platforms: [.linux])), + .product( + name: "SwiftRuntimeInterposerC", + package: 
"swift-runtime-interposer", + condition: .when(platforms: [.linux]) + ), + .product( + name: "SwiftRuntimeInterposerSwift", + package: "swift-runtime-interposer", + condition: .when(platforms: [.linux]) + ), ] #endif @@ -63,10 +75,6 @@ let package = Package( targets: ["Benchmark"] ), ], - traits: [ - .trait(name: "Jemalloc"), - .default(enabledTraits: defaultTraits), - ], dependencies: packageDependencies, targets: [ .target( diff --git a/Package@swift-6.2.swift b/Package@swift-6.2.swift new file mode 100644 index 00000000..2f8c1f0a --- /dev/null +++ b/Package@swift-6.2.swift @@ -0,0 +1,148 @@ +// swift-tools-version: 6.1 + +import PackageDescription + +import class Foundation.ProcessInfo + +// If the environment variable BENCHMARK_DISABLE_JEMALLOC is set disable Jemalloc trait (backward compatibility) +let disableJemalloc = ProcessInfo.processInfo.environment["BENCHMARK_DISABLE_JEMALLOC"] != nil + +let defaultTraits: Set + +if disableJemalloc { + defaultTraits = [] +} else { + defaultTraits = ["Jemalloc"] +} + +let package = Package( + name: "Benchmark", + platforms: [ + .macOS(.v13), + .iOS(.v16), + ], + products: [ + .plugin(name: "BenchmarkCommandPlugin", targets: ["BenchmarkCommandPlugin"]), + .plugin(name: "BenchmarkPlugin", targets: ["BenchmarkPlugin"]), + .library( + name: "Benchmark", + targets: ["Benchmark"] + ), + ], + traits: [ + .trait(name: "Jemalloc"), + .default(enabledTraits: defaultTraits), + ], + dependencies: [ + .package(url: "https://github.com/apple/swift-system.git", .upToNextMajor(from: "1.1.0")), + .package(url: "https://github.com/apple/swift-argument-parser.git", "1.1.0"..<"1.6.0"), + .package(url: "https://github.com/ordo-one/TextTable.git", .upToNextMajor(from: "0.0.1")), + .package(url: "https://github.com/HdrHistogram/hdrhistogram-swift.git", .upToNextMajor(from: "0.1.4")), + .package(url: "https://github.com/apple/swift-atomics.git", .upToNextMajor(from: "1.0.0")), + .package(url: "https://github.com/ordo-one/package-jemalloc.git", 
.upToNextMajor(from: "1.0.0")), + ], + targets: [ + .target( + name: "Benchmark", + dependencies: [ + .product(name: "Histogram", package: "hdrhistogram-swift"), + .product(name: "ArgumentParser", package: "swift-argument-parser"), + .product(name: "SystemPackage", package: "swift-system"), + .byNameItem(name: "CDarwinOperatingSystemStats", condition: .when(platforms: [.macOS, .iOS])), + .byNameItem(name: "CLinuxOperatingSystemStats", condition: .when(platforms: [.linux])), + .product(name: "Atomics", package: "swift-atomics"), + "SwiftRuntimeHooks", + "BenchmarkShared", + .product( + name: "jemalloc", package: "package-jemalloc", condition: .when(platforms: [.macOS, .linux], traits: ["Jemalloc"])), + ], + swiftSettings: [.swiftLanguageMode(.v5)] + ), + // Plugins used by users of the package + + // The actual 'benchmark' command plugin + .plugin( + name: "BenchmarkCommandPlugin", + capability: .command( + intent: .custom( + verb: "benchmark", + description: "Run the Benchmark performance test suite." 
+ ) + ), + dependencies: [ + "BenchmarkTool" + ], + path: "Plugins/BenchmarkCommandPlugin" + ), + + // Plugin that generates the boilerplate needed to interface with the Benchmark infrastructure + .plugin( + name: "BenchmarkPlugin", + capability: .buildTool(), + dependencies: [ + "BenchmarkBoilerplateGenerator" + ], + path: "Plugins/BenchmarkPlugin" + ), + + // Tool that the plugin executes to perform the actual work, the real benchmark driver + .executableTarget( + name: "BenchmarkTool", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + .product(name: "SystemPackage", package: "swift-system"), + .product(name: "TextTable", package: "TextTable"), + "Benchmark", + "BenchmarkShared", + ], + path: "Plugins/BenchmarkTool", + swiftSettings: [.swiftLanguageMode(.v5)] + ), + + // Tool that generates the boilerplate + .executableTarget( + name: "BenchmarkBoilerplateGenerator", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + .product(name: "SystemPackage", package: "swift-system"), + ], + path: "Plugins/BenchmarkBoilerplateGenerator" + ), + + // Tool that simply generates the man page for the BenchmarkPlugin as we can't use SAP in it... 
:-/ + .executableTarget( + name: "BenchmarkHelpGenerator", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + "BenchmarkShared", + ], + path: "Plugins/BenchmarkHelpGenerator" + ), + + // Getting OS specific information + .target( + name: "CDarwinOperatingSystemStats", + dependencies: [], + path: "Platform/CDarwinOperatingSystemStats" + ), + + // Getting OS specific information + .target( + name: "CLinuxOperatingSystemStats", + dependencies: [], + path: "Platform/CLinuxOperatingSystemStats" + ), + + // Hooks for ARC + .target(name: "SwiftRuntimeHooks"), + + // Shared definitions + .target(name: "BenchmarkShared"), + + .testTarget( + name: "BenchmarkTests", + dependencies: ["Benchmark"], + swiftSettings: [.swiftLanguageMode(.v5)] + ), + ] +) diff --git a/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift b/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift index 22650d62..ff7f7ab2 100644 --- a/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift +++ b/Plugins/BenchmarkCommandPlugin/BenchmarkCommandPlugin.swift @@ -11,6 +11,7 @@ // 'Benchmark' plugin that is responsible for gathering command line arguments and then // Running the `BenchmarkTool` for each benchmark target. 
+@preconcurrency import Foundation import PackagePlugin @preconcurrency import Foundation @@ -174,6 +175,7 @@ import PackagePlugin let packageBenchmarkIdentifiers: Set = ["benchmark", "package-benchmark"] let benchmarkToolName = "BenchmarkTool" let benchmarkTool: PackagePlugin.Path // = try context.tool(named: benchmarkToolName) + let interposerLib: String // Resolve which identifier this consumer actually has the benchmark package under, // so generated boilerplate matches what SPM sees (depends on whether they pinned @@ -419,10 +421,7 @@ import PackagePlugin } // Build the BenchmarkTool manually in release mode to work around https://github.com/apple/swift-package-manager/issues/7210 - guard - let benchmarkToolModule = benchmarkToolModuleTargets.first(where: { - $0.kind == .executable && $0.name == benchmarkToolName - }) + guard let benchmarkToolModule = benchmarkToolModuleTargets.first(where: { $0.kind == .executable && $0.name == benchmarkToolName }) else { print("Benchmark failed to find the BenchmarkTool target.") throw MyError.buildFailed @@ -457,6 +456,7 @@ import PackagePlugin } benchmarkTool = tool.path + interposerLib = tool.path.removingLastComponent().appending(subpath: "libMallocInterposerSwift.so").string #if os(Linux) && compiler(>=6.3) let swiftRuntimeInterposerLib = tool.path.removingLastComponent() .appending(subpath: "libSwiftRuntimeInterposerC.so").string @@ -542,6 +542,8 @@ import PackagePlugin return } + // On Linux we need to set LD_PRELOAD to get the malloc interposer working + // while on Darwin this is done with DYLD interpose mechanism #if os(Linux) && compiler(>=6.3) if shouldEmitRuntimeInterposerWarning(outputFormat: outputFormat, exportPath: exportPath) { writeToStderr( @@ -551,9 +553,9 @@ import PackagePlugin var environment = ProcessInfo.processInfo.environment if let existingPreload = environment["LD_PRELOAD"], existingPreload.isEmpty == false { - environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(existingPreload)" + 
environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(interposerLib):\(existingPreload)" } else { - environment["LD_PRELOAD"] = swiftRuntimeInterposerLib + environment["LD_PRELOAD"] = "\(swiftRuntimeInterposerLib):\(interposerLib)" } let envp = environment.map { "\($0.key)=\($0.value)" }.compactMap { $0.withCString(strdup) } + [nil] diff --git a/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift b/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift index 8c91fac0..f09c4c69 100644 --- a/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift +++ b/Plugins/BenchmarkCommandPlugin/BenchmarkPlugin+Help.swift @@ -53,8 +53,9 @@ let help = Benchmark targets matching the regexp filter that should be skipped --format The output format to use, default is 'text' (values: text, markdown, influx, jmh, jsonSmallerIsBetter, jsonBiggerIsBetter, histogramEncoded, histogram, histogramSamples, histogramPercentiles, metricP90AbsoluteThresholds) --metric Specifies that the benchmark run should use one or more specific metrics instead of the ones defined by the benchmarks. 
(values: cpuUser, cpuSystem, cpuTotal, wallClock, throughput, - peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, allocatedResidentMemory, memoryLeaked, syscalls, contextSwitches, threads, - threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom) + peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, freeCountTotal, mallocBytesCount, mallocFreeDelta, + allocatedResidentMemory, memoryLeaked, memoryLeakedBytes, syscalls, contextSwitches, threads, threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, + readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom) --path The path to operate on for data export or threshold operations, default is the current directory (".") for exports and the ("./Thresholds") directory for thresholds. 
--quiet Specifies that output should be suppressed (useful for if you just want to check return code) --scale Specifies that some of the text output should be scaled using the scalingFactor (denoted by '*' in output) diff --git a/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift b/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift index 979daee2..5cc9920b 100644 --- a/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift +++ b/Plugins/BenchmarkHelpGenerator/BenchmarkHelpGenerator.swift @@ -26,8 +26,12 @@ let availableMetrics = [ "mallocCountSmall", "mallocCountLarge", "mallocCountTotal", + "freeCountTotal", + "mallocBytesCount", + "mallocFreeDelta", "allocatedResidentMemory", "memoryLeaked", + "memoryLeakedBytes", "syscalls", "contextSwitches", "threads", diff --git a/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift b/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift index 5529d71f..79e3f678 100644 --- a/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift +++ b/Plugins/BenchmarkTool/BenchmarkTool+Operations.swift @@ -31,7 +31,7 @@ extension BenchmarkTool { let benchmarkReply = try read() switch benchmarkReply { - case let .list(benchmark): + case .list(let benchmark): benchmark.executablePath = benchmarkPath benchmark.target = FilePath(benchmarkPath).lastComponent!.description if metrics.isEmpty == false { @@ -40,7 +40,7 @@ extension BenchmarkTool { benchmarks.append(benchmark) case .end: break outerloop - case let .error(description): + case .error(let description): failBenchmark(description) break outerloop default: @@ -57,12 +57,12 @@ extension BenchmarkTool { let benchmarkReply = try read() switch benchmarkReply { - case let .result(benchmark: benchmark, results: results): + case .result(benchmark: let benchmark, results: let results): let filteredResults = results.filter { benchmark.configuration.metrics.contains($0.metric) } benchmarkResults[BenchmarkIdentifier(target: target, name: benchmark.name)] = filteredResults case 
.end: break outerloop - case let .error(description): + case .error(let description): failBenchmark(description, exitCode: .benchmarkJobFailed, "\(target)/\(benchmark.name)") benchmarkResults[BenchmarkIdentifier(target: target, name: benchmark.name)] = [] diff --git a/Sources/Benchmark/BenchmarkExecutor+Extensions.swift b/Sources/Benchmark/BenchmarkExecutor+Extensions.swift index 559741f2..1a9d54fe 100644 --- a/Sources/Benchmark/BenchmarkExecutor+Extensions.swift +++ b/Sources/Benchmark/BenchmarkExecutor+Extensions.swift @@ -23,22 +23,66 @@ extension BenchmarkExecutor { extension BenchmarkExecutor { func mallocStatsProducerNeeded(_ metric: BenchmarkMetric) -> Bool { switch metric { - case .mallocCountLarge: - return true case .memoryLeaked: + #if canImport(MallocInterposerSwift) + return false + #else return true - case .mallocCountSmall: + #endif + case .memoryLeakedBytes: return true + case .mallocFreeDelta: + #if canImport(MallocInterposerSwift) + return true + #else + return false + #endif case .mallocCountTotal: return true + case .mallocCountSmall: + return true + case .mallocCountLarge: + return true + case .mallocBytesCount: + return true case .allocatedResidentMemory: return true + case .freeCountTotal: + return true default: return false } } } +extension BenchmarkExecutor { + /// Maps a measured window's interposer counter deltas to the `(metric, value)` pairs to record. + /// + /// Extracted as a pure function so the leak/scaling arithmetic can be unit-tested without a live + /// interposer. `mallocFreeDelta` / `memoryLeakedBytes` are clamped to `0`: a net-negative window + /// (more frees than mallocs — e.g. freeing a warmup survivor, or cross-thread frees) is not a + /// leak, and clamping records a `0` sample rather than letting `Statistics.add` drop it, which + /// would desync the column's sample count and bias the average upward. 
+ static func mallocStatistics( // swiftlint:disable:this function_parameter_count + mallocCountDelta: Int, + mallocBytesDelta: Int, + mallocSmallDelta: Int, + mallocLargeDelta: Int, + freeCountDelta: Int, + freeBytesDelta: Int + ) -> [(metric: BenchmarkMetric, value: Int)] { + [ + (.mallocCountTotal, mallocCountDelta), + (.mallocBytesCount, mallocBytesDelta), + (.mallocCountSmall, mallocSmallDelta), + (.mallocCountLarge, mallocLargeDelta), + (.freeCountTotal, freeCountDelta), + (.mallocFreeDelta, max(0, mallocCountDelta - freeCountDelta)), + (.memoryLeakedBytes, max(0, mallocBytesDelta - freeBytesDelta)), + ] + } +} + extension BenchmarkExecutor { func operatingSystemsStatsProducerNeeded(_ metric: BenchmarkMetric) -> Bool { switch metric { diff --git a/Sources/Benchmark/BenchmarkExecutor.swift b/Sources/Benchmark/BenchmarkExecutor.swift index 99e07086..ead3edc1 100644 --- a/Sources/Benchmark/BenchmarkExecutor.swift +++ b/Sources/Benchmark/BenchmarkExecutor.swift @@ -8,6 +8,10 @@ // http://www.apache.org/licenses/LICENSE-2.0 // +#if canImport(MallocInterposerSwift) +import MallocInterposerSwift +#endif + #if canImport(OSLog) import OSLog #endif @@ -25,8 +29,13 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length // swiftlint:disable cyclomatic_complexity function_body_length func run(_ benchmark: Benchmark) -> [BenchmarkResult] { var wallClockDuration: Duration = .zero + #if canImport(MallocInterposerSwift) + var startMallocStats = MallocInterposerSwift.Statistics() + var stopMallocStats = MallocInterposerSwift.Statistics() + #else var startMallocStats = MallocStats() var stopMallocStats = MallocStats() + #endif var startOperatingSystemStats = OperatingSystemStats() var stopOperatingSystemStats = OperatingSystemStats() var startPerformanceCounters = PerformanceCounters() @@ -106,9 +115,6 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length var iterations = 0 let initialStartTime = BenchmarkClock.now - // 'Warmup' to remove 
initial mallocs from stats in p100 - _ = MallocStatsProducer.makeMallocStats() // baselineMallocStats - // Calculate typical sys call check overhead and deduct that to get 'clean' stats for the actual benchmark var operatingSystemStatsOverhead = OperatingSystemStats() var baselinePeakMemoryResidentDelta = 0 @@ -154,7 +160,11 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length #endif if mallocStatsRequested { + #if canImport(MallocInterposerSwift) + startMallocStats = MallocInterposerSwift.getStatistics() + #else startMallocStats = MallocStatsProducer.makeMallocStats() + #endif } if arcStatsRequested { @@ -191,7 +201,11 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length } if mallocStatsRequested { + #if canImport(MallocInterposerSwift) + stopMallocStats = MallocInterposerSwift.getStatistics() + #else stopMallocStats = MallocStatsProducer.makeMallocStats() + #endif } #if canImport(OSLog) @@ -239,21 +253,43 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length } if mallocStatsRequested { - delta = stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal - statistics[BenchmarkMetric.mallocCountTotal.index].add(Int(delta)) + #if canImport(MallocInterposerSwift) + // allocatedResidentMemory and the legacy memoryLeaked metric are intentionally + // not populated on the interposer path: the interposer cannot measure the + // allocator's resident set. Use mallocBytesCount / memoryLeakedBytes for + // requested-byte accounting, mallocFreeDelta for allocation-count delta, or + // peakMemoryResident for OS-sampled resident memory. The leak/scaling arithmetic + // lives in BenchmarkExecutor.mallocStatistics(...) so it can be unit-tested + // without a live interposer. 
+                let mallocMetrics = BenchmarkExecutor.mallocStatistics(
+                    mallocCountDelta: stopMallocStats.mallocCount - startMallocStats.mallocCount,
+                    mallocBytesDelta: stopMallocStats.mallocBytesCount - startMallocStats.mallocBytesCount,
+                    mallocSmallDelta: stopMallocStats.mallocSmallCount - startMallocStats.mallocSmallCount,
+                    mallocLargeDelta: stopMallocStats.mallocLargeCount - startMallocStats.mallocLargeCount,
+                    freeCountDelta: stopMallocStats.freeCount - startMallocStats.freeCount,
+                    freeBytesDelta: stopMallocStats.freeBytesCount - startMallocStats.freeBytesCount
+                )
+                for (metric, value) in mallocMetrics {
+                    statistics[metric.index].add(value) // one sample per interposer-backed metric for this window
+                }
+                #else
+                let mallocCountTotal = stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal
+                statistics[BenchmarkMetric.mallocCountTotal.index].add(mallocCountTotal)
 
-                delta = stopMallocStats.mallocCountSmall - startMallocStats.mallocCountSmall
-                statistics[BenchmarkMetric.mallocCountSmall.index].add(Int(delta))
+                let allocatedResidentMemory = stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory // window delta, feeds memoryLeaked
+                statistics[BenchmarkMetric.allocatedResidentMemory.index].add(stopMallocStats.allocatedResidentMemory) // absolute value, as before this change
 
-                delta = stopMallocStats.mallocCountLarge - startMallocStats.mallocCountLarge
-                statistics[BenchmarkMetric.mallocCountLarge.index].add(Int(delta))
+                // jemalloc has no free counter, so memoryLeaked keeps the legacy resident-byte
+                // growth definition. The interposer backend uses mallocFreeDelta for
+                // malloc-minus-free count and memoryLeakedBytes for requested-byte delta.
+ statistics[BenchmarkMetric.memoryLeaked.index].add(max(0, allocatedResidentMemory)) - delta = stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory - statistics[BenchmarkMetric.memoryLeaked.index].add(Int(delta)) + let mallocSmallCount = stopMallocStats.mallocCountSmall - startMallocStats.mallocCountSmall + statistics[BenchmarkMetric.mallocCountSmall.index].add(mallocSmallCount) - // delta = stopMallocStats.allocatedResidentMemory - baselineMallocStats.allocatedResidentMemory // baselineMallocStats! - statistics[BenchmarkMetric.allocatedResidentMemory.index] - .add(Int(stopMallocStats.allocatedResidentMemory)) + let mallocLargeCount = stopMallocStats.mallocCountLarge - startMallocStats.mallocCountLarge + statistics[BenchmarkMetric.mallocCountLarge.index].add(mallocLargeCount) + #endif } if operatingSystemStatsRequested { @@ -335,6 +371,12 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length ARCStatsProducer.hook() } + if mallocStatsRequested { + #if canImport(MallocInterposerSwift) + MallocInterposerSwift.hook() + #endif + } + if benchmark.configuration.metrics.contains(.threads) || benchmark.configuration.metrics.contains(.threadsRunning) || benchmark.configuration.metrics.contains(.peakMemoryResident) @@ -425,6 +467,12 @@ struct BenchmarkExecutor { // swiftlint:disable:this type_body_length ARCStatsProducer.unhook() } + if mallocStatsRequested { + #if canImport(MallocInterposerSwift) + MallocInterposerSwift.unhook() + #endif + } + #if canImport(OSLog) signPost.endInterval("Benchmark", benchmarkInterval, "\(iterations)") #endif diff --git a/Sources/Benchmark/BenchmarkMetric+Defaults.swift b/Sources/Benchmark/BenchmarkMetric+Defaults.swift index 6ec46b88..8828adbc 100644 --- a/Sources/Benchmark/BenchmarkMetric+Defaults.swift +++ b/Sources/Benchmark/BenchmarkMetric+Defaults.swift @@ -30,43 +30,84 @@ public extension BenchmarkMetric { /// There is also an convenience extension on Array defined such that you can write 
just `.default` rather than `BenchmarkMetric.default` /// static var `default`: [BenchmarkMetric] { - [ + var metrics: [BenchmarkMetric] = [ .wallClock, .cpuTotal, .mallocCountTotal, + ] + #if canImport(MallocInterposerSwift) + metrics += [ + .freeCountTotal, + .mallocBytesCount, + .mallocFreeDelta, + .memoryLeakedBytes, + ] + #else + metrics += [ + .memoryLeaked, + ] + #endif + metrics += [ .throughput, .instructions, .peakMemoryResident, ] + return metrics } /// A collection of extended system benchmarks. static var extended: [BenchmarkMetric] { - [ + var metrics: [BenchmarkMetric] = [ .wallClock, .cpuUser, .cpuTotal, .mallocCountTotal, + ] + #if canImport(MallocInterposerSwift) + metrics += [ + .freeCountTotal, + .mallocBytesCount, + .mallocFreeDelta, + .memoryLeakedBytes, + ] + #else + metrics += [ + .memoryLeaked, + ] + #endif + metrics += [ .throughput, .peakMemoryResident, - .memoryLeaked, .syscalls, .instructions, ] + return metrics } /// A collection of memory benchmarks. static var memory: [BenchmarkMetric] { - [ + var metrics: [BenchmarkMetric] = [ .peakMemoryResident, .peakMemoryResidentDelta, .peakMemoryVirtual, .mallocCountSmall, .mallocCountLarge, .mallocCountTotal, + ] + #if canImport(MallocInterposerSwift) + metrics += [ + .mallocBytesCount, + .freeCountTotal, + .mallocFreeDelta, + .memoryLeakedBytes, + ] + #else + metrics += [ .memoryLeaked, .allocatedResidentMemory, ] + #endif + return metrics } /// A collection of ARC metrics @@ -117,7 +158,11 @@ public extension BenchmarkMetric { .mallocCountSmall, .mallocCountLarge, .mallocCountTotal, + .freeCountTotal, + .mallocBytesCount, + .mallocFreeDelta, .memoryLeaked, + .memoryLeakedBytes, .syscalls, .contextSwitches, .threads, diff --git a/Sources/Benchmark/BenchmarkMetric.swift b/Sources/Benchmark/BenchmarkMetric.swift index b5d06096..13ea5128 100644 --- a/Sources/Benchmark/BenchmarkMetric.swift +++ b/Sources/Benchmark/BenchmarkMetric.swift @@ -32,16 +32,43 @@ public enum BenchmarkMetric: Hashable, 
Equatable, Codable, CustomStringConvertib /// Measure virtual memory usage - sampled during runtime case peakMemoryVirtual /// Number of small malloc calls + /// + /// The small/large split is backend-dependent: the jemalloc backend (Swift ≤6.2) splits on + /// jemalloc's size classes, while the 6.3+ interposer backend splits on a coarser + /// `requested size > page size` threshold. case mallocCountSmall /// Number of large malloc calls + /// + /// The backend-specific counterpart to ``mallocCountSmall``. case mallocCountLarge - /// Number of small+large mallocs + /// Number of total malloc calls case mallocCountTotal + /// Number of total free calls + case freeCountTotal + /// The amount of memory allocated in bytes through malloc calls + case mallocBytesCount + /// Net unfreed allocation count within the measured region. + /// + /// Reports `malloc` count minus `free` count from the interposer backend. Because counting is + /// process-global, this metric is only reliable for single-threaded benchmarks with quiescent + /// background allocation. + case mallocFreeDelta /// The amount of allocated resident memory according to the memory allocator - /// by the application (does not include metadata overhead etc) + /// by the application (does not include metadata overhead etc). + /// + /// > Deprecated: Only produced by the jemalloc backend (Swift ≤6.2). The 6.3+ + /// > interposer backend does not measure resident memory — use ``mallocBytesCount`` + /// > for gross allocated bytes, or ``peakMemoryResident`` for OS-sampled resident memory. + @available(*, deprecated, message: "Only produced by the jemalloc backend; use mallocBytesCount or peakMemoryResident") case allocatedResidentMemory - /// Number of small+large mallocs - small+large frees in resident memory + /// Legacy jemalloc resident-byte growth within the measured region. + /// + /// Only produced by the jemalloc backend (Swift ≤6.2). 
The 6.3+ interposer backend does not + /// produce this metric; use ``mallocFreeDelta`` for allocation-count delta or + /// ``memoryLeakedBytes`` for requested-byte delta. case memoryLeaked + /// Net unfreed requested bytes within the measured region. + case memoryLeakedBytes /// Measure number of syscalls made during the test case syscalls /// Measure number of context switches made during the test @@ -120,7 +147,8 @@ public extension BenchmarkMetric { switch self { case .cpuSystem, .cpuTotal, .cpuUser, .wallClock: return true - case .mallocCountLarge, .mallocCountSmall, .mallocCountTotal, .memoryLeaked: + case .mallocCountSmall, .mallocCountLarge, .mallocCountTotal, .freeCountTotal, .mallocFreeDelta, + .mallocBytesCount, .memoryLeaked, .memoryLeakedBytes: return true case .syscalls: return true @@ -132,7 +160,7 @@ public extension BenchmarkMetric { return true case .objectAllocCount, .retainCount, .releaseCount, .retainReleaseDelta: return true - case let .custom(_, _, useScaleFactor): + case .custom(_, _, let useScaleFactor): return useScaleFactor default: return false @@ -144,7 +172,7 @@ public extension BenchmarkMetric { switch self { case .throughput: return .prefersLarger - case let .custom(_, polarity, _): + case .custom(_, let polarity, _): return polarity default: return .prefersSmaller @@ -175,10 +203,16 @@ public extension BenchmarkMetric { return "Malloc (large)" case .mallocCountTotal: return "Malloc (total)" + case .mallocBytesCount: + return "Malloc (bytes total)" + case .mallocFreeDelta: + return "Malloc / free Δ" case .allocatedResidentMemory: return "Memory (allocated resident)" case .memoryLeaked: - return "Malloc / free Δ" + return "Memory leaked (resident)" + case .memoryLeakedBytes: + return "Malloc / free Δ (bytes)" case .syscalls: return "Syscalls (total)" case .contextSwitches: @@ -213,8 +247,10 @@ public extension BenchmarkMetric { return "Δ" case .deltaPercentage: return "Δ %" - case let .custom(name, _, _): + case .custom(let name, _, 
_): return name + case .freeCountTotal: + return "Free (total)" } } @@ -244,47 +280,55 @@ public extension BenchmarkMetric { return 10 case .mallocCountTotal: return 11 - case .allocatedResidentMemory: + case .freeCountTotal: return 12 - case .memoryLeaked: + case .mallocBytesCount: return 13 - case .syscalls: + case .allocatedResidentMemory: return 14 - case .contextSwitches: + case .memoryLeaked: return 15 - case .threads: + case .memoryLeakedBytes: return 16 - case .threadsRunning: + case .syscalls: return 17 - case .readSyscalls: + case .contextSwitches: return 18 - case .writeSyscalls: + case .threads: return 19 - case .readBytesLogical: + case .threadsRunning: return 20 - case .writeBytesLogical: + case .readSyscalls: return 21 - case .readBytesPhysical: + case .writeSyscalls: return 22 - case .writeBytesPhysical: + case .readBytesLogical: return 23 - case .objectAllocCount: + case .writeBytesLogical: return 24 - case .retainCount: + case .readBytesPhysical: return 25 - case .releaseCount: + case .writeBytesPhysical: return 26 - case .retainReleaseDelta: + case .objectAllocCount: return 27 - case .instructions: + case .retainCount: return 28 + case .releaseCount: + return 29 + case .retainReleaseDelta: + return 30 + case .instructions: + return 31 + case .mallocFreeDelta: + return 32 default: return 0 // custom payloads must be stored in dictionary } } @_documentation(visibility: internal) - static var maxIndex: Int { 28 } // + static var maxIndex: Int { 32 } // // Used by the Benchmark Executor for efficient indexing into results @_documentation(visibility: internal) @@ -313,39 +357,47 @@ public extension BenchmarkMetric { case 11: return .mallocCountTotal case 12: - return .allocatedResidentMemory + return .freeCountTotal case 13: - return .memoryLeaked + return .mallocBytesCount case 14: - return .syscalls + return .allocatedResidentMemory case 15: - return .contextSwitches + return .memoryLeaked case 16: - return .threads + return .memoryLeakedBytes case 
17: - return .threadsRunning + return .syscalls case 18: - return .readSyscalls + return .contextSwitches case 19: - return .writeSyscalls + return .threads case 20: - return .readBytesLogical + return .threadsRunning case 21: - return .writeBytesLogical + return .readSyscalls case 22: - return .readBytesPhysical + return .writeSyscalls case 23: - return .writeBytesPhysical + return .readBytesLogical case 24: - return .objectAllocCount + return .writeBytesLogical case 25: - return .retainCount + return .readBytesPhysical case 26: - return .releaseCount + return .writeBytesPhysical case 27: - return .retainReleaseDelta + return .objectAllocCount case 28: + return .retainCount + case 29: + return .releaseCount + case 30: + return .retainReleaseDelta + case 31: return .instructions + case 32: + return .mallocFreeDelta default: break } @@ -379,10 +431,18 @@ public extension BenchmarkMetric { return "mallocCountLarge" case .mallocCountTotal: return "mallocCountTotal" + case .freeCountTotal: + return "freeCountTotal" + case .mallocBytesCount: + return "mallocBytesCount" + case .mallocFreeDelta: + return "mallocFreeDelta" case .allocatedResidentMemory: return "allocatedResidentMemory" case .memoryLeaked: return "memoryLeaked" + case .memoryLeakedBytes: + return "memoryLeakedBytes" case .syscalls: return "syscalls" case .contextSwitches: @@ -417,7 +477,7 @@ public extension BenchmarkMetric { return "Δ" case .deltaPercentage: return "Δ %" - case let .custom(name, _, _): + case .custom(let name, _, _): return name } } @@ -451,10 +511,18 @@ public extension BenchmarkMetric { self = BenchmarkMetric.mallocCountLarge case "mallocCountTotal": self = BenchmarkMetric.mallocCountTotal + case "freeCountTotal": + self = BenchmarkMetric.freeCountTotal + case "mallocBytesCount": + self = BenchmarkMetric.mallocBytesCount + case "mallocFreeDelta": + self = BenchmarkMetric.mallocFreeDelta case "allocatedResidentMemory": self = BenchmarkMetric.allocatedResidentMemory case "memoryLeaked": 
self = BenchmarkMetric.memoryLeaked + case "memoryLeakedBytes": + self = BenchmarkMetric.memoryLeakedBytes case "syscalls": self = BenchmarkMetric.syscalls case "contextSwitches": diff --git a/Sources/Benchmark/BenchmarkRunner.swift b/Sources/Benchmark/BenchmarkRunner.swift index f4b7c2b0..99d1171c 100644 --- a/Sources/Benchmark/BenchmarkRunner.swift +++ b/Sources/Benchmark/BenchmarkRunner.swift @@ -10,6 +10,9 @@ import ArgumentParser import BenchmarkShared +#if canImport(MallocInterposerSwift) +import MallocInterposerSwift +#endif #if os(Linux) && compiler(>=6.3) && canImport(SwiftRuntimeInterposerSwift) import SwiftRuntimeInterposerSwift #endif @@ -117,6 +120,9 @@ public struct BenchmarkRunner: AsyncParsableCommand, BenchmarkRunnerReadWrite { var debugIterator = Benchmark.benchmarks.makeIterator() var benchmarkCommand: BenchmarkCommandRequest + #if canImport(MallocInterposerSwift) + MallocInterposerSwift.initialize() + #endif #if os(Linux) && compiler(>=6.3) && canImport(SwiftRuntimeInterposerSwift) SwiftRuntimeInterposerSwift.initialize() #endif @@ -155,7 +161,7 @@ public struct BenchmarkRunner: AsyncParsableCommand, BenchmarkRunnerReadWrite { } try channel.write(.end) - case let .run(benchmarkToRun): + case .run(let benchmarkToRun): benchmark = Benchmark.benchmarks.first { $0.name == benchmarkToRun.name } if let benchmark { diff --git a/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md b/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md index ad9a7ab2..5413bfde 100644 --- a/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md +++ b/Sources/Benchmark/Documentation.docc/BenchmarkMetric.md @@ -27,6 +27,10 @@ - ``BenchmarkMetric/wallClock`` - ``BenchmarkMetric/cpuTotal`` - ``BenchmarkMetric/mallocCountTotal`` +- ``BenchmarkMetric/freeCountTotal`` +- ``BenchmarkMetric/mallocBytesCount`` +- ``BenchmarkMetric/mallocFreeDelta`` +- ``BenchmarkMetric/memoryLeakedBytes`` - ``BenchmarkMetric/throughput`` - ``BenchmarkMetric/peakMemoryResident`` - 
``BenchmarkMetric/memoryLeaked`` @@ -41,7 +45,11 @@ - ``BenchmarkMetric/mallocCountSmall`` - ``BenchmarkMetric/mallocCountLarge`` - ``BenchmarkMetric/mallocCountTotal`` +- ``BenchmarkMetric/freeCountTotal`` +- ``BenchmarkMetric/mallocBytesCount`` +- ``BenchmarkMetric/mallocFreeDelta`` - ``BenchmarkMetric/memoryLeaked`` +- ``BenchmarkMetric/memoryLeakedBytes`` - ``BenchmarkMetric/allocatedResidentMemory`` ### Reference Counting (retain/release) diff --git a/Sources/Benchmark/Documentation.docc/Metrics.md b/Sources/Benchmark/Documentation.docc/Metrics.md index a5b790bf..71ce4ad1 100644 --- a/Sources/Benchmark/Documentation.docc/Metrics.md +++ b/Sources/Benchmark/Documentation.docc/Metrics.md @@ -18,11 +18,15 @@ Currently supported metrics are: - term `peakMemoryResident`: The resident memory usage - sampled during runtime - term `peakMemoryResidentDelta`: The resident memory usage - sampled during runtime (excluding start of benchmark baseline) - term `peakMemoryVirtual`: The virtual memory usage - sampled during runtime -- term `mallocCountSmall`: The number of small malloc calls according to jemalloc -- term `mallocCountLarge`: The number of large malloc calls according to jemalloc -- term `mallocCountTotal`: The total number of mallocs according to jemalloc +- term `mallocCountSmall`: The number of small malloc calls according to the active malloc backend +- term `mallocCountLarge`: The number of large malloc calls according to the active malloc backend +- term `mallocCountTotal`: The total number of malloc calls according to the active malloc backend +- term `freeCountTotal`: The total number of free calls according to the interposer backend +- term `mallocBytesCount`: The total requested bytes allocated through malloc calls according to the interposer backend +- term `mallocFreeDelta`: The number of malloc calls minus free calls according to the interposer backend - term `allocatedResidentMemory`: The amount of allocated resident memory by the application (not 
including allocator metadata overhead etc) according to jemalloc -- term `memoryLeaked`: The number of small+large mallocs - small+large frees in resident memory (just a possible leak) +- term `memoryLeaked`: Legacy jemalloc resident-byte growth within the measured region +- term `memoryLeakedBytes`: The requested bytes allocated minus requested bytes freed according to the interposer backend - term `syscalls`: The number of syscalls made during the test -- macOS only - term `contextSwitches`: The number of context switches made during the test -- macOS only - term `threads`: The maximum number of threads in the process under the test (not exact, sampled) diff --git a/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md b/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md index cb48bffc..45482407 100644 --- a/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md +++ b/Sources/Benchmark/Documentation.docc/RunningBenchmarks.md @@ -91,8 +91,9 @@ OPTIONS: Benchmark targets matching the regexp filter that should be skipped --format The output format to use, default is 'text' (values: text, markdown, influx, jmh, histogramEncoded, histogram, histogramSamples, histogramPercentiles, metricP90AbsoluteThresholds) --metric Specifies that the benchmark run should use one or more specific metrics instead of the ones defined by the benchmarks. 
(values: cpuUser, cpuSystem, cpuTotal, wallClock, throughput, -peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, allocatedResidentMemory, memoryLeaked, syscalls, contextSwitches, threads, -threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions, retainCount, releaseCount, retainReleaseDelta, custom) +peakMemoryResident, peakMemoryResidentDelta, peakMemoryVirtual, mallocCountSmall, mallocCountLarge, mallocCountTotal, freeCountTotal, mallocBytesCount, mallocFreeDelta, allocatedResidentMemory, +memoryLeaked, memoryLeakedBytes, syscalls, contextSwitches, threads, threadsRunning, readSyscalls, writeSyscalls, readBytesLogical, writeBytesLogical, readBytesPhysical, writeBytesPhysical, instructions, +retainCount, releaseCount, retainReleaseDelta, custom) --path The path to operate on for data export or threshold operations, default is the current directory (".") for exports and the ("./Thresholds") directory for thresholds. 
--quiet Specifies that output should be suppressed (useful for if you just want to check return code) --scale Specifies that some of the text output should be scaled using the scalingFactor (denoted by '*' in output) diff --git a/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift b/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift index 38e34761..ed2dd8de 100644 --- a/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift +++ b/Sources/Benchmark/MallocStats/MallocStats+jemalloc-support.swift @@ -15,6 +15,8 @@ // let optionString = "J" // malloc_stats_print(nil, nil, optionString) +#if canImport(jemalloc) + // MARK: - Pokedex struct Pokedex: Codable { @@ -361,3 +363,5 @@ struct MergedLextent: Codable { } // swiftlint:enable all + +#endif diff --git a/Tests/BenchmarkTests/BenchmarkMetricsTests.swift b/Tests/BenchmarkTests/BenchmarkMetricsTests.swift index e3822010..e5f82a8f 100644 --- a/Tests/BenchmarkTests/BenchmarkMetricsTests.swift +++ b/Tests/BenchmarkTests/BenchmarkMetricsTests.swift @@ -22,11 +22,13 @@ final class BenchmarkMetricsTests: XCTestCase { .peakMemoryResident, .peakMemoryResidentDelta, .peakMemoryVirtual, - .mallocCountSmall, - .mallocCountLarge, .mallocCountTotal, + .mallocBytesCount, + .freeCountTotal, + .mallocFreeDelta, .allocatedResidentMemory, .memoryLeaked, + .memoryLeakedBytes, .syscalls, .contextSwitches, .threads, @@ -55,11 +57,13 @@ final class BenchmarkMetricsTests: XCTestCase { "peakMemoryResident", "peakMemoryResidentDelta", "peakMemoryVirtual", - "mallocCountSmall", - "mallocCountLarge", "mallocCountTotal", + "mallocBytesCount", + "freeCountTotal", + "mallocFreeDelta", "allocatedResidentMemory", "memoryLeaked", + "memoryLeakedBytes", "syscalls", "contextSwitches", "threads", diff --git a/Tests/BenchmarkTests/MallocStatisticsTests.swift b/Tests/BenchmarkTests/MallocStatisticsTests.swift new file mode 100644 index 00000000..bd7718e9 --- /dev/null +++ b/Tests/BenchmarkTests/MallocStatisticsTests.swift 
@@ -0,0 +1,116 @@ +// +// Copyright (c) 2026 Ordo One AB. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// + +import XCTest + +@testable import Benchmark + +/// Unit coverage for the interposer malloc-metric arithmetic and the malloc-metric scaling +/// configuration. These exercise `BenchmarkExecutor.mallocStatistics(...)` directly with +/// synthetic counter deltas, so no live interposer / allocation is required. +final class MallocStatisticsTests: XCTestCase { + private func value( + _ metrics: [(metric: BenchmarkMetric, value: Int)], + _ wanted: BenchmarkMetric + ) -> Int? { + metrics.first { $0.metric == wanted }?.value + } + + func testBalancedAllocFreeReportsNoLeak() { + let metrics = BenchmarkExecutor.mallocStatistics( + mallocCountDelta: 10, mallocBytesDelta: 1_024, + mallocSmallDelta: 8, mallocLargeDelta: 2, + freeCountDelta: 10, freeBytesDelta: 1_024 + ) + XCTAssertEqual(value(metrics, .mallocCountTotal), 10) + XCTAssertEqual(value(metrics, .freeCountTotal), 10) + XCTAssertEqual(value(metrics, .mallocBytesCount), 1_024) + XCTAssertEqual(value(metrics, .mallocFreeDelta), 0) + XCTAssertEqual(value(metrics, .memoryLeakedBytes), 0) + XCTAssertNil(value(metrics, .memoryLeaked), "interposer stats must not emit the legacy jemalloc memoryLeaked metric") + } + + func testUnbalancedAllocReportsLeak() { + let metrics = BenchmarkExecutor.mallocStatistics( + mallocCountDelta: 10, mallocBytesDelta: 2_048, + mallocSmallDelta: 7, mallocLargeDelta: 3, + freeCountDelta: 6, freeBytesDelta: 1_024 + ) + XCTAssertEqual(value(metrics, .mallocFreeDelta), 4) // 10 mallocs - 6 frees + XCTAssertEqual(value(metrics, .memoryLeakedBytes), 1_024) // 2048 - 1024 + } + + /// A window that frees more than it allocates (e.g. 
freeing a warmup survivor or cross-thread + /// frees) must clamp the leak to 0 — not go negative (which `Statistics.add` would silently + /// drop, desyncing the sample count and biasing the average upward). + func testNetFreeWindowClampsLeakToZero() { + let metrics = BenchmarkExecutor.mallocStatistics( + mallocCountDelta: 3, mallocBytesDelta: 256, + mallocSmallDelta: 3, mallocLargeDelta: 0, + freeCountDelta: 5, freeBytesDelta: 4_096 + ) + XCTAssertEqual(value(metrics, .mallocFreeDelta), 0) + XCTAssertEqual(value(metrics, .memoryLeakedBytes), 0) + } + + /// `mallocStatistics` is a pure mapping: each counter delta must land in its own metric slot + /// unchanged, so a mis-routing of any single delta fails distinctly. (The `small + large == total` + /// invariant is a property of the interposer's counters, not of this function, so it cannot be + /// asserted at this layer.) + func testDeltasRouteToCorrectMetricSlots() { + let metrics = BenchmarkExecutor.mallocStatistics( + mallocCountDelta: 10, mallocBytesDelta: 100, + mallocSmallDelta: 6, mallocLargeDelta: 4, + freeCountDelta: 3, freeBytesDelta: 48 + ) + XCTAssertEqual(value(metrics, .mallocCountTotal), 10) + XCTAssertEqual(value(metrics, .mallocCountSmall), 6) + XCTAssertEqual(value(metrics, .mallocCountLarge), 4) + XCTAssertEqual(value(metrics, .mallocBytesCount), 100) + XCTAssertEqual(value(metrics, .freeCountTotal), 3) + XCTAssertEqual(value(metrics, .mallocFreeDelta), 7) + } + + /// The whole per-iteration malloc count/byte family must scale together, otherwise the scaled + /// output is internally inconsistent (e.g. `small + large != total`, or bytes not comparable + /// to free) under a non-unit `scalingFactor`. 
+ func testMallocFamilyScalesConsistently() { + let scaledFamily: [BenchmarkMetric] = [ + .mallocCountSmall, .mallocCountLarge, .mallocCountTotal, + .freeCountTotal, .mallocBytesCount, .mallocFreeDelta, .memoryLeakedBytes, + ] + for metric in scaledFamily { + XCTAssertTrue( + metric.useScalingFactor, + "\(metric.rawDescription) must scale with the rest of the malloc family" + ) + } + } + + func testDefaultMetricsUseBackendSpecificLeakMetrics() { + #if canImport(MallocInterposerSwift) + XCTAssertTrue(BenchmarkMetric.default.contains(.mallocFreeDelta)) + XCTAssertTrue(BenchmarkMetric.default.contains(.memoryLeakedBytes)) + XCTAssertFalse( + BenchmarkMetric.default.contains(.memoryLeaked), + "interposer defaults must not emit legacy jemalloc memoryLeaked" + ) + #else + XCTAssertTrue(BenchmarkMetric.default.contains(.memoryLeaked)) + XCTAssertFalse(BenchmarkMetric.default.contains(.mallocFreeDelta)) + #endif + } + + /// Metric array slots must be unique so two metrics never collide on the same `statistics` slot. 
+ func testMetricIndicesAreUnique() { + let indices = BenchmarkMetric.all.map(\.index) + XCTAssertEqual(Set(indices).count, indices.count, "metric indices must be unique") + } +} diff --git a/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift b/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift index c5249d37..63667f7c 100644 --- a/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift +++ b/Tests/BenchmarkTests/OperatingSystemAndMallocTests.swift @@ -62,24 +62,6 @@ final class OperatingSystemAndMallocTests: XCTestCase { blackHole(operatingSystemStatsProducer.metricSupported(.throughput)) } - #if canImport(jemalloc) - func testMallocProducerLeaks() throws { - let startMallocStats = MallocStatsProducer.makeMallocStats() - - for outerloop in 1...100 { - blackHole(malloc(outerloop * 1_024)) - } - - let stopMallocStats = MallocStatsProducer.makeMallocStats() - - XCTAssertGreaterThanOrEqual(stopMallocStats.mallocCountTotal - startMallocStats.mallocCountTotal, 100) - XCTAssertGreaterThanOrEqual( - stopMallocStats.allocatedResidentMemory - startMallocStats.allocatedResidentMemory, - 100 * 1_024 - ) - } - #endif - func testARCStatsProducer() throws { let array = [3] ARCStatsProducer.hook() diff --git a/scripts/bench_malloc.c b/scripts/bench_malloc.c new file mode 100644 index 00000000..0af0de62 --- /dev/null +++ b/scripts/bench_malloc.c @@ -0,0 +1,128 @@ +// bench_malloc.c — standalone wallclock benchmark for malloc/free patterns. +// +// Build once, run twice (with and without jemalloc injected via +// DYLD_INSERT_LIBRARIES on macOS / LD_PRELOAD on Linux). See +// scripts/bench_malloc.sh. +// +// Each benchmark runs an inner loop N times; we run K trials of that and +// report min / median / max ns per op so noise is visible. + +#include +#include +#include +#include + +#define WARMUP_ITERS 1000 +#define TRIALS 9 // odd → median is a single sample + +// Volatile sink prevents the compiler from optimizing alloc/free pairs away. 
+static volatile void *sink; + +static double now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (double)ts.tv_sec * 1e9 + (double)ts.tv_nsec; +} + +static int cmp_double(const void *a, const void *b) { + double da = *(const double *)a, db = *(const double *)b; + return (da > db) - (da < db); +} + +// ---- benchmark bodies ---- + +#define DEFINE_BENCH(NAME, BODY) \ + static void bench_##NAME(int iters) { \ + for (int _i = 0; _i < iters; _i++) { \ + BODY \ + } \ + } + +DEFINE_BENCH(malloc_64, { + void *p = malloc(64); sink = p; free(p); +}) + +DEFINE_BENCH(malloc_2mb, { + void *p = malloc(2 * 1024 * 1024); sink = p; free(p); +}) + +DEFINE_BENCH(calloc_8x8, { + void *p = calloc(8, 8); sink = p; free(p); +}) + +DEFINE_BENCH(realloc_grow, { + void *p = malloc(64); + p = realloc(p, 256); + sink = p; + free(p); +}) + +DEFINE_BENCH(realloc_null, { + void *p = realloc(NULL, 128); sink = p; free(p); +}) + +DEFINE_BENCH(posix_memalign_1k, { + void *p = NULL; + (void)posix_memalign(&p, 64, 1024); + sink = p; + free(p); +}) + +DEFINE_BENCH(malloc_x16, { + void *ptrs[16]; + for (int i = 0; i < 16; i++) ptrs[i] = malloc(48); + sink = ptrs[0]; // defeat clang's malloc/free elision at -O2 + for (int i = 0; i < 16; i++) free(ptrs[i]); +}) + +// ---- runner ---- + +typedef void (*bench_fn)(int); + +typedef struct { + const char *name; + bench_fn fn; + int inner; // iterations inside one trial +} bench_t; + +#define B(NAME, INNER) { #NAME, bench_##NAME, INNER } + +static const bench_t benchmarks[] = { + B(malloc_64, 1000000), + B(calloc_8x8, 1000000), + B(realloc_null, 1000000), + B(realloc_grow, 500000), + B(posix_memalign_1k, 1000000), + B(malloc_x16, 200000), + B(malloc_2mb, 10000), +}; + +int main(void) { + const char *label = getenv("BENCH_LABEL"); + if (!label) label = "(no label)"; + + printf("== %s ==\n", label); + printf("%-22s %12s %12s %12s\n", "benchmark", "min ns/op", "median ns/op", "max ns/op"); + printf("%-22s %12s %12s %12s\n", 
"---------", "---------", "------------", "---------"); + + size_t n = sizeof(benchmarks) / sizeof(benchmarks[0]); + for (size_t i = 0; i < n; i++) { + const bench_t *b = &benchmarks[i]; + + // Warmup + b->fn(WARMUP_ITERS); + + double trials[TRIALS]; + for (int t = 0; t < TRIALS; t++) { + double t0 = now_ns(); + b->fn(b->inner); + double t1 = now_ns(); + trials[t] = (t1 - t0) / (double)b->inner; + } + qsort(trials, TRIALS, sizeof(double), cmp_double); + + printf("%-22s %12.2f %12.2f %12.2f\n", + b->name, trials[0], trials[TRIALS / 2], trials[TRIALS - 1]); + } + return 0; +} diff --git a/scripts/bench_malloc.sh b/scripts/bench_malloc.sh new file mode 100755 index 00000000..d8a8f58a --- /dev/null +++ b/scripts/bench_malloc.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# +# bench_malloc.sh — build scripts/bench_malloc.c once and run it twice: +# under the system allocator and under jemalloc. Uses runtime injection +# (DYLD_INSERT_LIBRARIES on macOS, LD_PRELOAD on Linux), so there's no +# link-time difference between the two runs. +# +# Pre-requisites: +# - macOS: `brew install jemalloc` (or override JEMALLOC_LIB) +# - Linux: jemalloc installed (e.g. `apt install libjemalloc2`) +# +# Usage: +# ./scripts/bench_malloc.sh +# +# Env overrides: +# JEMALLOC_LIB path to libjemalloc.{dylib,so}; auto-detected if unset. +# CC compiler; defaults to clang, then gcc, then cc. +# CFLAGS extra cflags; defaults to "-O2 -Wall -Wextra". + +set -euo pipefail + +# Use clang explicitly — `cc` is aliased to other things in many shells. 
+CC="${CC:-$(command -v clang || command -v gcc || echo cc)}" +CFLAGS="${CFLAGS:--O2 -Wall -Wextra}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SRC="${SCRIPT_DIR}/bench_malloc.c" +BIN="$(mktemp -t bench_malloc.XXXXXX)" +trap 'rm -f "$BIN"' EXIT + +step() { printf '\n\033[1;36m== %s\033[0m\n' "$*"; } +fail() { printf '\033[31m## %s\033[0m\n' "$*" >&2; exit 1; } + +# --- locate jemalloc --- +if [[ -z "${JEMALLOC_LIB:-}" ]]; then + case "$(uname -s)" in + Darwin) + for cand in \ + /opt/homebrew/opt/jemalloc/lib/libjemalloc.2.dylib \ + /opt/homebrew/opt/jemalloc/lib/libjemalloc.dylib \ + /usr/local/opt/jemalloc/lib/libjemalloc.2.dylib \ + /usr/local/opt/jemalloc/lib/libjemalloc.dylib; do + if [[ -f "$cand" ]]; then JEMALLOC_LIB="$cand"; break; fi + done + ;; + Linux) + for cand in \ + /usr/lib/x86_64-linux-gnu/libjemalloc.so.2 \ + /usr/lib/aarch64-linux-gnu/libjemalloc.so.2 \ + /usr/lib64/libjemalloc.so.2 \ + /usr/lib/libjemalloc.so.2 \ + /usr/lib/x86_64-linux-gnu/libjemalloc.so \ + /usr/lib/libjemalloc.so; do + if [[ -f "$cand" ]]; then JEMALLOC_LIB="$cand"; break; fi + done + ;; + esac +fi +[[ -n "${JEMALLOC_LIB:-}" && -f "$JEMALLOC_LIB" ]] \ + || fail "jemalloc dylib not found — set JEMALLOC_LIB=/path/to/libjemalloc.{dylib,so}" + +# --- build --- +step "Compiling $SRC" +# shellcheck disable=SC2086 +"$CC" $CFLAGS -o "$BIN" "$SRC" + +# --- run system allocator --- +step "Run 1 — system allocator" +BENCH_LABEL="system" "$BIN" + +# --- run with jemalloc injected --- +step "Run 2 — jemalloc (injected: $JEMALLOC_LIB)" +case "$(uname -s)" in + Darwin) + BENCH_LABEL="jemalloc" \ + DYLD_INSERT_LIBRARIES="$JEMALLOC_LIB" \ + DYLD_FORCE_FLAT_NAMESPACE=1 \ + "$BIN" + ;; + Linux) + BENCH_LABEL="jemalloc" \ + LD_PRELOAD="$JEMALLOC_LIB" \ + "$BIN" + ;; + *) + fail "Unsupported platform: $(uname -s)" + ;; +esac diff --git a/scripts/compare-malloc-local.sh b/scripts/compare-malloc-local.sh new file mode 100755 index 00000000..4b3ab492 --- /dev/null +++ 
b/scripts/compare-malloc-local.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash +# +# compare-malloc-local.sh — compare malloc counts between the legacy jemalloc +# path (Swift 6.2 → Package@swift-6.2.swift) and the new custom interposer +# (Swift 6.3 → Package.swift) using THIS repo's local +# `MallocInterposerBenchmarks` target. +# +# These benchmarks have predictable per-iteration allocation counts, so any +# drift between the two code paths is a regression. For "real workload" +# comparison against swift-nio, see compare-malloc.sh instead. +# +# Mechanism: +# 1. Runs `swift package benchmark baseline update <name>` once per +# toolchain via swiftly. SwiftPM picks the right Package*.swift +# manifest for each toolchain automatically. +# 2. Calls `baseline compare` for the two recorded baselines. +# +# Pre-requisites: +# - swiftly with both toolchains installed. +# +# Usage: +# ./scripts/compare-malloc-local.sh [filter ...] +# +# Each positional arg becomes a `--filter` regex. With no args every +# benchmark in the target runs. +# +# Env overrides: +# TOOLCHAIN_OLD default 6.2.4 +# TOOLCHAIN_NEW default 6.3 +# FRESH=1 use timestamp-suffixed scratch dirs (fresh build, no +# cache reuse). Use this when a previous hung/zombie +# process is holding a SwiftPM lock on .build-X and you +# can't kill it. Trade-off: full rebuild each run. +# KEEP_FRESH=1 when FRESH=1, don't auto-delete the scratch dirs at +# exit (default is to clean up on success). + +set -euo pipefail + +PB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +BENCH_DIR="${PB_DIR}/Benchmarks" +TARGET="MallocInterposerBenchmarks" +TOOLCHAIN_OLD="${TOOLCHAIN_OLD:-6.2.4}" +TOOLCHAIN_NEW="${TOOLCHAIN_NEW:-6.3}" +BASELINE_OLD="jemalloc-${TOOLCHAIN_OLD}" +BASELINE_NEW="interposer-${TOOLCHAIN_NEW}" + +# Per-toolchain scratch paths so each toolchain has its own .build cache. 
# Without this, switching toolchains hits "module compiled with Swift X
# cannot be imported by Y" errors on the cached Benchmark.swiftmodule.
#
# If FRESH=1 is set, append a timestamp suffix so this run can't collide
# with a SwiftPM lock held by a previous (possibly hung) process. Trade-off:
# no cache reuse — every run rebuilds from scratch.
case "${FRESH:-0}" in
    1) SCRATCH_SUFFIX="-fresh-$(date +%s)" ;;
    *) SCRATCH_SUFFIX="" ;;
esac
SCRATCH_OLD="${BENCH_DIR}/.build-${TOOLCHAIN_OLD}${SCRATCH_SUFFIX}"
SCRATCH_NEW="${BENCH_DIR}/.build-${TOOLCHAIN_NEW}${SCRATCH_SUFFIX}"

# step MESSAGE... — print a bold cyan section banner to stdout.
step() {
    printf '\n\033[1;36m== %s\033[0m\n' "$*"
}

# warn MESSAGE... — print a yellow warning line to stderr.
warn() {
    printf '\033[33m!! %s\033[0m\n' "$*" >&2
}

# fail MESSAGE... — print a red error line to stderr and exit with status 1.
fail() {
    printf '\033[31m## %s\033[0m\n' "$*" >&2
    exit 1
}

# Preconditions: the benchmark sources must exist and swiftly must be on PATH.
if [[ ! -d "$BENCH_DIR/Benchmarks/MallocInterposer" ]]; then
    fail "MallocInterposer benchmark dir missing — expected $BENCH_DIR/Benchmarks/MallocInterposer"
fi
if ! command -v swiftly >/dev/null; then
    fail "swiftly required"
fi

# When FRESH=1, clean the throwaway scratch dirs on successful exit so they
# don't accumulate. KEEP_FRESH=1 disables this if the user wants to inspect.
if [[ "${FRESH:-0}" == "1" && "${KEEP_FRESH:-0}" != "1" ]]; then
    # EXIT handler: $? must be captured first, before any other command
    # overwrites the script's exit status.
    cleanup_fresh() {
        local rc=$?
        if [[ "$rc" -ne 0 ]]; then
            warn "Run failed (exit $rc); leaving fresh scratch dirs for inspection:"
            warn " $SCRATCH_OLD"
            warn " $SCRATCH_NEW"
        else
            rm -rf "$SCRATCH_OLD" "$SCRATCH_NEW" 2>/dev/null || true
        fi
    }
    trap cleanup_fresh EXIT
fi

cd "$BENCH_DIR"

# Forward any positional args as --filter regexes.
declare -a FILTER_ARGS=()
for pattern in "$@"; do
    FILTER_ARGS+=(--filter "$pattern")
done

# record_baseline TOOLCHAIN SCRATCH_DIR BASELINE_NAME
#
# Runs the benchmark target under the given swiftly toolchain and records the
# result as BASELINE_NAME, building into SCRATCH_DIR.
#
# FILTER_ARGS is expanded with the ${arr[@]+...} guard: with `set -u`, bash
# older than 4.4 (including the 3.2 that macOS ships) treats "${arr[@]}" on an
# EMPTY array as an unbound variable and aborts — which is exactly the default
# "no filter" invocation of this script.
record_baseline() {
    local toolchain="$1" scratch="$2" baseline="$3"
    swiftly run +"$toolchain" \
        swift package \
        --scratch-path "$scratch" \
        --allow-writing-to-package-directory benchmark \
        baseline update "$baseline" \
        --target "$TARGET" \
        --quiet --no-progress \
        ${FILTER_ARGS[@]+"${FILTER_ARGS[@]}"}
}

# Record the baseline for the old toolchain (legacy jemalloc path).
run_jemalloc() {
    step "Run 1: Swift $TOOLCHAIN_OLD (jemalloc) → baseline '$BASELINE_OLD' [scratch: $SCRATCH_OLD]"
    record_baseline "$TOOLCHAIN_OLD" "$SCRATCH_OLD" "$BASELINE_OLD"
}

# Record the baseline for the new toolchain (custom interposer path).
run_interposer() {
    step "Run 2: Swift $TOOLCHAIN_NEW (interposer) → baseline '$BASELINE_NEW' [scratch: $SCRATCH_NEW]"
    record_baseline "$TOOLCHAIN_NEW" "$SCRATCH_NEW" "$BASELINE_NEW"
}

run_jemalloc
run_interposer

# The comparison only reads the two recorded baselines; it reuses the new
# toolchain's scratch dir so nothing has to be rebuilt.
step "Comparison: $BASELINE_OLD vs $BASELINE_NEW"
swiftly run +"$TOOLCHAIN_NEW" \
    swift package \
    --scratch-path "$SCRATCH_NEW" \
    benchmark baseline compare "$BASELINE_OLD" "$BASELINE_NEW" \
    --target "$TARGET"
diff --git a/scripts/wrapper_overhead.c b/scripts/wrapper_overhead.c
new file mode 100644
index 00000000..008f22a7
--- /dev/null
+++ b/scripts/wrapper_overhead.c
@@ -0,0 +1,94 @@
// wrapper_overhead.c — measure the cost of "being a wrapper" in isolation,
// and (optionally) the additional cost of the real interposer's bookkeeping.
//
// Run the same malloc/free hot loop two or three times:
//   1. With nothing preloaded → user code → libc allocator.
//   2. With libwrapper_passthrough.{dylib,so} preloaded → user code → our
//      one-instruction tail-call wrapper → libc allocator.
//      Delta from #1 = wrapper layer cost (no bookkeeping at all).
//   3. (Optional) With the real malloc-interposer preloaded and counting
//      enabled. Delta from #2 = bookkeeping cost (header + magic check +
//      enable check + TLS pointer load + counter writes).
+// +// To enable run #3, set INTERPOSER_DYLIB in the environment to the path of +// the full interposer dylib/so. The harness will dlsym +// `malloc_interposer_enable` and call it at startup so counting is on for +// every measured iteration. +// +// Build + drive: see wrapper_overhead.sh in the same directory. + +#include +#include +#include +#include +#include + +#define WARMUP_ITERS 10000 +#define INNER_ITERS 2000000 +#define TRIALS 9 + +static volatile void *sink; + +static double now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (double)ts.tv_sec * 1e9 + (double)ts.tv_nsec; +} + +static int cmp_double(const void *a, const void *b) { + double da = *(const double *)a, db = *(const double *)b; + return (da > db) - (da < db); +} + +static void measure_pair(const char *name, size_t size) { + // Warmup primes tcache and lets dyld bind any lazy stubs. + for (int i = 0; i < WARMUP_ITERS; i++) { + void *p = malloc(size); + sink = p; + free(p); + } + + double trials[TRIALS]; + for (int t = 0; t < TRIALS; t++) { + double t0 = now_ns(); + for (int i = 0; i < INNER_ITERS; i++) { + void *p = malloc(size); + sink = p; + free(p); + } + trials[t] = (now_ns() - t0) / (double)INNER_ITERS; + } + qsort(trials, TRIALS, sizeof(double), cmp_double); + + printf("%-18s %10.2f %10.2f %10.2f\n", + name, trials[0], trials[TRIALS / 2], trials[TRIALS - 1]); +} + +int main(void) { + const char *label = getenv("BENCH_LABEL"); + if (!label) label = "(no label)"; + + // If the real malloc-interposer is preloaded, flip its counting on so we + // measure the full bookkeeping cost (header + magic check + enable check + // + TLS access + counter writes). dlsym returns NULL for the pass-through + // wrapper and for the plain libc run, which is exactly what we want. 
+ void (*enable_fn)(void) = (void (*)(void))dlsym(RTLD_DEFAULT, + "malloc_interposer_enable"); + void (*reset_fn)(void) = (void (*)(void))dlsym(RTLD_DEFAULT, + "malloc_interposer_reset"); + if (enable_fn) { + if (reset_fn) reset_fn(); + enable_fn(); + fprintf(stderr, "[%s] interposer counting enabled\n", label); + } + + printf("== %s ==\n", label); + printf("%-18s %10s %10s %10s\n", "size", "min ns", "median", "max ns"); + printf("%-18s %10s %10s %10s\n", "----", "------", "------", "------"); + + measure_pair("malloc(64)+free", 64); + measure_pair("malloc(256)+free", 256); + measure_pair("malloc(1024)+free", 1024); + measure_pair("malloc(4096)+free", 4096); + return 0; +} diff --git a/scripts/wrapper_overhead.sh b/scripts/wrapper_overhead.sh new file mode 100755 index 00000000..a01ea412 --- /dev/null +++ b/scripts/wrapper_overhead.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# +# wrapper_overhead.sh — show the irreducible cost of "being a wrapper" in +# isolation, with no header / no counters / no enable check / nothing. +# +# Builds two artifacts: +# - bin/wrapper_overhead the hot-loop bench +# - bin/wrapper_passthrough.{dylib,so} a do-nothing tail-call interposer +# +# Runs the bench twice: +# 1. Plain — user code → libc malloc. +# 2. Wrapped — user code → tail-call wrapper → libc malloc. +# +# Whatever ns delta you see is the price of the extra function-call layer +# alone. Anything you'd build on top (header, counters, enable check) +# stacks on top of that. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BUILD_DIR="$(mktemp -d -t wrapper_overhead.XXXXXX)" +trap 'rm -rf "$BUILD_DIR"' EXIT + +CC="${CC:-$(command -v clang || command -v gcc || echo cc)}" +CFLAGS="${CFLAGS:--O2 -Wall -Wextra}" + +step() { printf '\n\033[1;36m== %s\033[0m\n' "$*"; } +fail() { printf '\033[31m## %s\033[0m\n' "$*" >&2; exit 1; } + +# --- Build --- +step "Building bench harness + pass-through wrapper" +"$CC" $CFLAGS -o "$BUILD_DIR/wrapper_overhead" "$SCRIPT_DIR/wrapper_overhead.c" + +# Collect injection env vars in a bash array so they pass cleanly to `env`. +declare -a INJECT_ENV=() +case "$(uname -s)" in + Darwin) + WRAPPER_LIB="$BUILD_DIR/libwrapper_passthrough.dylib" + "$CC" $CFLAGS -dynamiclib -o "$WRAPPER_LIB" \ + "$SCRIPT_DIR/wrapper_overhead_passthrough.c" + INJECT_ENV+=("DYLD_INSERT_LIBRARIES=$WRAPPER_LIB" "DYLD_FORCE_FLAT_NAMESPACE=1") + ;; + Linux) + WRAPPER_LIB="$BUILD_DIR/libwrapper_passthrough.so" + "$CC" $CFLAGS -fPIC -shared -o "$WRAPPER_LIB" \ + "$SCRIPT_DIR/wrapper_overhead_passthrough.c" -ldl + INJECT_ENV+=("LD_PRELOAD=$WRAPPER_LIB") + ;; + *) + fail "Unsupported platform: $(uname -s)" + ;; +esac + +# --- Run plain --- +step "Run 1 — plain (no wrapper)" +BENCH_LABEL="plain" "$BUILD_DIR/wrapper_overhead" + +# --- Run wrapped --- +step "Run 2 — pass-through wrapper preloaded ($(basename "$WRAPPER_LIB"))" +env BENCH_LABEL="wrapped" "${INJECT_ENV[@]}" "$BUILD_DIR/wrapper_overhead" + +# --- Run full interposer (optional) --- +# If the caller points us at the real malloc-interposer dylib, do a third run +# with counting enabled. Delta from run #2 is the real bookkeeping cost. +if [[ -n "${INTERPOSER_DYLIB:-}" ]]; then + if [[ ! 
-f "$INTERPOSER_DYLIB" ]]; then + fail "INTERPOSER_DYLIB=$INTERPOSER_DYLIB does not exist" + fi + + declare -a FULL_INJECT=() + case "$(uname -s)" in + Darwin) + FULL_INJECT+=("DYLD_INSERT_LIBRARIES=$INTERPOSER_DYLIB" + "DYLD_FORCE_FLAT_NAMESPACE=1") + ;; + Linux) + FULL_INJECT+=("LD_PRELOAD=$INTERPOSER_DYLIB") + ;; + esac + + step "Run 3 — full malloc-interposer preloaded, counting ON" + env BENCH_LABEL="full-interposer" "${FULL_INJECT[@]}" "$BUILD_DIR/wrapper_overhead" +fi + +cat <<'EOF' + +Reading the output: + delta(plain → wrapped) = cost of the wrapper layer alone (no logic). + delta(wrapped → full) = cost of header + magic check + enable check + + TLS pointer + counter writes (the + "bookkeeping" on top of the wrapper). + delta(plain → full) = total interposer overhead vs. raw libc. + +If only runs 1 and 2 appear, set INTERPOSER_DYLIB= to enable run 3. +EOF diff --git a/scripts/wrapper_overhead_passthrough.c b/scripts/wrapper_overhead_passthrough.c new file mode 100644 index 00000000..146dae84 --- /dev/null +++ b/scripts/wrapper_overhead_passthrough.c @@ -0,0 +1,152 @@ +// wrapper_overhead_passthrough.c — a bare malloc/free interposer that does +// NOTHING beyond what an empty wrapper does. No header, no counters, no +// enable check, no TLS, no atomics. Each replacement_* is a single- +// instruction tail call to libc. +// +// Used by wrapper_overhead.sh to isolate the cost of the wrapper layer +// itself — independent of any bookkeeping you might layer on top. +// +// macOS path: DYLD_INTERPOSE entries route malloc/free through us via +// the standard __DATA,__interpose section. Internal calls to malloc/free +// inside this dylib resolve directly to libsystem. +// +// Linux path: defining `malloc` / `free` in an LD_PRELOAD'd shared object +// overrides the global symbol resolution. We forward to the real libc +// entries via dlsym(RTLD_NEXT, …). 
// The resolve dance is a small one-time
// cost amortised away after warmup, so it doesn't pollute the measurement.

// Feature-test macros only take effect when defined before the FIRST system
// header: glibc hides RTLD_NEXT in <dlfcn.h> unless _GNU_SOURCE is already
// set when its feature header is first processed. Defining it further down,
// after <stdlib.h>, is too late.
#if !defined(__APPLE__) && !defined(_GNU_SOURCE)
#define _GNU_SOURCE
#endif

#include <stdlib.h>

#if defined(__APPLE__)

// Emits one {replacement, replacee} pair into the __DATA,__interpose section
// of this dylib, which is the section named in the attribute below.
#define DYLD_INTERPOSE(_replacement, _replacee) \
    __attribute__((used)) static struct { \
        const void *replacement; \
        const void *replacee; \
    } _interpose_##_replacee __attribute__((section("__DATA,__interpose"))) = { \
        (const void *)&_replacement, (const void *)&_replacee \
    };

// Each wrapper forwards its arguments unchanged and returns the result
// unchanged — no bookkeeping of any kind.
void *replacement_malloc(size_t size) { return malloc(size); }
void replacement_free(void *p) { free(p); }
void *replacement_calloc(size_t n, size_t s) { return calloc(n, s); }
void *replacement_realloc(void *p, size_t s) { return realloc(p, s); }
void *replacement_reallocf(void *p, size_t s) { return reallocf(p, s); }
void *replacement_valloc(size_t s) { return valloc(s); }
int replacement_posix_memalign(void **m, size_t a, size_t s) {
    return posix_memalign(m, a, s);
}

DYLD_INTERPOSE(replacement_malloc, malloc)
DYLD_INTERPOSE(replacement_free, free)
DYLD_INTERPOSE(replacement_calloc, calloc)
DYLD_INTERPOSE(replacement_realloc, realloc)
DYLD_INTERPOSE(replacement_reallocf, reallocf)
DYLD_INTERPOSE(replacement_valloc, valloc)
DYLD_INTERPOSE(replacement_posix_memalign, posix_memalign)

#else /* Linux */

// On Linux we resolve the real libc functions via dlsym(RTLD_NEXT, …) and
// cache the function pointers. The wrinkle: dlsym itself can call calloc
// internally during symbol resolution, which would recurse back into our
// hooks. We guard against that with a thread-local "in dlsym" flag and a
// small static bootstrap buffer that absorbs any allocations made while
// resolving.
//
// After resolution completes (which happens during the constructor, before
// the bench's hot loop runs), the steady-state hot path is just:
//     ldr x_real_fn
//     blr x_real_fn
// — one load, one indirect call. Same shape as glibc's own PLT stub, so
// the wrapper-layer cost is just the extra branch.

#include <dlfcn.h>
#include <string.h>

// Safety net for builds where a system header was already processed before
// _GNU_SOURCE could be set (e.g. a forced -include). The value matches the
// glibc and musl definitions.
#ifndef RTLD_NEXT
#define RTLD_NEXT ((void *)-1l)
#endif

static void *(*real_malloc)(size_t) = NULL;
static void (*real_free)(void *) = NULL;
static void *(*real_calloc)(size_t, size_t) = NULL;
static void *(*real_realloc)(void *, size_t) = NULL;

// TLS guard: set while we're inside dlsym so any reentrant malloc/calloc/
// realloc/free calls go to the bootstrap path instead of recursing.
static __thread int g_in_resolve = 0;

// Small static buffer for allocations made during dlsym resolution.
// 64 KiB is more than enough — dlsym typically allocates only a handful of
// small objects during the first call. Aligned to 16 bytes so that, together
// with the 16-byte rounding in boot_alloc, every block handed out is
// 16-byte aligned.
static char g_boot_mem[64 * 1024] __attribute__((aligned(16)));
static size_t g_boot_off = 0;

// Returns non-zero when p points into the bootstrap buffer.
static int boot_owns(const void *p) {
    return (const char *)p >= g_boot_mem &&
           (const char *)p < g_boot_mem + sizeof(g_boot_mem);
}

// Bump-allocates n bytes (rounded up to 16) from the bootstrap buffer.
// Returns NULL when the buffer cannot satisfy the request. Blocks are never
// reclaimed.
static void *boot_alloc(size_t n) {
    size_t remaining = sizeof(g_boot_mem) - g_boot_off;
    // Checking n BEFORE rounding also rejects sizes close to SIZE_MAX, whose
    // rounded value would wrap to a tiny number. `remaining` is always a
    // multiple of 16, so n <= remaining implies the rounded size fits too.
    if (n > remaining) return NULL;
    size_t aligned = (n + 15) & ~(size_t)15;
    void *p = g_boot_mem + g_boot_off;
    g_boot_off += aligned;
    return p;
}

// calloc on top of boot_alloc: returns NULL if n * s overflows size_t or the
// bootstrap buffer is exhausted; otherwise a zeroed block of n * s bytes.
static void *boot_calloc(size_t n, size_t s) {
    if (s != 0 && n > (size_t)-1 / s) return NULL;
    size_t total = n * s;
    void *p = boot_alloc(total);
    if (p) memset(p, 0, total);
    return p;
}

// Looks up the next definitions of malloc/free/calloc/realloc after this
// object. g_in_resolve is raised for the duration so allocations made by
// dlsym itself are served from the bootstrap buffer.
static void resolve_real(void) {
    g_in_resolve = 1;
    real_malloc = dlsym(RTLD_NEXT, "malloc");
    real_free = dlsym(RTLD_NEXT, "free");
    real_calloc = dlsym(RTLD_NEXT, "calloc");
    real_realloc = dlsym(RTLD_NEXT, "realloc");
    g_in_resolve = 0;
}

// Resolve at load time so the measured loop only ever sees the fast path.
__attribute__((constructor)) static void preresolve(void) {
    resolve_real();
}

// malloc replacement: forwards to the real malloc once resolved; before that,
// serves from the bootstrap buffer (re-entrant call) or resolves first.
void *malloc(size_t s) {
    if (__builtin_expect(real_malloc != NULL, 1)) return real_malloc(s);
    if (g_in_resolve) return boot_alloc(s);
    resolve_real();
    return real_malloc ? real_malloc(s) : boot_alloc(s);
}

// free replacement: NULL and bootstrap blocks are ignored; everything else is
// forwarded to the real free.
void free(void *p) {
    if (!p) return;
    if (boot_owns(p)) return; // bootstrap blocks have no underlying chunk
    if (__builtin_expect(real_free != NULL, 1)) { real_free(p); return; }
    if (g_in_resolve) return;
    resolve_real();
    if (real_free) real_free(p);
}

// calloc replacement: same dispatch as malloc, with an overflow-checked,
// zeroed bootstrap fallback.
void *calloc(size_t n, size_t s) {
    if (__builtin_expect(real_calloc != NULL, 1)) return real_calloc(n, s);
    if (g_in_resolve) return boot_calloc(n, s);
    resolve_real();
    return real_calloc ? real_calloc(n, s) : boot_calloc(n, s);
}

// realloc replacement. Bootstrap blocks cannot be resized in place, so they
// are copied out into a fresh allocation; all other pointers are forwarded.
void *realloc(void *p, size_t s) {
    if (boot_owns(p)) {
        // Can't realloc a bootstrap allocation in place; copy out via malloc.
        void *np = malloc(s);
        if (np) {
            // The original size of a bootstrap block is not recorded, so
            // clamp the copy to what is left of the buffer after p. Copying
            // a full s bytes could read past the end of g_boot_mem.
            size_t avail =
                (size_t)((g_boot_mem + sizeof(g_boot_mem)) - (const char *)p);
            memcpy(np, p, s < avail ? s : avail);
        }
        return np;
    }
    if (__builtin_expect(real_realloc != NULL, 1)) return real_realloc(p, s);
    if (g_in_resolve) return boot_alloc(s);
    resolve_real();
    return real_realloc ? real_realloc(p, s) : boot_alloc(s);
}

#endif