From 98c86c54a0eb67be86ec4d31f48f6c08d8c8ae6c Mon Sep 17 00:00:00 2001
From: Pascal Seitz <pascal.seitz@datadoghq.com>
Date: Mon, 5 Jan 2026 12:41:32 +0100
Subject: [PATCH 1/3] Improve NEON instructions for BitPacker4x

Use NEON registers (uint32x4_t) instead of scalar [u32;4] arrays.
faster benches

Signed-off-by: Pascal Seitz <pascal.seitz@gmail.com>
---
 .github/workflows/ci.yml |  30 ++++++++++++
 src/bitpacker4x.rs       | 103 ++++++++++++++++++++++++++++-----------
 src/bitpacking_bench.rs  |  23 ++++++++-
 src/tests.rs             |   4 +-
 4 files changed, 128 insertions(+), 32 deletions(-)
 create mode 100644 .github/workflows/ci.yml
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..989c972
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,30 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test:
+    name: test (${{ matrix.name }})
+    runs-on: ${{ matrix.runs-on }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: x86_64
+            runs-on: ubuntu-latest
+          - name: arm64
+            runs-on: ubuntu-22.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+
+      - name: Run tests
+        run: cargo test
diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs
index 34e1853..d5a6c3a 100644
--- a/src/bitpacker4x.rs
+++ b/src/bitpacker4x.rs
@@ -83,48 +83,95 @@ mod neon {
 
     use super::BLOCK_LEN;
     use crate::Available;
+    use std::arch::aarch64::{
+        uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_s32, vdupq_n_u32, vextq_u32, vgetq_lane_u32,
+        vld1q_u32, vorrq_u32, vshlq_u32, vst1q_u32, vsubq_u32,
+    };
+
+    pub(crate) type DataType = uint32x4_t;
+
+    #[inline]
+    /// Creates a vector with all elements set to `el`.
+    unsafe fn set1(el: i32) -> DataType {
+        vdupq_n_u32(el as u32)
+    }
+
+    #[inline]
+    unsafe fn right_shift_32<const N: i32>(el: DataType) -> DataType {
+        // Create a vector with all elements set to -N
+        // negative shift amount means right shift
+        // Each lane is shifted by the corresponding value in the shift vector
+        let shift = vdupq_n_s32(-N);
+        vshlq_u32(el, shift)
+    }
+
+    #[inline]
+    unsafe fn left_shift_32<const N: i32>(el: DataType) -> DataType {
+        // Create a vector with all elements set to N
+        // positive shift amount means left shift
+        // Each lane is shifted by the corresponding value in the shift vector
+        let shift = vdupq_n_s32(N);
+        vshlq_u32(el, shift)
+    }
+
+    #[inline]
+    unsafe fn op_or(left: DataType, right: DataType) -> DataType {
+        // Bitwise OR of two vectors
+        vorrq_u32(left, right)
+    }
+
+    #[inline]
+    unsafe fn op_and(left: DataType, right: DataType) -> DataType {
+        vandq_u32(left, right)
+    }
+
+    #[inline]
+    unsafe fn load_unaligned(addr: *const DataType) -> DataType {
+        vld1q_u32(addr.cast::<u32>())
+    }
+
+    #[inline]
+    unsafe fn store_unaligned(addr: *mut DataType, data: DataType) {
+        vst1q_u32(addr.cast::<u32>(), data);
+    }
+
+    #[inline]
+    /// Collapses the vector by performing a bitwise OR across all lanes
+    unsafe fn or_collapse_to_u32(acc: DataType) -> u32 {
+        vgetq_lane_u32(acc, 0)
+            | vgetq_lane_u32(acc, 1)
+            | vgetq_lane_u32(acc, 2)
+            | vgetq_lane_u32(acc, 3)
+    }
 
-    use super::scalar::add;
-    use super::scalar::left_shift_32;
-    use super::scalar::load_unaligned;
-    use super::scalar::op_and;
-    use super::scalar::op_or;
-    use super::scalar::or_collapse_to_u32;
-    use super::scalar::right_shift_32;
-    use super::scalar::set1;
-    use super::scalar::store_unaligned;
-    use super::scalar::sub;
-    use super::scalar::DataType;
-    use std::arch::aarch64::{vaddq_u32, vdupq_n_u32, vextq_u32, vld1q_u32, vst1q_u32, vsubq_u32};
-
-    #[target_feature(enable = "neon")]
     unsafe fn compute_delta(curr: DataType, prev: DataType) -> DataType {
-        let c = vld1q_u32(curr.as_ptr());
-        let p = vld1q_u32(prev.as_ptr());
-        let mut r = set1(0);
-        vst1q_u32(r.as_mut_ptr(), vsubq_u32(c, vextq_u32(p, c, 3)));
-        r
+        // Build a vector with [prev[3], curr[0], curr[1], curr[2]]
+        let prev_shifted = vextq_u32(prev, curr, 3);
+        vsubq_u32(curr, prev_shifted)
     }
 
-    #[target_feature(enable = "neon")]
     #[allow(non_snake_case)]
     #[inline]
     unsafe fn integrate_delta(prev: DataType, delta: DataType) -> DataType {
-        let base = vdupq_n_u32(prev[3]);
+        let base = vdupq_n_u32(vgetq_lane_u32(prev, 3));
         let zero = vdupq_n_u32(0);
-        let a__b__c__d_ = vld1q_u32(delta.as_ptr());
+        let a__b__c__d_ = delta;
         let ______a__b_ = vextq_u32(zero, a__b__c__d_, 2);
         let a__b__ca_db = vaddq_u32(______a__b_, a__b__c__d_);
         let ___a__b__ca = vextq_u32(zero, a__b__ca_db, 3);
         let a_ab_abc_abcd = vaddq_u32(___a__b__ca, a__b__ca_db);
-        let mut r = set1(0);
-        vst1q_u32(r.as_mut_ptr(), vaddq_u32(base, a_ab_abc_abcd));
-        r
+        vaddq_u32(base, a_ab_abc_abcd)
+    }
+
+    #[inline]
+    unsafe fn add(left: DataType, right: DataType) -> DataType {
+        vaddq_u32(left, right)
     }
 
-    // TODO trinity-1686a: I believe add/sub are easy enough for the compiler to optimize on its
-    // own, and suspect hand-rolled impl would force (un)loading registers and make things slower
-    // overall
+    #[inline]
+    unsafe fn sub(left: DataType, right: DataType) -> DataType {
+        vsubq_u32(left, right)
+    }
 
     declare_bitpacker!(target_feature(enable = "neon"));
 
diff --git a/src/bitpacking_bench.rs b/src/bitpacking_bench.rs
index 9cd8c30..eb09c61 100644
--- a/src/bitpacking_bench.rs
+++ b/src/bitpacking_bench.rs
@@ -1,10 +1,13 @@
-use criterion::{Bencher, Criterion, criterion_group, criterion_main};
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use std::time::Duration;
 
 use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
 use criterion::Benchmark;
 use criterion::Throughput;
 
 const NUM_BLOCKS: usize = 10;
+const SAMPLE_SIZE: usize = 10;
+const WARM_UP_TIME: Duration = Duration::from_millis(50);
 
 fn integrate_data(initial: u32, data: &mut [u32]) {
     let mut cumul = initial;
@@ -245,6 +248,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
             Benchmark::new(format!("decompress-{num_bit}").as_str(), move |b| {
                 bench_decompress_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
             })
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -254,6 +259,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
             Benchmark::new(format!("decompress-delta-{num_bit}").as_str(), move |b| {
                 bench_decompress_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
             })
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -266,6 +273,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
                     bench_decompress_strict_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
                 },
             )
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -275,6 +284,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
             Benchmark::new(format!("compress-{num_bit}").as_str(), move |b| {
                 bench_compress_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
             })
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -284,6 +295,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
             Benchmark::new(format!("compress-delta-{num_bit}").as_str(), move |b| {
                 bench_compress_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
             })
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -296,6 +309,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
                     bench_compress_strict_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
                 },
             )
+            .warm_up_time(WARM_UP_TIME)
+            .sample_size(SAMPLE_SIZE)
             .throughput(Throughput::Elements(
                 (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
             )),
@@ -309,5 +324,9 @@ fn criterion_benchmark(criterion: &mut Criterion) {
     criterion_benchmark_bitpacker("BitPacker8x", BitPacker8x::new(), criterion);
 }
 
-criterion_group!(benches, criterion_benchmark);
+criterion_group! {
+    name = benches;
+    config = Criterion::default().warm_up_time(Duration::from_millis(50));
+    targets = criterion_benchmark
+}
 criterion_main!(benches);
diff --git a/src/tests.rs b/src/tests.rs
index fc15694..0fb9280 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -1,8 +1,8 @@
+use super::most_significant_bit;
+use super::UnsafeBitPacker;
 use rand::distributions::{Distribution as _, Uniform};
 use rand::rngs::StdRng;
 use rand::SeedableRng as _;
-use super::most_significant_bit;
-use super::UnsafeBitPacker;
 
 pub fn generate_array(n: usize, max_num_bits: u8) -> Vec<u32> {
     assert!(max_num_bits <= 32u8);

From 7e8a0091201abdcef29ace158eb25b78d8072c6a Mon Sep 17 00:00:00 2001
From: Pascal Seitz <pascal.seitz@gmail.com>
Date: Tue, 6 Jan 2026 14:26:33 +0100
Subject: [PATCH 2/3] compile time shifts

---
 .github/workflows/ci.yml | 30 --------------
 src/bitpacker4x.rs       | 88 ++++++++++++++++++++++++++++++++++------
 2 files changed, 76 insertions(+), 42 deletions(-)
 delete mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index 989c972..0000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: CI
-
-on:
-  push:
-  pull_request:
-
-jobs:
-  test:
-    name: test (${{ matrix.name }})
-    runs-on: ${{ matrix.runs-on }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - name: x86_64
-            runs-on: ubuntu-latest
-          - name: arm64
-            runs-on: ubuntu-22.04-arm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-
-      - name: Rust cache
-        uses: Swatinem/rust-cache@v2
-
-      - name: Run tests
-        run: cargo test
diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs
index d5a6c3a..d493fe9 100644
--- a/src/bitpacker4x.rs
+++ b/src/bitpacker4x.rs
@@ -84,8 +84,8 @@ mod neon {
     use super::BLOCK_LEN;
     use crate::Available;
     use std::arch::aarch64::{
-        uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_s32, vdupq_n_u32, vextq_u32, vgetq_lane_u32,
-        vld1q_u32, vorrq_u32, vshlq_u32, vst1q_u32, vsubq_u32,
+        uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_u32, vextq_u32, vgetq_lane_u32, vld1q_u32,
+        vorrq_u32, vshlq_n_u32, vshrq_n_u32, vst1q_u32, vsubq_u32,
     };
 
     pub(crate) type DataType = uint32x4_t;
@@ -98,20 +98,84 @@ mod neon {
 
     #[inline]
     unsafe fn right_shift_32<const N: i32>(el: DataType) -> DataType {
-        // Create a vector with all elements set to -N
-        // negative shift amount means right shift
-        // Each lane is shifted by the corresponding value in the shift vector
-        let shift = vdupq_n_s32(-N);
-        vshlq_u32(el, shift)
+        // We unroll here because vshrq_n_u32 only accepts constants from 1 to 32.
+        match N {
+            0 => el,
+            1 => vshrq_n_u32::<1>(el),
+            2 => vshrq_n_u32::<2>(el),
+            3 => vshrq_n_u32::<3>(el),
+            4 => vshrq_n_u32::<4>(el),
+            5 => vshrq_n_u32::<5>(el),
+            6 => vshrq_n_u32::<6>(el),
+            7 => vshrq_n_u32::<7>(el),
+            8 => vshrq_n_u32::<8>(el),
+            9 => vshrq_n_u32::<9>(el),
+            10 => vshrq_n_u32::<10>(el),
+            11 => vshrq_n_u32::<11>(el),
+            12 => vshrq_n_u32::<12>(el),
+            13 => vshrq_n_u32::<13>(el),
+            14 => vshrq_n_u32::<14>(el),
+            15 => vshrq_n_u32::<15>(el),
+            16 => vshrq_n_u32::<16>(el),
+            17 => vshrq_n_u32::<17>(el),
+            18 => vshrq_n_u32::<18>(el),
+            19 => vshrq_n_u32::<19>(el),
+            20 => vshrq_n_u32::<20>(el),
+            21 => vshrq_n_u32::<21>(el),
+            22 => vshrq_n_u32::<22>(el),
+            23 => vshrq_n_u32::<23>(el),
+            24 => vshrq_n_u32::<24>(el),
+            25 => vshrq_n_u32::<25>(el),
+            26 => vshrq_n_u32::<26>(el),
+            27 => vshrq_n_u32::<27>(el),
+            28 => vshrq_n_u32::<28>(el),
+            29 => vshrq_n_u32::<29>(el),
+            30 => vshrq_n_u32::<30>(el),
+            31 => vshrq_n_u32::<31>(el),
+            32 => vshrq_n_u32::<32>(el),
+            _ => core::hint::unreachable_unchecked(),
+        }
     }
 
     #[inline]
     unsafe fn left_shift_32<const N: i32>(el: DataType) -> DataType {
-        // Create a vector with all elements set to N
-        // positive shift amount means left shift
-        // Each lane is shifted by the corresponding value in the shift vector
-        let shift = vdupq_n_s32(N);
-        vshlq_u32(el, shift)
+        // We unroll here because vshlq_n_u32 only accepts constants from 1 to 32.
+        match N {
+            0 => el,
+            1 => vshlq_n_u32::<1>(el),
+            2 => vshlq_n_u32::<2>(el),
+            3 => vshlq_n_u32::<3>(el),
+            4 => vshlq_n_u32::<4>(el),
+            5 => vshlq_n_u32::<5>(el),
+            6 => vshlq_n_u32::<6>(el),
+            7 => vshlq_n_u32::<7>(el),
+            8 => vshlq_n_u32::<8>(el),
+            9 => vshlq_n_u32::<9>(el),
+            10 => vshlq_n_u32::<10>(el),
+            11 => vshlq_n_u32::<11>(el),
+            12 => vshlq_n_u32::<12>(el),
+            13 => vshlq_n_u32::<13>(el),
+            14 => vshlq_n_u32::<14>(el),
+            15 => vshlq_n_u32::<15>(el),
+            16 => vshlq_n_u32::<16>(el),
+            17 => vshlq_n_u32::<17>(el),
+            18 => vshlq_n_u32::<18>(el),
+            19 => vshlq_n_u32::<19>(el),
+            20 => vshlq_n_u32::<20>(el),
+            21 => vshlq_n_u32::<21>(el),
+            22 => vshlq_n_u32::<22>(el),
+            23 => vshlq_n_u32::<23>(el),
+            24 => vshlq_n_u32::<24>(el),
+            25 => vshlq_n_u32::<25>(el),
+            26 => vshlq_n_u32::<26>(el),
+            27 => vshlq_n_u32::<27>(el),
+            28 => vshlq_n_u32::<28>(el),
+            29 => vshlq_n_u32::<29>(el),
+            30 => vshlq_n_u32::<30>(el),
+            31 => vshlq_n_u32::<31>(el),
+            32 => vdupq_n_u32(0),
+            _ => core::hint::unreachable_unchecked(),
+        }
     }
 
     #[inline]

From 855806bda8ea8bac24b953ad87b057bfb53c76b6 Mon Sep 17 00:00:00 2001
From: trinity Pointard <trinity.pointard@datadoghq.com>
Date: Thu, 8 Jan 2026 14:39:38 +0100
Subject: [PATCH 3/3] apply cr comments

---
 src/bitpacker4x.rs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs
index d493fe9..7cbe5f4 100644
--- a/src/bitpacker4x.rs
+++ b/src/bitpacker4x.rs
@@ -98,6 +98,11 @@ mod neon {
 
     #[inline]
     unsafe fn right_shift_32<const N: i32>(el: DataType) -> DataType {
+        const {
+                assert!(N >= 0);
+                assert!(N <= 32);
+        }
+
         // We unroll here because vshrq_n_u32 only accepts constants from 1 to 32.
         match N {
             0 => el,
@@ -132,14 +137,19 @@ mod neon {
             29 => vshrq_n_u32::<29>(el),
             30 => vshrq_n_u32::<30>(el),
             31 => vshrq_n_u32::<31>(el),
-            32 => vshrq_n_u32::<32>(el),
+            32 => vdupq_n_u32(0),
             _ => core::hint::unreachable_unchecked(),
         }
     }
 
     #[inline]
     unsafe fn left_shift_32<const N: i32>(el: DataType) -> DataType {
-        // We unroll here because vshlq_n_u32 only accepts constants from 1 to 32.
+        const {
+                assert!(N >= 0);
+                assert!(N <= 32);
+        }
+
+        // We unroll here because vshlq_n_u32 only accepts constants from 0 to 31.
         match N {
             0 => el,
             1 => vshlq_n_u32::<1>(el),
@@ -178,11 +188,7 @@ mod neon {
         }
     }
 
-    #[inline]
-    unsafe fn op_or(left: DataType, right: DataType) -> DataType {
-        // Bitwise OR of two vectors
-        vorrq_u32(left, right)
-    }
+    use vorrq_u32 as op_or;
 
     #[inline]
     unsafe fn op_and(left: DataType, right: DataType) -> DataType {