From 98c86c54a0eb67be86ec4d31f48f6c08d8c8ae6c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 5 Jan 2026 12:41:32 +0100 Subject: [PATCH 1/3] Improve NEON instructions for BitPacker4x Use NEON registers (uint32x4_t) instead of scalar [u32;4] arrays. faster benches Signed-off-by: Pascal Seitz --- .github/workflows/ci.yml | 30 ++++++++++++ src/bitpacker4x.rs | 103 ++++++++++++++++++++++++++++----------- src/bitpacking_bench.rs | 23 ++++++++- src/tests.rs | 4 +- 4 files changed, 128 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..989c972 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI + +on: + push: + pull_request: + +jobs: + test: + name: test (${{ matrix.name }}) + runs-on: ${{ matrix.runs-on }} + strategy: + fail-fast: false + matrix: + include: + - name: x86_64 + runs-on: ubuntu-latest + - name: arm64 + runs-on: ubuntu-22.04-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Rust cache + uses: Swatinem/rust-cache@v2 + + - name: Run tests + run: cargo test diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs index 34e1853..d5a6c3a 100644 --- a/src/bitpacker4x.rs +++ b/src/bitpacker4x.rs @@ -83,48 +83,95 @@ mod neon { use super::BLOCK_LEN; use crate::Available; + use std::arch::aarch64::{ + uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_s32, vdupq_n_u32, vextq_u32, vgetq_lane_u32, + vld1q_u32, vorrq_u32, vshlq_u32, vst1q_u32, vsubq_u32, + }; + + pub(crate) type DataType = uint32x4_t; + + #[inline] + /// Creates a vector with all elements set to `el`. + unsafe fn set1(el: i32) -> DataType { + vdupq_n_u32(el as u32) + } + + #[inline] + unsafe fn right_shift_32(el: DataType) -> DataType { + // Create a vector with all elements set to -N + // negative shift amount means right shift + // Each lane is shifted by the corresponding value in the shift vector + let shift = vdupq_n_s32(-N); + vshlq_u32(el, shift) + } + + #[inline] + unsafe fn left_shift_32(el: DataType) -> DataType { + // Create a vector with all elements set to N + // positive shift amount means left shift + // Each lane is shifted by the corresponding value in the shift vector + let shift = vdupq_n_s32(N); + vshlq_u32(el, shift) + } + + #[inline] + unsafe fn op_or(left: DataType, right: DataType) -> DataType { + // Bitwise OR of two vectors + vorrq_u32(left, right) + } + + #[inline] + unsafe fn op_and(left: DataType, right: DataType) -> DataType { + vandq_u32(left, right) + } + + #[inline] + unsafe fn load_unaligned(addr: *const DataType) -> DataType { + vld1q_u32(addr.cast::()) + } + + #[inline] + unsafe fn store_unaligned(addr: *mut DataType, data: DataType) { + vst1q_u32(addr.cast::(), data); + } + + #[inline] + /// Collapses the vector by performing a bitwise OR across all lanes + unsafe fn or_collapse_to_u32(acc: DataType) -> u32 { + vgetq_lane_u32(acc, 0) + | vgetq_lane_u32(acc, 1) + | vgetq_lane_u32(acc, 2) + | vgetq_lane_u32(acc, 3) + } - use super::scalar::add; - use super::scalar::left_shift_32; - use super::scalar::load_unaligned; - use super::scalar::op_and; - use super::scalar::op_or; - use super::scalar::or_collapse_to_u32; - use super::scalar::right_shift_32; - use super::scalar::set1; - use super::scalar::store_unaligned; - use super::scalar::sub; - use super::scalar::DataType; - use std::arch::aarch64::{vaddq_u32, vdupq_n_u32, vextq_u32, vld1q_u32, vst1q_u32, vsubq_u32}; - - #[target_feature(enable = "neon")] unsafe fn compute_delta(curr: DataType, prev: DataType) -> DataType { - let c = vld1q_u32(curr.as_ptr()); - let p = vld1q_u32(prev.as_ptr()); - let mut r = set1(0); - vst1q_u32(r.as_mut_ptr(), vsubq_u32(c, vextq_u32(p, c, 3))); - r + // Build a vector with [prev[3], curr[0], curr[1], curr[2]] + let prev_shifted = vextq_u32(prev, curr, 3); + vsubq_u32(curr, prev_shifted) } - #[target_feature(enable = "neon")] #[allow(non_snake_case)] #[inline] unsafe fn integrate_delta(prev: DataType, delta: DataType) -> DataType { - let base = vdupq_n_u32(prev[3]); + let base = vdupq_n_u32(vgetq_lane_u32(prev, 3)); let zero = vdupq_n_u32(0); - let a__b__c__d_ = vld1q_u32(delta.as_ptr()); + let a__b__c__d_ = delta; let ______a__b_ = vextq_u32(zero, a__b__c__d_, 2); let a__b__ca_db = vaddq_u32(______a__b_, a__b__c__d_); let ___a__b__ca = vextq_u32(zero, a__b__ca_db, 3); let a_ab_abc_abcd = vaddq_u32(___a__b__ca, a__b__ca_db); - let mut r = set1(0); - vst1q_u32(r.as_mut_ptr(), vaddq_u32(base, a_ab_abc_abcd)); - r + vaddq_u32(base, a_ab_abc_abcd) + } + + #[inline] + unsafe fn add(left: DataType, right: DataType) -> DataType { + vaddq_u32(left, right) } - // TODO trinity-1686a: I believe add/sub are easy enough for the compiler to optimize on its - // own, and suspect hand-rolled impl would force (un)loading registers and make things slower - // overall + #[inline] + unsafe fn sub(left: DataType, right: DataType) -> DataType { + vsubq_u32(left, right) + } declare_bitpacker!(target_feature(enable = "neon")); diff --git a/src/bitpacking_bench.rs b/src/bitpacking_bench.rs index 9cd8c30..eb09c61 100644 --- a/src/bitpacking_bench.rs +++ b/src/bitpacking_bench.rs @@ -1,10 +1,13 @@ -use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use std::time::Duration; use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x}; use criterion::Benchmark; use criterion::Throughput; const NUM_BLOCKS: usize = 10; +const SAMPLE_SIZE: usize = 10; +const WARM_UP_TIME: Duration = Duration::from_millis(50); fn integrate_data(initial: u32, data: &mut [u32]) { let mut cumul = initial; @@ -245,6 +248,8 @@ fn criterion_benchmark_bitpacker( Benchmark::new(format!("decompress-{num_bit}").as_str(), move |b| { bench_decompress_util::(bitpacker, b, &num_bits[..]); }) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -254,6 +259,8 @@ fn criterion_benchmark_bitpacker( Benchmark::new(format!("decompress-delta-{num_bit}").as_str(), move |b| { bench_decompress_delta_util::(bitpacker, b, &num_bits[..]); }) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -266,6 +273,8 @@ fn criterion_benchmark_bitpacker( bench_decompress_strict_delta_util::(bitpacker, b, &num_bits[..]); }, ) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -275,6 +284,8 @@ fn criterion_benchmark_bitpacker( Benchmark::new(format!("compress-{num_bit}").as_str(), move |b| { bench_compress_util::(bitpacker, b, &num_bits[..]); }) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -284,6 +295,8 @@ fn criterion_benchmark_bitpacker( Benchmark::new(format!("compress-delta-{num_bit}").as_str(), move |b| { bench_compress_delta_util::(bitpacker, b, &num_bits[..]); }) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -296,6 +309,8 @@ fn criterion_benchmark_bitpacker( bench_compress_strict_delta_util::(bitpacker, b, &num_bits[..]); }, ) + .warm_up_time(WARM_UP_TIME) + .sample_size(SAMPLE_SIZE) .throughput(Throughput::Elements( (NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64, )), @@ -309,5 +324,9 @@ fn criterion_benchmark(criterion: &mut Criterion) { criterion_benchmark_bitpacker("BitPacker8x", BitPacker8x::new(), criterion); } -criterion_group!(benches, criterion_benchmark); +criterion_group! { + name = benches; + config = Criterion::default().warm_up_time(Duration::from_millis(50)); + targets = criterion_benchmark +} criterion_main!(benches); diff --git a/src/tests.rs b/src/tests.rs index fc15694..0fb9280 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,8 +1,8 @@ +use super::most_significant_bit; +use super::UnsafeBitPacker; use rand::distributions::{Distribution as _, Uniform}; use rand::rngs::StdRng; use rand::SeedableRng as _; -use super::most_significant_bit; -use super::UnsafeBitPacker; pub fn generate_array(n: usize, max_num_bits: u8) -> Vec { assert!(max_num_bits <= 32u8); From 7e8a0091201abdcef29ace158eb25b78d8072c6a Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 6 Jan 2026 14:26:33 +0100 Subject: [PATCH 2/3] compile time shifts --- .github/workflows/ci.yml | 30 -------------- src/bitpacker4x.rs | 88 ++++++++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 42 deletions(-) delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 989c972..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: CI - -on: - push: - pull_request: - -jobs: - test: - name: test (${{ matrix.name }}) - runs-on: ${{ matrix.runs-on }} - strategy: - fail-fast: false - matrix: - include: - - name: x86_64 - runs-on: ubuntu-latest - - name: arm64 - runs-on: ubuntu-22.04-arm - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Install Rust - uses: dtolnay/rust-toolchain@stable - - - name: Rust cache - uses: Swatinem/rust-cache@v2 - - - name: Run tests - run: cargo test diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs index d5a6c3a..d493fe9 100644 --- a/src/bitpacker4x.rs +++ b/src/bitpacker4x.rs @@ -84,8 +84,8 @@ mod neon { use super::BLOCK_LEN; use crate::Available; use std::arch::aarch64::{ - uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_s32, vdupq_n_u32, vextq_u32, vgetq_lane_u32, - vld1q_u32, vorrq_u32, vshlq_u32, vst1q_u32, vsubq_u32, + uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_u32, vextq_u32, vgetq_lane_u32, vld1q_u32, + vorrq_u32, vshlq_n_u32, vshrq_n_u32, vst1q_u32, vsubq_u32, }; pub(crate) type DataType = uint32x4_t; @@ -98,20 +98,84 @@ mod neon { #[inline] unsafe fn right_shift_32(el: DataType) -> DataType { - // Create a vector with all elements set to -N - // negative shift amount means right shift - // Each lane is shifted by the corresponding value in the shift vector - let shift = vdupq_n_s32(-N); - vshlq_u32(el, shift) + // We unroll here because vshrq_n_u32 only accepts constants from 1 to 32. + match N { + 0 => el, + 1 => vshrq_n_u32::<1>(el), + 2 => vshrq_n_u32::<2>(el), + 3 => vshrq_n_u32::<3>(el), + 4 => vshrq_n_u32::<4>(el), + 5 => vshrq_n_u32::<5>(el), + 6 => vshrq_n_u32::<6>(el), + 7 => vshrq_n_u32::<7>(el), + 8 => vshrq_n_u32::<8>(el), + 9 => vshrq_n_u32::<9>(el), + 10 => vshrq_n_u32::<10>(el), + 11 => vshrq_n_u32::<11>(el), + 12 => vshrq_n_u32::<12>(el), + 13 => vshrq_n_u32::<13>(el), + 14 => vshrq_n_u32::<14>(el), + 15 => vshrq_n_u32::<15>(el), + 16 => vshrq_n_u32::<16>(el), + 17 => vshrq_n_u32::<17>(el), + 18 => vshrq_n_u32::<18>(el), + 19 => vshrq_n_u32::<19>(el), + 20 => vshrq_n_u32::<20>(el), + 21 => vshrq_n_u32::<21>(el), + 22 => vshrq_n_u32::<22>(el), + 23 => vshrq_n_u32::<23>(el), + 24 => vshrq_n_u32::<24>(el), + 25 => vshrq_n_u32::<25>(el), + 26 => vshrq_n_u32::<26>(el), + 27 => vshrq_n_u32::<27>(el), + 28 => vshrq_n_u32::<28>(el), + 29 => vshrq_n_u32::<29>(el), + 30 => vshrq_n_u32::<30>(el), + 31 => vshrq_n_u32::<31>(el), + 32 => vshrq_n_u32::<32>(el), + _ => core::hint::unreachable_unchecked(), + } } #[inline] unsafe fn left_shift_32(el: DataType) -> DataType { - // Create a vector with all elements set to N - // positive shift amount means left shift - // Each lane is shifted by the corresponding value in the shift vector - let shift = vdupq_n_s32(N); - vshlq_u32(el, shift) + // We unroll here because vshlq_n_u32 only accepts constants from 1 to 32. + match N { + 0 => el, + 1 => vshlq_n_u32::<1>(el), + 2 => vshlq_n_u32::<2>(el), + 3 => vshlq_n_u32::<3>(el), + 4 => vshlq_n_u32::<4>(el), + 5 => vshlq_n_u32::<5>(el), + 6 => vshlq_n_u32::<6>(el), + 7 => vshlq_n_u32::<7>(el), + 8 => vshlq_n_u32::<8>(el), + 9 => vshlq_n_u32::<9>(el), + 10 => vshlq_n_u32::<10>(el), + 11 => vshlq_n_u32::<11>(el), + 12 => vshlq_n_u32::<12>(el), + 13 => vshlq_n_u32::<13>(el), + 14 => vshlq_n_u32::<14>(el), + 15 => vshlq_n_u32::<15>(el), + 16 => vshlq_n_u32::<16>(el), + 17 => vshlq_n_u32::<17>(el), + 18 => vshlq_n_u32::<18>(el), + 19 => vshlq_n_u32::<19>(el), + 20 => vshlq_n_u32::<20>(el), + 21 => vshlq_n_u32::<21>(el), + 22 => vshlq_n_u32::<22>(el), + 23 => vshlq_n_u32::<23>(el), + 24 => vshlq_n_u32::<24>(el), + 25 => vshlq_n_u32::<25>(el), + 26 => vshlq_n_u32::<26>(el), + 27 => vshlq_n_u32::<27>(el), + 28 => vshlq_n_u32::<28>(el), + 29 => vshlq_n_u32::<29>(el), + 30 => vshlq_n_u32::<30>(el), + 31 => vshlq_n_u32::<31>(el), + 32 => vdupq_n_u32(0), + _ => core::hint::unreachable_unchecked(), + } } #[inline] From 855806bda8ea8bac24b953ad87b057bfb53c76b6 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Thu, 8 Jan 2026 14:39:38 +0100 Subject: [PATCH 3/3] apply cr comments --- src/bitpacker4x.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs index d493fe9..7cbe5f4 100644 --- a/src/bitpacker4x.rs +++ b/src/bitpacker4x.rs @@ -98,6 +98,11 @@ mod neon { #[inline] unsafe fn right_shift_32(el: DataType) -> DataType { + const { + assert!(N >= 0); + assert!(N <= 32); + } + // We unroll here because vshrq_n_u32 only accepts constants from 1 to 32. match N { 0 => el, @@ -132,14 +137,19 @@ mod neon { 29 => vshrq_n_u32::<29>(el), 30 => vshrq_n_u32::<30>(el), 31 => vshrq_n_u32::<31>(el), - 32 => vshrq_n_u32::<32>(el), + 32 => vdupq_n_u32(0), _ => core::hint::unreachable_unchecked(), } } #[inline] unsafe fn left_shift_32(el: DataType) -> DataType { - // We unroll here because vshlq_n_u32 only accepts constants from 1 to 32. + const { + assert!(N >= 0); + assert!(N <= 32); + } + + // We unroll here because vshlq_n_u32 only accepts constants from 0 to 31. match N { 0 => el, 1 => vshlq_n_u32::<1>(el), @@ -178,11 +188,7 @@ mod neon { } } - #[inline] - unsafe fn op_or(left: DataType, right: DataType) -> DataType { - // Bitwise OR of two vectors - vorrq_u32(left, right) - } + use vorrq_u32 as op_or; #[inline] unsafe fn op_and(left: DataType, right: DataType) -> DataType {