173 changes: 145 additions & 28 deletions src/bitpacker4x.rs
@@ -83,48 +83,165 @@ mod neon {

use super::BLOCK_LEN;
use crate::Available;
use std::arch::aarch64::{
uint32x4_t, vaddq_u32, vandq_u32, vdupq_n_u32, vextq_u32, vgetq_lane_u32, vld1q_u32,
vorrq_u32, vshlq_n_u32, vshrq_n_u32, vst1q_u32, vsubq_u32,
};

pub(crate) type DataType = uint32x4_t;

#[inline]
/// Creates a vector with all elements set to `el`.
unsafe fn set1(el: i32) -> DataType {
vdupq_n_u32(el as u32)
}

#[inline]
unsafe fn right_shift_32<const N: i32>(el: DataType) -> DataType {
const {
assert!(N >= 0);
assert!(N <= 32);
}

// We unroll here because vshrq_n_u32 only accepts constants from 1 to 32.
match N {
0 => el,
1 => vshrq_n_u32::<1>(el),
2 => vshrq_n_u32::<2>(el),
3 => vshrq_n_u32::<3>(el),
4 => vshrq_n_u32::<4>(el),
5 => vshrq_n_u32::<5>(el),
6 => vshrq_n_u32::<6>(el),
7 => vshrq_n_u32::<7>(el),
8 => vshrq_n_u32::<8>(el),
9 => vshrq_n_u32::<9>(el),
10 => vshrq_n_u32::<10>(el),
11 => vshrq_n_u32::<11>(el),
12 => vshrq_n_u32::<12>(el),
13 => vshrq_n_u32::<13>(el),
14 => vshrq_n_u32::<14>(el),
15 => vshrq_n_u32::<15>(el),
16 => vshrq_n_u32::<16>(el),
17 => vshrq_n_u32::<17>(el),
18 => vshrq_n_u32::<18>(el),
19 => vshrq_n_u32::<19>(el),
20 => vshrq_n_u32::<20>(el),
21 => vshrq_n_u32::<21>(el),
22 => vshrq_n_u32::<22>(el),
23 => vshrq_n_u32::<23>(el),
24 => vshrq_n_u32::<24>(el),
25 => vshrq_n_u32::<25>(el),
26 => vshrq_n_u32::<26>(el),
27 => vshrq_n_u32::<27>(el),
28 => vshrq_n_u32::<28>(el),
29 => vshrq_n_u32::<29>(el),
30 => vshrq_n_u32::<30>(el),
31 => vshrq_n_u32::<31>(el),
32 => vdupq_n_u32(0),
_ => core::hint::unreachable_unchecked(),
}
}

#[inline]
unsafe fn left_shift_32<const N: i32>(el: DataType) -> DataType {
const {
assert!(N >= 0);
assert!(N <= 32);
}

// We unroll here because vshlq_n_u32 only accepts constants from 0 to 31.
match N {
0 => el,
1 => vshlq_n_u32::<1>(el),
2 => vshlq_n_u32::<2>(el),
3 => vshlq_n_u32::<3>(el),
4 => vshlq_n_u32::<4>(el),
5 => vshlq_n_u32::<5>(el),
6 => vshlq_n_u32::<6>(el),
7 => vshlq_n_u32::<7>(el),
8 => vshlq_n_u32::<8>(el),
9 => vshlq_n_u32::<9>(el),
10 => vshlq_n_u32::<10>(el),
11 => vshlq_n_u32::<11>(el),
12 => vshlq_n_u32::<12>(el),
13 => vshlq_n_u32::<13>(el),
14 => vshlq_n_u32::<14>(el),
15 => vshlq_n_u32::<15>(el),
16 => vshlq_n_u32::<16>(el),
17 => vshlq_n_u32::<17>(el),
18 => vshlq_n_u32::<18>(el),
19 => vshlq_n_u32::<19>(el),
20 => vshlq_n_u32::<20>(el),
21 => vshlq_n_u32::<21>(el),
22 => vshlq_n_u32::<22>(el),
23 => vshlq_n_u32::<23>(el),
24 => vshlq_n_u32::<24>(el),
25 => vshlq_n_u32::<25>(el),
26 => vshlq_n_u32::<26>(el),
27 => vshlq_n_u32::<27>(el),
28 => vshlq_n_u32::<28>(el),
29 => vshlq_n_u32::<29>(el),
30 => vshlq_n_u32::<30>(el),
31 => vshlq_n_u32::<31>(el),
32 => vdupq_n_u32(0),
_ => core::hint::unreachable_unchecked(),
}
}
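
// Usage sketch (illustrative only, not part of this change): because `N` is a
// const generic, the match above is resolved at monomorphization time, so a
// call such as `right_shift_32::<7>(v)` lowers to a single shift instruction
// rather than a runtime branch. The helper below is a hypothetical example of
// composing the two shifts to keep only the low 7 bits of each lane.
#[inline]
unsafe fn keep_low_7_bits_sketch(v: DataType) -> DataType {
    // Clear the upper 25 bits of each lane by shifting left then right.
    right_shift_32::<25>(left_shift_32::<25>(v))
}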

use vorrq_u32 as op_or;

#[inline]
unsafe fn op_and(left: DataType, right: DataType) -> DataType {
vandq_u32(left, right)
}

#[inline]
unsafe fn load_unaligned(addr: *const DataType) -> DataType {
vld1q_u32(addr.cast::<u32>())
}

#[inline]
unsafe fn store_unaligned(addr: *mut DataType, data: DataType) {
vst1q_u32(addr.cast::<u32>(), data);
}

#[inline]
/// Collapses the vector by performing a bitwise OR across all lanes
unsafe fn or_collapse_to_u32(acc: DataType) -> u32 {
vgetq_lane_u32(acc, 0)
| vgetq_lane_u32(acc, 1)
| vgetq_lane_u32(acc, 2)
| vgetq_lane_u32(acc, 3)
}

use super::scalar::add;
use super::scalar::left_shift_32;
use super::scalar::load_unaligned;
use super::scalar::op_and;
use super::scalar::op_or;
use super::scalar::or_collapse_to_u32;
use super::scalar::right_shift_32;
use super::scalar::set1;
use super::scalar::store_unaligned;
use super::scalar::sub;
use super::scalar::DataType;
use std::arch::aarch64::{vaddq_u32, vdupq_n_u32, vextq_u32, vld1q_u32, vst1q_u32, vsubq_u32};

#[target_feature(enable = "neon")]
unsafe fn compute_delta(curr: DataType, prev: DataType) -> DataType {
let c = vld1q_u32(curr.as_ptr());
let p = vld1q_u32(prev.as_ptr());
let mut r = set1(0);
vst1q_u32(r.as_mut_ptr(), vsubq_u32(c, vextq_u32(p, c, 3)));
r
// Build a vector with [prev[3], curr[0], curr[1], curr[2]]
let prev_shifted = vextq_u32(prev, curr, 3);
vsubq_u32(curr, prev_shifted)
}
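
// Scalar view of the same delta step (an illustrative sketch, not part of this
// change): each output lane is the difference between a value and its
// predecessor, the first lane's predecessor being the last value of the
// previous vector.
fn compute_delta_scalar_sketch(curr: [u32; 4], prev_last: u32) -> [u32; 4] {
    [
        curr[0].wrapping_sub(prev_last),
        curr[1].wrapping_sub(curr[0]),
        curr[2].wrapping_sub(curr[1]),
        curr[3].wrapping_sub(curr[2]),
    ]
}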

#[target_feature(enable = "neon")]
#[allow(non_snake_case)]
#[inline]
unsafe fn integrate_delta(prev: DataType, delta: DataType) -> DataType {
let base = vdupq_n_u32(prev[3]);
let base = vdupq_n_u32(vgetq_lane_u32(prev, 3));
let zero = vdupq_n_u32(0);
let a__b__c__d_ = vld1q_u32(delta.as_ptr());
let a__b__c__d_ = delta;
let ______a__b_ = vextq_u32(zero, a__b__c__d_, 2);
let a__b__ca_db = vaddq_u32(______a__b_, a__b__c__d_);
let ___a__b__ca = vextq_u32(zero, a__b__ca_db, 3);
let a_ab_abc_abcd = vaddq_u32(___a__b__ca, a__b__ca_db);
let mut r = set1(0);
vst1q_u32(r.as_mut_ptr(), vaddq_u32(base, a_ab_abc_abcd));
r
vaddq_u32(base, a_ab_abc_abcd)
}
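
// Scalar view of the prefix sum above (an illustrative sketch, not part of
// this change): starting from the last value of the previous vector, each
// output lane is the running sum of the deltas,
// i.e. [a, b, c, d] -> [p + a, p + a + b, p + a + b + c, p + a + b + c + d].
fn integrate_delta_scalar_sketch(prev_last: u32, delta: [u32; 4]) -> [u32; 4] {
    let mut acc = prev_last;
    delta.map(|d| {
        acc = acc.wrapping_add(d);
        acc
    })
}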

// TODO trinity-1686a: I believe add/sub are easy enough for the compiler to optimize on its
// own, and suspect a hand-rolled impl would force (un)loading registers and make things
// slower overall
#[inline]
unsafe fn add(left: DataType, right: DataType) -> DataType {
vaddq_u32(left, right)
}

#[inline]
unsafe fn sub(left: DataType, right: DataType) -> DataType {
vsubq_u32(left, right)
}

declare_bitpacker!(target_feature(enable = "neon"));

23 changes: 21 additions & 2 deletions src/bitpacking_bench.rs
@@ -1,10 +1,13 @@
use criterion::{Bencher, Criterion, criterion_group, criterion_main};
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
use std::time::Duration;

use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
use criterion::Benchmark;
use criterion::Throughput;

const NUM_BLOCKS: usize = 10;
const SAMPLE_SIZE: usize = 10;
const WARM_UP_TIME: Duration = Duration::from_millis(50);

fn integrate_data(initial: u32, data: &mut [u32]) {
let mut cumul = initial;
@@ -245,6 +248,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
Benchmark::new(format!("decompress-{num_bit}").as_str(), move |b| {
bench_decompress_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
})
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -254,6 +259,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
Benchmark::new(format!("decompress-delta-{num_bit}").as_str(), move |b| {
bench_decompress_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
})
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -266,6 +273,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
bench_decompress_strict_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
},
)
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -275,6 +284,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
Benchmark::new(format!("compress-{num_bit}").as_str(), move |b| {
bench_compress_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
})
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -284,6 +295,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
Benchmark::new(format!("compress-delta-{num_bit}").as_str(), move |b| {
bench_compress_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
})
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -296,6 +309,8 @@ fn criterion_benchmark_bitpacker<TBitPacker: BitPacker + 'static>(
bench_compress_strict_delta_util::<TBitPacker>(bitpacker, b, &num_bits[..]);
},
)
.warm_up_time(WARM_UP_TIME)
.sample_size(SAMPLE_SIZE)
.throughput(Throughput::Elements(
(NUM_BLOCKS * TBitPacker::BLOCK_LEN) as u64,
)),
@@ -309,5 +324,9 @@ fn criterion_benchmark(criterion: &mut Criterion) {
criterion_benchmark_bitpacker("BitPacker8x", BitPacker8x::new(), criterion);
}

criterion_group!(benches, criterion_benchmark);
criterion_group! {
name = benches;
config = Criterion::default().warm_up_time(Duration::from_millis(50));
targets = criterion_benchmark
}
criterion_main!(benches);
4 changes: 2 additions & 2 deletions src/tests.rs
@@ -1,8 +1,8 @@
use super::most_significant_bit;
use super::UnsafeBitPacker;
use rand::distributions::{Distribution as _, Uniform};
use rand::rngs::StdRng;
use rand::SeedableRng as _;
use super::most_significant_bit;
use super::UnsafeBitPacker;

pub fn generate_array(n: usize, max_num_bits: u8) -> Vec<u32> {
assert!(max_num_bits <= 32u8);