From 7c9df0bf420d4ae23b1b041b220ea797e8ae21b6 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Thu, 31 Oct 2024 17:07:20 -0700 Subject: [PATCH 01/11] WIP: layer functions --- cuda/include/gkr.cuh | 16 +++ cuda/src/gkr.cu | 31 +++++ stwo_gpu_backend/src/cuda/bindings.rs | 23 ++++ .../src/examples/wide_fibonacci.rs | 2 +- stwo_gpu_backend/src/lookups/gkr.rs | 111 +++++++++++++++++- 5 files changed, 180 insertions(+), 3 deletions(-) diff --git a/cuda/include/gkr.cuh b/cuda/include/gkr.cuh index 88a65d8..247f423 100644 --- a/cuda/include/gkr.cuh +++ b/cuda/include/gkr.cuh @@ -3,7 +3,23 @@ #include "fields.cuh" +template +struct Fraction{ + T numerator; + qm31 denominator; +}; + +__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs); +__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs); + extern "C" void gen_eq_evals(qm31 v, qm31 *y, uint32_t y_size, qm31 *evals, uint32_t evals_size); +void next_grand_product_layer(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size); + +void next_logup_generic_layer( + qm31 *numerators, uint32_t numerators_size, qm31 *denominators, uint32_t denominators_size, + qm31 *next_numerators, uint32_t next_numerators_size, qm31 *next_denominators, uint32_t next_denominators_size +); + #endif // GKR_H diff --git a/cuda/src/gkr.cu b/cuda/src/gkr.cu index af52b76..1a9a394 100644 --- a/cuda/src/gkr.cu +++ b/cuda/src/gkr.cu @@ -36,3 +36,34 @@ void gen_eq_evals(qm31 v, qm31 *y, uint32_t y_size, qm31 *evals, uint32_t evals_ cudaFree(factors_device); } + +__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { + qm31 numerator = add(mul(lhs.numerator, rhs.denominator), mul(rhs.numerator, lhs.denominator)); + qm31 denominator = mul(lhs.denominator, rhs.denominator); + return Fraction {numerator, denominator}; +} + +__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { + qm31 numerator = add(mul(lhs.numerator, rhs.denominator), mul(rhs.numerator, lhs.denominator)); + qm31 denominator = mul(lhs.denominator, rhs.denominator); + return Fraction {numerator, denominator}; +} + + +__global__ void next_grand_product_layer_kernel(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < next_layer_size) { + next_layer[tid] = mul(layer[tid * 2], layer[tid * 2 + 1]); + } +} + +void next_grand_product_layer(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (layer_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + uint32_t next_layer_size = layer_size / 2; + qm31 *next_layer = (qm31 *)malloc(sizeof(qm31) * next_layer_size); + next_grand_product_layer_kernel<<>>(layer, layer_size, next_layer, next_layer_size); + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/stwo_gpu_backend/src/cuda/bindings.rs b/stwo_gpu_backend/src/cuda/bindings.rs index 9d5b096..100e923 100644 --- a/stwo_gpu_backend/src/cuda/bindings.rs +++ b/stwo_gpu_backend/src/cuda/bindings.rs @@ -5,6 +5,8 @@ use stwo_prover::core::{ fields::{m31::BaseField, qm31::SecureField}, }; +use super::{BaseFieldVec, SecureFieldVec}; + #[repr(C)] pub struct CudaSecureField { a: BaseField, @@ -268,4 +270,25 @@ extern "C" { extended_domain_size: u32, number_of_columns: u32, ); + + pub fn next_grand_product_layer( + layer: *const CudaSecureField, + layer_size: usize, + next_layer: *const CudaSecureField, + next_layer_size: usize, + ); + + pub fn next_logup_generic_layer( + numerators: *const SecureFieldVec, + numerators_size: usize, + denominators: *const SecureFieldVec, + denominators_size: usize, + ); + + pub fn next_secure_logup_layer( + numerators: *const BaseFieldVec, + numerators_size: usize, + denominators: *const SecureFieldVec, + denominators_size: usize, + ); } diff --git a/stwo_gpu_backend/src/examples/wide_fibonacci.rs b/stwo_gpu_backend/src/examples/wide_fibonacci.rs index 3941eac..b718a01 100644 --- a/stwo_gpu_backend/src/examples/wide_fibonacci.rs +++ b/stwo_gpu_backend/src/examples/wide_fibonacci.rs @@ -200,7 +200,7 @@ mod test { } } - #[test_log::test] + //#[test_log::test] fn test_cuda_constraints_for_wide_fib_prove() { // Note: To see time measurement, run test with // RUST_LOG_SPAN_EVENTS=enter,close RUST_LOG=info RUST_BACKTRACE=1 diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 1e71156..25422b3 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -2,7 +2,7 @@ use crate::CudaBackend; use stwo_prover::core::{ fields::{m31::BaseField, qm31::SecureField}, lookups::{ - gkr_prover::GkrOps, + gkr_prover::{GkrOps, Layer}, mle::{Mle, MleOps}, }, }; @@ -30,7 +30,18 @@ impl GkrOps for CudaBackend { fn next_layer( layer: &stwo_prover::core::lookups::gkr_prover::Layer, ) -> stwo_prover::core::lookups::gkr_prover::Layer { - todo!() + match layer { + Layer::GrandProduct(col) => next_grand_product_layer(col), + Layer::LogUpGeneric { + numerators, + denominators, + } => next_logup_generic_layer(numerators, denominators), + Layer::LogUpMultiplicities { + numerators, + denominators, + } => next_logup_multiplicities_layer(numerators, denominators), + Layer::LogUpSingles { denominators } => next_logup_singles_layer(denominators), + } } fn sum_as_poly_in_first_variable( @@ -41,6 +52,43 @@ impl GkrOps for CudaBackend { } } +fn next_grand_product_layer(layer: &Mle) -> Layer { + let next_layer_size = layer.size / 2; + let next_layer = SecureFieldVec::new_uninitialized(next_layer_size); + unsafe { + bindings::next_grand_product_layer(layer.device_ptr, layer.size, next_layer.device_ptr, next_layer_size); + } + Layer::GrandProduct(Mle::new(next_layer)) +} + +fn next_logup_generic_layer( + numerators: MleExpr<'_, F>, + denominators: &Mle, +) -> Layer +where + F: Field, + SecureField: ExtensionOf, + CudaBackend: MleOps, +{ + let next_layer_len = denominators.len() / 2; + let next_numerators = SecureFieldVec::new_uninitialized(next_layer_len); + let next_denominators = SecureFieldVec::new_uninitialized(next_layer_len); + + + for i in 0..half_n { + let a = Fraction::new(numerators[i * 2], denominators[i * 2]); + let b = Fraction::new(numerators[i * 2 + 1], denominators[i * 2 + 1]); + let res = a + b; + next_numerators.push(res.numerator); + next_denominators.push(res.denominator); + } + + Layer::LogUpGeneric { + numerators: Mle::new(next_numerators), + denominators: Mle::new(next_denominators), + } +} + mod tests { use itertools::Itertools; use crate::CudaBackend; @@ -49,6 +97,18 @@ mod tests { use stwo_prover::core::fields::qm31::SecureField; use stwo_prover::core::lookups::gkr_prover::GkrOps; + + use stwo_prover::core::backend::simd::SimdBackend; + // use stwo_prover::core::backend::{Column, CpuBackend}; + // use stwo_prover::core::channel::Channel; + // use stwo_prover::core::fields::m31::BaseField; + // use stwo_prover::core::fields::qm31::SecureField; + use stwo_prover::core::lookups::gkr_prover::{prove_batch, Layer}; + use stwo_prover::core::lookups::gkr_verifier::{partially_verify_batch, Gate, GkrArtifact, GkrError}; + use stwo_prover::core::lookups::mle::Mle; + use stwo_prover::core::lookups::utils::Fraction; + use stwo_prover::core::channel::{Blake2sChannel, Channel}; + #[test] fn gen_eq_evals_matches_cpu() { let two = BaseField::from(2).into(); @@ -63,4 +123,51 @@ mod tests { assert_eq!(gpu_eq_evals.to_cpu(), *cpu_eq_evals); } + + // #[test] + // fn grand_product_works() { + // const N: usize = 1 << 8; + // let values = Blake2sChannel::default().draw_felts(N); + // let product = values.iter().product(); + // let col = Mle::::new(values.into_iter().collect()); + // let input_layer = Layer::GrandProduct(col.clone()); + // let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![input_layer]); + + // let GkrArtifact { + // ood_point, + // claims_to_verify_by_instance, + // n_variables_by_instance: _, + // } = partially_verify_batch(vec![Gate::GrandProduct], &proof, &mut Blake2sChannel::default()).unwrap(); + + // assert_eq!(proof.output_claims_by_instance, [vec![product]]); + // assert_eq!( + // claims_to_verify_by_instance, + // [vec![eval_at_point(&col, &ood_point)]] + // ); + // } + + pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField { + pub fn eval(mle_evals: &[SecureField], p: &[SecureField]) -> SecureField { + match p { + [] => mle_evals[0], + &[p_i, ref p @ ..] => { + let (lhs, rhs) = mle_evals.split_at(mle_evals.len() / 2); + let lhs_eval = eval(lhs, p); + let rhs_eval = eval(rhs, p); + // Equivalent to `eq(0, p_i) * lhs_eval + eq(1, p_i) * rhs_eval`. + p_i * (rhs_eval - lhs_eval) + lhs_eval + } + } + } + + let mle_evals = input + .clone() + .into_evals() + .to_cpu() + .into_iter() + .map(|v| v.into()) + .collect::>(); + + eval(&mle_evals, point) + } } \ No newline at end of file From 20e8968391279e3c28c430273b9519efcde11a26 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Sun, 3 Nov 2024 01:36:03 -0700 Subject: [PATCH 02/11] WIP: templating eval_grand_product_sum --- stwo_gpu_backend/src/column.rs | 12 +- stwo_gpu_backend/src/cuda/bindings.rs | 36 +++-- stwo_gpu_backend/src/lookups/gkr.rs | 221 +++++++++++++++++++------- stwo_gpu_backend/src/lookups/mle.rs | 2 +- 4 files changed, 198 insertions(+), 73 deletions(-) diff --git a/stwo_gpu_backend/src/column.rs b/stwo_gpu_backend/src/column.rs index 2d3bd72..e071112 100644 --- a/stwo_gpu_backend/src/column.rs +++ b/stwo_gpu_backend/src/column.rs @@ -4,7 +4,7 @@ use stwo_prover::core::{ fields::{m31::BaseField, qm31::SecureField}, }; -use crate::cuda::{bindings, BaseFieldVec}; +use crate::cuda::{bindings, BaseFieldVec, SecureFieldVec}; use crate::{backend::CudaBackend, cuda}; impl ColumnOps for CudaBackend { @@ -83,8 +83,9 @@ impl Column for cuda::SecureFieldVec { self.size } - fn at(&self, _index: usize) -> SecureField { - todo!() + fn at(&self, index: usize) -> SecureField { + // TODO: call binding to return value directly + self.to_cpu()[index] } fn set(&mut self, _index: usize, _value: SecureField) { @@ -100,8 +101,9 @@ impl Column for cuda::SecureFieldVec { } impl FromIterator for cuda::SecureFieldVec { - fn from_iter>(_iter: T) -> Self { - todo!() + fn from_iter>(iter: T) -> Self { + let secure_field_vec: Vec = iter.into_iter().collect(); + SecureFieldVec::from_vec(secure_field_vec) } } diff --git a/stwo_gpu_backend/src/cuda/bindings.rs b/stwo_gpu_backend/src/cuda/bindings.rs index 100e923..7e98ce0 100644 --- a/stwo_gpu_backend/src/cuda/bindings.rs +++ b/stwo_gpu_backend/src/cuda/bindings.rs @@ -7,6 +7,11 @@ use stwo_prover::core::{ use super::{BaseFieldVec, SecureFieldVec}; +#[repr(C)] +pub struct CudaBaseField { + a: BaseField, +} + #[repr(C)] pub struct CudaSecureField { a: BaseField, @@ -279,16 +284,29 @@ extern "C" { ); pub fn next_logup_generic_layer( - numerators: *const SecureFieldVec, - numerators_size: usize, - denominators: *const SecureFieldVec, - denominators_size: usize, + numerators: *const CudaSecureField, + denominators: *const CudaSecureField, + size: usize, + next_numerators: *const CudaSecureField, + next_denominators: *const CudaSecureField, + next_size: usize, ); - pub fn next_secure_logup_layer( - numerators: *const BaseFieldVec, - numerators_size: usize, - denominators: *const SecureFieldVec, - denominators_size: usize, + pub fn next_logup_multiplicities_layer( + numerators: *const CudaBaseField, + denominators: *const CudaSecureField, + size: usize, + next_numerators: *const CudaSecureField, + next_denominators: *const CudaSecureField, + next_size: usize, ); + + pub fn next_logup_singles_layer( + denominators: *const CudaSecureField, + size: usize, + next_numerators: *const CudaSecureField, + next_denominators: *const CudaSecureField, + next_size: usize, + ); + } diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 25422b3..12b68b0 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -1,14 +1,16 @@ use crate::CudaBackend; +use num_traits::Zero; use stwo_prover::core::{ - fields::{m31::BaseField, qm31::SecureField}, - lookups::{ - gkr_prover::{GkrOps, Layer}, - mle::{Mle, MleOps}, - }, + backend::Column, fields::{m31::BaseField, qm31::SecureField}, lookups::{ + gkr_prover::{correct_sum_as_poly_in_first_variable, GkrOps, Layer}, + mle::{Mle, MleOps}, sumcheck::MultivariatePolyOracle, + } }; -use crate::cuda::{bindings, SecureFieldVec}; +use crate::cuda::{bindings, BaseFieldVec, SecureFieldVec}; use crate::cuda::bindings::CudaSecureField; +use self::bindings::CudaBaseField; + impl GkrOps for CudaBackend { fn gen_eq_evals(y: &[SecureField], v: SecureField) -> Mle { let y_size = y.len(); @@ -30,58 +32,153 @@ impl GkrOps for CudaBackend { fn next_layer( layer: &stwo_prover::core::lookups::gkr_prover::Layer, ) -> stwo_prover::core::lookups::gkr_prover::Layer { - match layer { - Layer::GrandProduct(col) => next_grand_product_layer(col), - Layer::LogUpGeneric { - numerators, - denominators, - } => next_logup_generic_layer(numerators, denominators), - Layer::LogUpMultiplicities { - numerators, - denominators, - } => next_logup_multiplicities_layer(numerators, denominators), - Layer::LogUpSingles { denominators } => next_logup_singles_layer(denominators), + + if let Layer::GrandProduct(col) = layer { + next_grand_product_layer(col) + } + else { + layer.clone() } + // match layer { + // Layer::GrandProduct(col) => next_grand_product_layer(col), + // Layer::LogUpGeneric { + // numerators, + // denominators, + // } => next_logup_generic_layer::(numerators, denominators), + // Layer::LogUpMultiplicities { + // numerators, + // denominators, + // } => next_logup_multiplicities_layer::(numerators, denominators), + // Layer::LogUpSingles { denominators } => next_logup_singles_layer::(denominators), + // } } fn sum_as_poly_in_first_variable( h: &stwo_prover::core::lookups::gkr_prover::GkrMultivariatePolyOracle<'_, Self>, claim: SecureField, ) -> stwo_prover::core::lookups::utils::UnivariatePoly { - todo!() + // 1. Create each layer sum check + // 2. Polynomial interpolation check + + let n_variables = h.n_variables(); + assert!(!n_variables.is_zero()); + let n_terms = 1 << (n_variables - 1); + let eq_evals = h.eq_evals.as_ref(); + // Vector used to generate evaluations of `eq(x, y)` for `x` in the boolean hypercube. + let y = eq_evals.y(); + let lambda = h.lambda; + + let (mut eval_at_0, mut eval_at_2) = { + if let Layer::GrandProduct(col) = &h.input_layer { + eval_grand_product_sum(eq_evals, col, n_terms) + } + }; + + // let (mut eval_at_0, mut eval_at_2) = match &h.input_layer { + // Layer::GrandProduct(col) => eval_grand_product_sum(eq_evals, col, n_terms), + // Layer::LogUpGeneric { + // numerators, + // denominators, + // } => eval_logup_sum(eq_evals, numerators, denominators, n_terms, lambda), + // Layer::LogUpMultiplicities { + // numerators, + // denominators, + // } => eval_logup_sum(eq_evals, numerators, denominators, n_terms, lambda), + // Layer::LogUpSingles { denominators } => { + // eval_logup_singles_sum(eq_evals, denominators, n_terms, lambda) + // } + // }; + + eval_at_0 *= h.eq_fixed_var_correction; + eval_at_2 *= h.eq_fixed_var_correction; + correct_sum_as_poly_in_first_variable(eval_at_0, eval_at_2, claim, y, n_variables) } } -fn next_grand_product_layer(layer: &Mle) -> Layer { +fn eval_grand_product_sum( + eq_evals: &EqEvals, + input_layer: &Mle, + n_terms: usize, +) -> (SecureField, SecureField) { + +fn next_grand_product_layer(layer: &Mle) -> Layer { let next_layer_size = layer.size / 2; let next_layer = SecureFieldVec::new_uninitialized(next_layer_size); + unsafe { - bindings::next_grand_product_layer(layer.device_ptr, layer.size, next_layer.device_ptr, next_layer_size); - } + bindings::next_grand_product_layer( + layer.device_ptr as *const CudaSecureField, + layer.size, + next_layer.device_ptr as *const CudaSecureField, + next_layer_size); + }; + Layer::GrandProduct(Mle::new(next_layer)) } fn next_logup_generic_layer( - numerators: MleExpr<'_, F>, + numerators: &Mle, denominators: &Mle, -) -> Layer -where - F: Field, - SecureField: ExtensionOf, - CudaBackend: MleOps, -{ +) -> Layer { let next_layer_len = denominators.len() / 2; let next_numerators = SecureFieldVec::new_uninitialized(next_layer_len); let next_denominators = SecureFieldVec::new_uninitialized(next_layer_len); - - for i in 0..half_n { - let a = Fraction::new(numerators[i * 2], denominators[i * 2]); - let b = Fraction::new(numerators[i * 2 + 1], denominators[i * 2 + 1]); - let res = a + b; - next_numerators.push(res.numerator); - next_denominators.push(res.denominator); + unsafe { + bindings::next_logup_generic_layer( + numerators.device_ptr as *const CudaSecureField, + denominators.device_ptr as *const CudaSecureField, + numerators.size, + next_numerators.device_ptr as *const CudaSecureField, + next_denominators.device_ptr as *const CudaSecureField, + next_layer_len); + }; + + Layer::LogUpGeneric { + numerators: Mle::new(next_numerators), + denominators: Mle::new(next_denominators), } +} + +fn next_logup_multiplicities_layer( + numerators: &Mle, + denominators: &Mle, +) -> Layer { + let next_layer_len = denominators.len() / 2; + let next_numerators = SecureFieldVec::new_uninitialized(next_layer_len); + let next_denominators = SecureFieldVec::new_uninitialized(next_layer_len); + + unsafe { + bindings::next_logup_multiplicities_layer( + numerators.device_ptr as *const CudaBaseField, + denominators.device_ptr as *const CudaSecureField, + numerators.size, + next_numerators.device_ptr as *const CudaSecureField, + next_denominators.device_ptr as *const CudaSecureField, + next_layer_len); + }; + + Layer::LogUpGeneric { + numerators: Mle::new(next_numerators), + denominators: Mle::new(next_denominators), + } +} + +fn next_logup_singles_layer( + denominators: &Mle, +) -> Layer { + let next_layer_len = denominators.len() / 2; + let next_numerators = SecureFieldVec::new_uninitialized(next_layer_len); + let next_denominators = SecureFieldVec::new_uninitialized(next_layer_len); + + unsafe { + bindings::next_logup_singles_layer( + denominators.device_ptr as *const CudaSecureField, + denominators.size, + next_numerators.device_ptr as *const CudaSecureField, + next_denominators.device_ptr as *const CudaSecureField, + next_layer_len); + }; Layer::LogUpGeneric { numerators: Mle::new(next_numerators), @@ -124,29 +221,37 @@ mod tests { assert_eq!(gpu_eq_evals.to_cpu(), *cpu_eq_evals); } - // #[test] - // fn grand_product_works() { - // const N: usize = 1 << 8; - // let values = Blake2sChannel::default().draw_felts(N); - // let product = values.iter().product(); - // let col = Mle::::new(values.into_iter().collect()); - // let input_layer = Layer::GrandProduct(col.clone()); - // let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![input_layer]); - - // let GkrArtifact { - // ood_point, - // claims_to_verify_by_instance, - // n_variables_by_instance: _, - // } = partially_verify_batch(vec![Gate::GrandProduct], &proof, &mut Blake2sChannel::default()).unwrap(); - - // assert_eq!(proof.output_claims_by_instance, [vec![product]]); - // assert_eq!( - // claims_to_verify_by_instance, - // [vec![eval_at_point(&col, &ood_point)]] - // ); - // } - - pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField { + #[test] + fn grand_product_works() { + + const N: usize = 1 << 8; + let values = Blake2sChannel::default().draw_felts(N); + + let product = values.iter().product(); + + let col_gpu = Mle::::new(values.clone().into_iter().collect()); + let col_cpu = Mle::::new(values.into_iter().collect()); + + let input_layer = Layer::GrandProduct(col_gpu.clone()); + println!("sd"); + + let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![input_layer]); + + let GkrArtifact { + ood_point, + claims_to_verify_by_instance, + n_variables_by_instance: _, + } = partially_verify_batch(vec![Gate::GrandProduct], &proof, &mut Blake2sChannel::default()).unwrap(); + + // let col = Mle::::new(col.to_cpu()); + assert_eq!(proof.output_claims_by_instance, [vec![product]]); + assert_eq!( + claims_to_verify_by_instance, + [vec![eval_at_point(&col_cpu, &ood_point)]] + ); + } + + pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField { pub fn eval(mle_evals: &[SecureField], p: &[SecureField]) -> SecureField { match p { [] => mle_evals[0], diff --git a/stwo_gpu_backend/src/lookups/mle.rs b/stwo_gpu_backend/src/lookups/mle.rs index 8b2ea1a..eac3793 100644 --- a/stwo_gpu_backend/src/lookups/mle.rs +++ b/stwo_gpu_backend/src/lookups/mle.rs @@ -77,7 +77,7 @@ mod tests { let mle_cpu = Mle::::new(values); let random_assignment = SecureField::from_u32_unchecked(7, 12, 3, 2); let mle_fixed_cpu = MleOps::::fix_first_variable(mle_cpu, random_assignment); - + let mle_fixed_simd = MleOps::::fix_first_variable(mle_cuda, random_assignment); assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) From c759b035af3d05545452f205d7226e7d149994c4 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Mon, 4 Nov 2024 03:59:00 -0800 Subject: [PATCH 03/11] Added: gkr grand product --- stwo_gpu_backend/src/cuda/bindings.rs | 8 +++++++ stwo_gpu_backend/src/lookups/gkr.rs | 34 +++++++++++++++++++++------ stwo_gpu_backend/src/lookups/mle.rs | 34 ++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/stwo_gpu_backend/src/cuda/bindings.rs b/stwo_gpu_backend/src/cuda/bindings.rs index 7e98ce0..8e92978 100644 --- a/stwo_gpu_backend/src/cuda/bindings.rs +++ b/stwo_gpu_backend/src/cuda/bindings.rs @@ -309,4 +309,12 @@ extern "C" { next_size: usize, ); + pub fn eval_grand_product_sum( + eq_evals: *const CudaSecureField, + input_layer: *const CudaSecureField, + n_terms: usize, + eval_at_0: *const CudaSecureField, + eval_at_2: *const CudaSecureField, + ); + } diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 12b68b0..748c7bd 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -2,7 +2,7 @@ use crate::CudaBackend; use num_traits::Zero; use stwo_prover::core::{ backend::Column, fields::{m31::BaseField, qm31::SecureField}, lookups::{ - gkr_prover::{correct_sum_as_poly_in_first_variable, GkrOps, Layer}, + gkr_prover::{correct_sum_as_poly_in_first_variable, EqEvals, GkrOps, Layer}, mle::{Mle, MleOps}, sumcheck::MultivariatePolyOracle, } }; @@ -37,6 +37,7 @@ impl GkrOps for CudaBackend { next_grand_product_layer(col) } else { + println!("not supposed to access"); layer.clone() } // match layer { @@ -72,6 +73,10 @@ impl GkrOps for CudaBackend { if let Layer::GrandProduct(col) = &h.input_layer { eval_grand_product_sum(eq_evals, col, n_terms) } + else { + println!("not supposed to access"); + (SecureField::default(), SecureField::default()) + } }; // let (mut eval_at_0, mut eval_at_2) = match &h.input_layer { @@ -100,6 +105,26 @@ fn eval_grand_product_sum( input_layer: &Mle, n_terms: usize, ) -> (SecureField, SecureField) { + // println!("n_terms: {}", n_terms); + // println!("eq_evals: {:?}", eq_evals); + // println!("mle_evals: {:?}", &*eq_evals.clone().to_cpu()); + // println!("input_layer: {:?}", input_layer.clone().into_evals().to_cpu()); + let eval_at_0 = SecureFieldVec::new_uninitialized(1); + let eval_at_2 = SecureFieldVec::new_uninitialized(1); + + unsafe { + bindings::eval_grand_product_sum( + eq_evals.device_ptr as *const CudaSecureField, + input_layer.device_ptr as *const CudaSecureField, + n_terms, + eval_at_0.device_ptr as *const CudaSecureField, + eval_at_2.device_ptr as *const CudaSecureField); + }; + + // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) +} + fn next_grand_product_layer(layer: &Mle) -> Layer { let next_layer_size = layer.size / 2; @@ -223,27 +248,22 @@ mod tests { #[test] fn grand_product_works() { - - const N: usize = 1 << 8; + const N: usize = 1 << 5; let values = Blake2sChannel::default().draw_felts(N); - let product = values.iter().product(); let col_gpu = Mle::::new(values.clone().into_iter().collect()); let col_cpu = Mle::::new(values.into_iter().collect()); let input_layer = Layer::GrandProduct(col_gpu.clone()); - println!("sd"); let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![input_layer]); - let GkrArtifact { ood_point, claims_to_verify_by_instance, n_variables_by_instance: _, } = partially_verify_batch(vec![Gate::GrandProduct], &proof, &mut Blake2sChannel::default()).unwrap(); - // let col = Mle::::new(col.to_cpu()); assert_eq!(proof.output_claims_by_instance, [vec![product]]); assert_eq!( claims_to_verify_by_instance, diff --git a/stwo_gpu_backend/src/lookups/mle.rs b/stwo_gpu_backend/src/lookups/mle.rs index eac3793..63bf160 100644 --- a/stwo_gpu_backend/src/lookups/mle.rs +++ b/stwo_gpu_backend/src/lookups/mle.rs @@ -16,7 +16,7 @@ impl MleOps for CudaBackend { ) -> Mle where Self: MleOps, - { + { let evals_size = mle.len(); let result_evals = SecureFieldVec::new_uninitialized(evals_size >> 1); unsafe { @@ -39,6 +39,8 @@ impl MleOps for CudaBackend { where Self: MleOps, { + println!("mle {:?}", mle.clone().to_cpu()); + let evals_size = mle.len(); let result_evals = SecureFieldVec::new_uninitialized(evals_size >> 1); unsafe { @@ -49,6 +51,8 @@ impl MleOps for CudaBackend { result_evals.device_ptr, ) } + println!("evals {:?}\n", result_evals.clone().to_cpu()); + Mle::new(result_evals) } } @@ -103,4 +107,32 @@ mod tests { assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) } + + + #[test] + fn fix_first_variable_with_secure_field_mle_matches_cpu_test() { + const N_VARIABLES: u32 = 8; + + let values = (0..(1 << N_VARIABLES) * 4) + .collect_vec() + .chunks(4) + .map(|a| SecureField::from_u32_unchecked(a[0], a[1], a[2], a[3])) + .collect_vec(); + let values = vec![ + SecureField::from_u32_unchecked(582331529, 613033660, 368449590, 1538560393), + SecureField::from_u32_unchecked(251659788, 78954062, 1801023691, 998786282), + SecureField::from_u32_unchecked(1184369939, 1548123522, 496550933, 2088405710), + SecureField::from_u32_unchecked(1158835792, 413758273, 961347360, 1684210072), + ]; + + let mle_cuda = + Mle::::new(SecureFieldVec::from_vec(values.clone())); + let mle_cpu = Mle::::new(values); + let random_assignment = SecureField::from_u32_unchecked(7, 12, 3, 2); + let mle_fixed_cpu = MleOps::::fix_first_variable(mle_cpu, random_assignment); + + let mle_fixed_simd = MleOps::::fix_first_variable(mle_cuda, random_assignment); + + assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) + } } From 362046352c93a77c67165ea6fd94d3b08edd1e84 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Mon, 4 Nov 2024 05:26:47 -0800 Subject: [PATCH 04/11] WIP: logup generic --- stwo_gpu_backend/src/cuda/bindings.rs | 29 +++++++++ stwo_gpu_backend/src/lookups/gkr.rs | 87 +++++++++++++++++++++++++-- stwo_gpu_backend/src/lookups/mle.rs | 3 - 3 files changed, 111 insertions(+), 8 deletions(-) diff --git a/stwo_gpu_backend/src/cuda/bindings.rs b/stwo_gpu_backend/src/cuda/bindings.rs index 8e92978..7723875 100644 --- a/stwo_gpu_backend/src/cuda/bindings.rs +++ b/stwo_gpu_backend/src/cuda/bindings.rs @@ -317,4 +317,33 @@ extern "C" { eval_at_2: *const CudaSecureField, ); + pub fn eval_logup_generic_sum( + eq_evals: *const CudaSecureField, + numerators: *const CudaSecureField, + denominators: *const CudaSecureField, + n_terms: usize, + lambda: CudaSecureField, + eval_at_0: *const CudaSecureField, + eval_at_2: *const CudaSecureField, + ); + + pub fn eval_logup_multiplicities_sum( + eq_evals: *const CudaSecureField, + numerators: *const CudaBaseField, + denominators: *const CudaSecureField, + n_terms: usize, + lambda: CudaSecureField, + eval_at_0: *const CudaSecureField, + eval_at_2: *const CudaSecureField, + ); + + pub fn eval_logup_singles_sum( + eq_evals: *const CudaSecureField, + denominators: *const CudaSecureField, + n_terms: usize, + lambda: CudaSecureField, + eval_at_0: *const CudaSecureField, + eval_at_2: *const CudaSecureField, + ); + } diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 748c7bd..da74fc8 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -36,6 +36,9 @@ impl GkrOps for CudaBackend { if let Layer::GrandProduct(col) = layer { next_grand_product_layer(col) } + else if let Layer::LogUpGeneric {numerators, denominators} = layer { + next_logup_generic_layer::(numerators, denominators) + } else { println!("not supposed to access"); layer.clone() @@ -73,6 +76,9 @@ impl GkrOps for CudaBackend { if let Layer::GrandProduct(col) = &h.input_layer { eval_grand_product_sum(eq_evals, col, n_terms) } + else if let Layer::LogUpGeneric {numerators, denominators} = &h.input_layer { + eval_logup_generic_sum(eq_evals, numerators, denominators, n_terms, lambda) + } else { println!("not supposed to access"); (SecureField::default(), SecureField::default()) @@ -105,10 +111,6 @@ fn eval_grand_product_sum( input_layer: &Mle, n_terms: usize, ) -> (SecureField, SecureField) { - // println!("n_terms: {}", n_terms); - // println!("eq_evals: {:?}", eq_evals); - // println!("mle_evals: {:?}", &*eq_evals.clone().to_cpu()); - // println!("input_layer: {:?}", input_layer.clone().into_evals().to_cpu()); let eval_at_0 = SecureFieldVec::new_uninitialized(1); let eval_at_2 = SecureFieldVec::new_uninitialized(1); @@ -121,7 +123,37 @@ fn eval_grand_product_sum( eval_at_2.device_ptr as *const CudaSecureField); }; - // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) +} + +fn eval_logup_generic_sum( + eq_evals: &EqEvals, + numerators: &Mle, + denominators: &Mle, + n_terms: usize, + lambda: SecureField, +) -> (SecureField, SecureField) { + let eval_at_0 = SecureFieldVec::new_uninitialized(1); + let eval_at_2 = SecureFieldVec::new_uninitialized(1); + + println!("n_terms: {}", n_terms); + println!("eq_evals: {:?}", eq_evals); + println!("mle evals: {:?}", &*eq_evals.to_cpu()); + println!("numerators: {:?}", numerators.clone().to_cpu()); + println!("denominators: {:?}", denominators.clone().to_cpu()); + + unsafe { + bindings::eval_logup_generic_sum( + eq_evals.device_ptr as *const CudaSecureField, + denominators.device_ptr as *const CudaSecureField, + denominators.device_ptr as *const CudaSecureField, + n_terms, + CudaSecureField::from(lambda), + eval_at_0.device_ptr as *const CudaSecureField, + eval_at_2.device_ptr as *const CudaSecureField); + }; + println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) } @@ -212,7 +244,10 @@ fn next_logup_singles_layer( } mod tests { + use std::iter::zip; use itertools::Itertools; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; use crate::CudaBackend; use stwo_prover::core::backend::{Column, CpuBackend}; use stwo_prover::core::fields::m31::{BaseField, M31}; @@ -271,6 +306,48 @@ mod tests { ); } + + #[test] + fn logup_with_generic_trace_works() { + const N: usize = 1 << 2; + let mut rng = SmallRng::seed_from_u64(0); + let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); + let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); + let sum = zip(&numerator_values, &denominator_values) + .map(|(&n, &d)| Fraction::new(n, d)) + .sum::>(); + let numerators = Mle::::new(numerator_values.clone().into_iter().collect()); + let denominators = Mle::::new(denominator_values.clone().into_iter().collect()); + let numerators_cpu = Mle::::new(numerator_values.into_iter().collect()); + let denominators_cpu = Mle::::new(denominator_values.into_iter().collect()); + + let top_layer = Layer::LogUpGeneric { + numerators: numerators.clone(), + denominators: denominators.clone(), + }; + let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![top_layer]); + + let GkrArtifact { + ood_point, + claims_to_verify_by_instance, + n_variables_by_instance: _, + } = partially_verify_batch(vec![Gate::LogUp], &proof, &mut Blake2sChannel::default()).unwrap(); + + assert_eq!(claims_to_verify_by_instance.len(), 1); + assert_eq!(proof.output_claims_by_instance.len(), 1); + assert_eq!( + claims_to_verify_by_instance[0], + [ + eval_at_point(&numerators_cpu, &ood_point), + eval_at_point(&denominators_cpu, &ood_point) + ] + ); + assert_eq!( + proof.output_claims_by_instance[0], + [sum.numerator, sum.denominator] + ); + } + pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField { pub fn eval(mle_evals: &[SecureField], p: &[SecureField]) -> SecureField { match p { diff --git a/stwo_gpu_backend/src/lookups/mle.rs b/stwo_gpu_backend/src/lookups/mle.rs index 63bf160..42f129f 100644 --- a/stwo_gpu_backend/src/lookups/mle.rs +++ b/stwo_gpu_backend/src/lookups/mle.rs @@ -39,8 +39,6 @@ impl MleOps for CudaBackend { where Self: MleOps, { - println!("mle {:?}", mle.clone().to_cpu()); - let evals_size = mle.len(); let result_evals = SecureFieldVec::new_uninitialized(evals_size >> 1); unsafe { @@ -51,7 +49,6 @@ impl MleOps for CudaBackend { result_evals.device_ptr, ) } - println!("evals {:?}\n", result_evals.clone().to_cpu()); Mle::new(result_evals) } From e4eb541fa88d470cb44c008a8c416d85089410f4 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 13:17:51 -0800 Subject: [PATCH 05/11] Added: logup with single and multiple --- cuda/include/gkr.cuh | 96 ++++- cuda/src/gkr.cu | 594 +++++++++++++++++++++++++++- stwo_gpu_backend/src/lookups/gkr.rs | 259 ++++++++---- 3 files changed, 860 insertions(+), 89 deletions(-) diff --git a/cuda/include/gkr.cuh b/cuda/include/gkr.cuh index 247f423..e4c9970 100644 --- a/cuda/include/gkr.cuh +++ b/cuda/include/gkr.cuh @@ -3,23 +3,101 @@ #include "fields.cuh" +extern "C" +void gen_eq_evals(qm31 v, qm31 *y, uint32_t y_size, qm31 *evals, uint32_t evals_size); + template struct Fraction{ T numerator; qm31 denominator; + + __device__ Fraction() : numerator(T{}), denominator(qm31{}) {} + __device__ Fraction(T num, qm31 denom) : numerator(num), denominator(denom) {} }; -__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs); -__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs); +__device__ Fraction add_fraction(Fraction lhs, Fraction rhs); +__device__ Fraction add_fraction(Fraction lhs, Fraction rhs); -extern "C" -void gen_eq_evals(qm31 v, qm31 *y, uint32_t y_size, qm31 *evals, uint32_t evals_size); +template +struct Reciprocal{ + T x; + + __device__ Reciprocal() : x(T{}) {} + __device__ Reciprocal(T x) : x(x) {} +}; + +__device__ Fraction add_reciprocal(Reciprocal lhs, Reciprocal rhs); + +extern "C" { + void next_grand_product_layer( + qm31 *layer, + uint32_t layer_size, + qm31 *next_layer, + uint32_t next_layer_size + ); + + void next_logup_generic_layer( + qm31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size + ); + + void next_logup_multiplicities_layer( + m31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size + ); + + void next_logup_singles_layer( + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size + ); + + void eval_grand_product_sum( + qm31 *eq_evals, + qm31 *input_layer, + uint32_t n_terms, + qm31 *eval_at_0, + qm31 *eval_at_2 + ); + + void eval_logup_generic_sum( + qm31 *eq_evals, + qm31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 + ); -void next_grand_product_layer(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size); + void eval_logup_multiplicities_sum( + qm31 *eq_evals, + m31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 + ); -void next_logup_generic_layer( - qm31 *numerators, uint32_t numerators_size, qm31 *denominators, uint32_t denominators_size, - qm31 *next_numerators, uint32_t next_numerators_size, qm31 *next_denominators, uint32_t next_denominators_size -); + void eval_logup_singles_sum( + qm31 *eq_evals, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 + ); +} #endif // GKR_H diff --git a/cuda/src/gkr.cu b/cuda/src/gkr.cu index 1a9a394..17d95ee 100644 --- a/cuda/src/gkr.cu +++ b/cuda/src/gkr.cu @@ -1,3 +1,4 @@ +#include #include "utils.cuh" #include "gkr.cuh" @@ -37,22 +38,25 @@ void gen_eq_evals(qm31 v, qm31 *y, uint32_t y_size, qm31 *evals, uint32_t evals_ cudaFree(factors_device); } -__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { +__device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { qm31 numerator = add(mul(lhs.numerator, rhs.denominator), mul(rhs.numerator, lhs.denominator)); qm31 denominator = mul(lhs.denominator, rhs.denominator); return Fraction {numerator, denominator}; } -__host__ __device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { +__device__ Fraction add_fraction(Fraction lhs, Fraction rhs) { qm31 numerator = add(mul(lhs.numerator, rhs.denominator), mul(rhs.numerator, lhs.denominator)); qm31 denominator = mul(lhs.denominator, rhs.denominator); - return Fraction {numerator, denominator}; + return Fraction(numerator, denominator); } +__device__ Fraction add_reciprocal(Reciprocal lhs, Reciprocal rhs) { + // `1/a + 1/b = (a + b)/(a * b)` + return Fraction(add(lhs.x, rhs.x), mul(lhs.x, rhs.x)); +} __global__ void next_grand_product_layer_kernel(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size) { unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < next_layer_size) { next_layer[tid] = mul(layer[tid * 2], layer[tid * 2 + 1]); } @@ -60,10 +64,582 @@ __global__ void next_grand_product_layer_kernel(qm31 *layer, uint32_t layer_size void next_grand_product_layer(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size) { const unsigned int BLOCK_SIZE = 1024; - const unsigned int NUM_BLOCKS = (layer_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - - uint32_t next_layer_size = layer_size / 2; - qm31 *next_layer = (qm31 *)malloc(sizeof(qm31) * next_layer_size); + const unsigned int NUM_BLOCKS = (next_layer_size + BLOCK_SIZE - 1) / BLOCK_SIZE; next_grand_product_layer_kernel<<>>(layer, layer_size, next_layer, next_layer_size); cudaDeviceSynchronize(); -} \ No newline at end of file +} + +// optimize(daniel): use uint4 built in? +__global__ void eval_grand_product_sum_kernel( + qm31 *eq_evals, + qm31 *input_layer, + uint32_t n_terms, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // todo: specify size of shared memory + __shared__ qm31 shared_eval_0[1024]; + __shared__ qm31 shared_eval_2[1024]; + + shared_eval_0[threadIdx.x] = {0, 0, 0, 0}; + shared_eval_2[threadIdx.x] = {0, 0, 0, 0}; + __syncthreads(); + + if (tid < n_terms) { + qm31 inp_at_r0i0 = input_layer[tid * 2]; + qm31 inp_at_r0i1 = input_layer[tid * 2 + 1]; + qm31 inp_at_r1i0 = input_layer[(n_terms + tid) * 2]; + qm31 inp_at_r1i1 = input_layer[(n_terms + tid) * 2 + 1]; + + qm31 inp_at_r2i0 = sub(add(inp_at_r1i0, inp_at_r1i0), inp_at_r0i0); // inp_at_r2i0 + qm31 inp_at_r2i1 = sub(add(inp_at_r1i1, inp_at_r1i1), inp_at_r0i1); // inp_at_r2i1 + + qm31 prod_at_r2i = mul(inp_at_r2i0, inp_at_r2i1); // prod_at_r2i + qm31 prod_at_r0i = mul(inp_at_r0i0, inp_at_r0i1); // prod_at_r0i + + shared_eval_0[threadIdx.x] = mul(eq_evals[tid], prod_at_r0i); // eq_eval_at_0i * prod_at_r0i + shared_eval_2[threadIdx.x] = mul(eq_evals[tid], prod_at_r2i); // eq_eval_at_0i * prod_at_r2i + } + __syncthreads(); + + // Perform intra-block reduction in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_eval_0[threadIdx.x] = add(shared_eval_0[threadIdx.x], shared_eval_0[threadIdx.x + s]); + shared_eval_2[threadIdx.x] = add(shared_eval_2[threadIdx.x], shared_eval_2[threadIdx.x + s]); + } + __syncthreads(); + } + + // Set global memory for each block id, grab the reduced shared thread id w.r.t. each block + if (threadIdx.x == 0) { + eval_at_0[blockIdx.x] = shared_eval_0[threadIdx.x]; + eval_at_2[blockIdx.x] = shared_eval_2[threadIdx.x]; + } +} + +void eval_grand_product_sum( + qm31 *eq_evals, + qm31 *input_layer, + uint32_t n_terms, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // Arrays for intra-block reduction + qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_0_temp_d; + qm31 *eval_at_2_temp_d; + + cudaMalloc((void **)&eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS); + cudaMalloc((void **)&eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS); + + size_t shared_mem_size = 2 * BLOCK_SIZE * sizeof(qm31); + eval_grand_product_sum_kernel<<>>( + eq_evals, + input_layer, + n_terms, + eval_at_0_temp_d, + eval_at_2_temp_d + ); + cudaDeviceSynchronize(); + + // Post intra-block reduction + // todo(daniel): move to kernel + cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + + for (int i = 1; i < NUM_BLOCKS; ++i) { + eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); + eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); + } + + cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + + free(eval_at_0_temp_h); + free(eval_at_2_temp_h); + cudaFree(eval_at_0_temp_d); + cudaFree(eval_at_2_temp_d); +} + +__global__ void next_logup_generic_layer_kernel( + qm31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); + Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); + + Fraction res = add_fraction(a, b); + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +void next_logup_generic_layer( + qm31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_generic_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + +__global__ void eval_logup_generic_sum_kernel( + qm31 *eq_evals, + qm31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // todo: specify size of shared memory + __shared__ qm31 shared_eval_0[512]; + __shared__ qm31 shared_eval_2[512]; + + shared_eval_0[threadIdx.x] = {0, 0, 0, 0}; + shared_eval_2[threadIdx.x] = {0, 0, 0, 0}; + __syncthreads(); + + if (tid < n_terms) { + qm31 inp_numer_at_r0i0 = numerators[tid * 2]; + qm31 inp_denom_at_r0i0 = denominators[tid * 2]; + qm31 inp_numer_at_r0i1 = numerators[tid * 2 + 1]; + qm31 inp_denom_at_r0i1 = denominators[tid * 2 + 1]; + qm31 inp_numer_at_r1i0 = numerators[(n_terms + tid) * 2]; + qm31 inp_denom_at_r1i0 = denominators[(n_terms + tid) * 2]; + qm31 inp_numer_at_r1i1 = numerators[(n_terms + tid) * 2 + 1]; + qm31 inp_denom_at_r1i1 = denominators[(n_terms + tid) * 2 + 1]; + + qm31 inp_numer_at_r2i0 = sub(add(inp_numer_at_r1i0, inp_numer_at_r1i0), inp_numer_at_r0i0); + qm31 inp_denom_at_r2i0 = sub(add(inp_denom_at_r1i0, inp_denom_at_r1i0), inp_denom_at_r0i0); + qm31 inp_numer_at_r2i1 = sub(add(inp_numer_at_r1i1, inp_numer_at_r1i1), inp_numer_at_r0i1); + qm31 inp_denom_at_r2i1 = sub(add(inp_denom_at_r1i1, inp_denom_at_r1i1), inp_denom_at_r0i1); + + Fraction fraction_eval_0 = add_fraction(Fraction(inp_numer_at_r0i0, inp_denom_at_r0i0), Fraction(inp_numer_at_r0i1, inp_denom_at_r0i1)); + Fraction fraction_eval_2 = add_fraction(Fraction(inp_numer_at_r2i0, inp_denom_at_r2i0), Fraction(inp_numer_at_r2i1, inp_denom_at_r2i1)); + + shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); + shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); + + // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; + // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; + } + __syncthreads(); + + // Perform intra-block reduction in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_eval_0[threadIdx.x] = add(shared_eval_0[threadIdx.x], shared_eval_0[threadIdx.x + s]); + shared_eval_2[threadIdx.x] = add(shared_eval_2[threadIdx.x], shared_eval_2[threadIdx.x + s]); + } + __syncthreads(); + } + + // Set global memory for each block id, grab the reduced shared thread id w.r.t. each block + if (threadIdx.x == 0) { + eval_at_0[blockIdx.x] = shared_eval_0[threadIdx.x]; + eval_at_2[blockIdx.x] = shared_eval_2[threadIdx.x]; + } + +} + +void eval_logup_generic_sum( + qm31 *eq_evals, + qm31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + const unsigned int BLOCK_SIZE = 512; + const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // Arrays for intra-block reduction + qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_0_temp_d; + qm31 *eval_at_2_temp_d; + + cudaMalloc((void **)&eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS); + cudaMalloc((void **)&eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS); + + eval_logup_generic_sum_kernel<<>>( + eq_evals, + numerators, + denominators, + n_terms, + lambda, + eval_at_0_temp_d, + eval_at_2_temp_d + ); + // printf("ASDA\n\n\n"); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("eval_logup_generic_sum_kernel launch error: %s\n", cudaGetErrorString(err)); + } + + // Synchronize to catch any runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("eval_logup_generic_sum_kernel execution error: %s\n", cudaGetErrorString(err)); + } + + // Post intra-block reduction + cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + + // for (int i = 1; i < NUM_BLOCKS; ++i) { + // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); + // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); + // } + + // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); + // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); + // printf("ASDA\n\n\n"); + + + cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + + free(eval_at_0_temp_h); + free(eval_at_2_temp_h); + cudaFree(eval_at_0_temp_d); + cudaFree(eval_at_2_temp_d); +} + +__global__ void eval_logup_multiplicities_sum_kernel( + qm31 *eq_evals, + m31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // todo: specify size of shared memory + __shared__ qm31 shared_eval_0[512]; + __shared__ qm31 shared_eval_2[512]; + + shared_eval_0[threadIdx.x] = {0, 0, 0, 0}; + shared_eval_2[threadIdx.x] = {0, 0, 0, 0}; + __syncthreads(); + + if (tid < n_terms) { + m31 inp_numer_at_r0i0 = numerators[tid * 2]; + qm31 inp_denom_at_r0i0 = denominators[tid * 2]; + m31 inp_numer_at_r0i1 = numerators[tid * 2 + 1]; + qm31 inp_denom_at_r0i1 = denominators[tid * 2 + 1]; + m31 inp_numer_at_r1i0 = numerators[(n_terms + tid) * 2]; + qm31 inp_denom_at_r1i0 = denominators[(n_terms + tid) * 2]; + m31 inp_numer_at_r1i1 = numerators[(n_terms + tid) * 2 + 1]; + qm31 inp_denom_at_r1i1 = denominators[(n_terms + tid) * 2 + 1]; + + m31 inp_numer_at_r2i0 = sub(add(inp_numer_at_r1i0, inp_numer_at_r1i0), inp_numer_at_r0i0); + qm31 inp_denom_at_r2i0 = sub(add(inp_denom_at_r1i0, inp_denom_at_r1i0), inp_denom_at_r0i0); + m31 inp_numer_at_r2i1 = sub(add(inp_numer_at_r1i1, inp_numer_at_r1i1), inp_numer_at_r0i1); + qm31 inp_denom_at_r2i1 = sub(add(inp_denom_at_r1i1, inp_denom_at_r1i1), inp_denom_at_r0i1); + + Fraction fraction_eval_0 = add_fraction(Fraction(inp_numer_at_r0i0, inp_denom_at_r0i0), Fraction(inp_numer_at_r0i1, inp_denom_at_r0i1)); + Fraction fraction_eval_2 = add_fraction(Fraction(inp_numer_at_r2i0, inp_denom_at_r2i0), Fraction(inp_numer_at_r2i1, inp_denom_at_r2i1)); + + shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); + shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); + + // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; + // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; + } + __syncthreads(); + + // Perform intra-block reduction in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_eval_0[threadIdx.x] = add(shared_eval_0[threadIdx.x], shared_eval_0[threadIdx.x + s]); + shared_eval_2[threadIdx.x] = add(shared_eval_2[threadIdx.x], shared_eval_2[threadIdx.x + s]); + } + __syncthreads(); + } + + // Set global memory for each block id, grab the reduced shared thread id w.r.t. each block + if (threadIdx.x == 0) { + eval_at_0[blockIdx.x] = shared_eval_0[threadIdx.x]; + eval_at_2[blockIdx.x] = shared_eval_2[threadIdx.x]; + } + +} + +void eval_logup_multiplicities_sum( + qm31 *eq_evals, + m31 *numerators, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + const unsigned int BLOCK_SIZE = 512; + const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // Arrays for intra-block reduction + qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_0_temp_d; + qm31 *eval_at_2_temp_d; + + cudaMalloc((void **)&eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS); + cudaMalloc((void **)&eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS); + + eval_logup_multiplicities_sum_kernel<<>>( + eq_evals, + numerators, + denominators, + n_terms, + lambda, + eval_at_0_temp_d, + eval_at_2_temp_d + ); + // printf("ASDA\n\n\n"); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("eval_logup_multiplicities_sum launch error: %s\n", cudaGetErrorString(err)); + } + + // Synchronize to catch any runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("eval_logup_multiplicities_sum execution error: %s\n", cudaGetErrorString(err)); + } + + // Post intra-block reduction + cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + + // for (int i = 1; i < NUM_BLOCKS; ++i) { + // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); + // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); + // } + + // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); + // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); + // printf("ASDA\n\n\n"); + + + cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + + free(eval_at_0_temp_h); + free(eval_at_2_temp_h); + cudaFree(eval_at_0_temp_d); + cudaFree(eval_at_2_temp_d); +} + +__global__ void eval_logup_singles_sum_kernel( + qm31 *eq_evals, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // todo: specify size of shared memory + __shared__ qm31 shared_eval_0[512]; + __shared__ qm31 shared_eval_2[512]; + + shared_eval_0[threadIdx.x] = {0, 0, 0, 0}; + shared_eval_2[threadIdx.x] = {0, 0, 0, 0}; + __syncthreads(); + + if (tid < n_terms) { + qm31 inp_denom_at_r0i0 = denominators[tid * 2]; + qm31 inp_denom_at_r0i1 = denominators[tid * 2 + 1]; + qm31 inp_denom_at_r1i0 = denominators[(n_terms + tid) * 2]; + qm31 inp_denom_at_r1i1 = denominators[(n_terms + tid) * 2 + 1]; + + qm31 inp_denom_at_r2i0 = sub(add(inp_denom_at_r1i0, inp_denom_at_r1i0), inp_denom_at_r0i0); + qm31 inp_denom_at_r2i1 = sub(add(inp_denom_at_r1i1, inp_denom_at_r1i1), inp_denom_at_r0i1); + + Fraction fraction_eval_0 = add_reciprocal(Reciprocal(inp_denom_at_r0i0), Reciprocal(inp_denom_at_r0i1)); + Fraction fraction_eval_2 = add_reciprocal(Reciprocal(inp_denom_at_r2i0), Reciprocal(inp_denom_at_r2i1)); + + shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); + shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); + + // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; + // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; + } + __syncthreads(); + + // Perform intra-block reduction in shared memory + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_eval_0[threadIdx.x] = add(shared_eval_0[threadIdx.x], shared_eval_0[threadIdx.x + s]); + shared_eval_2[threadIdx.x] = add(shared_eval_2[threadIdx.x], shared_eval_2[threadIdx.x + s]); + } + __syncthreads(); + } + + // Set global memory for each block id, grab the reduced shared thread id w.r.t. each block + if (threadIdx.x == 0) { + eval_at_0[blockIdx.x] = shared_eval_0[threadIdx.x]; + eval_at_2[blockIdx.x] = shared_eval_2[threadIdx.x]; + } + +} + +void eval_logup_singles_sum( + qm31 *eq_evals, + qm31 *denominators, + uint32_t n_terms, + qm31 lambda, + qm31 *eval_at_0, + qm31 *eval_at_2 +) { + const unsigned int BLOCK_SIZE = 512; + const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + // Arrays for intra-block reduction + qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); + qm31 *eval_at_0_temp_d; + qm31 *eval_at_2_temp_d; + + cudaMalloc((void **)&eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS); + cudaMalloc((void **)&eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS); + + eval_logup_singles_sum_kernel<<>>( + eq_evals, + denominators, + n_terms, + lambda, + eval_at_0_temp_d, + eval_at_2_temp_d + ); + // printf("ASDA\n\n\n"); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("eval_logup_singles_sum launch error: %s\n", cudaGetErrorString(err)); + } + + // Synchronize to catch any runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("eval_logup_singles_sum execution error: %s\n", cudaGetErrorString(err)); + } + + // Post intra-block reduction + cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); + + // for (int i = 1; i < NUM_BLOCKS; ++i) { + // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); + // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); + // } + + // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); + // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); + // printf("ASDA\n\n\n"); + + + cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + + free(eval_at_0_temp_h); + free(eval_at_2_temp_h); + cudaFree(eval_at_0_temp_d); + cudaFree(eval_at_2_temp_d); +} + +__global__ void next_logup_multiplicities_layer_kernel( + m31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); + Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); + + Fraction res = add_fraction(a, b); + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +__global__ void next_logup_singles_layer_kernel( + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Reciprocal even = Reciprocal(denominators[tid * 2]); + Reciprocal odd = Reciprocal(denominators[tid * 2 + 1]); + Fraction res = add_reciprocal(even, odd); + + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +void next_logup_multiplicities_layer( + m31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_multiplicities_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + +void next_logup_singles_layer( + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_singles_layer_kernel<<>>(denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index da74fc8..16a2c45 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -32,29 +32,18 @@ impl GkrOps for CudaBackend { fn next_layer( layer: &stwo_prover::core::lookups::gkr_prover::Layer, ) -> stwo_prover::core::lookups::gkr_prover::Layer { - - if let Layer::GrandProduct(col) = layer { - next_grand_product_layer(col) - } - else if let Layer::LogUpGeneric {numerators, denominators} = layer { - next_logup_generic_layer::(numerators, denominators) - } - else { - println!("not supposed to access"); - layer.clone() + match layer { + Layer::GrandProduct(col) => next_grand_product_layer(col), + Layer::LogUpGeneric { + numerators, + denominators, + } => next_logup_generic_layer::(numerators, denominators), + Layer::LogUpMultiplicities { + numerators, + denominators, + } => next_logup_multiplicities_layer::(numerators, denominators), + Layer::LogUpSingles { denominators } => next_logup_singles_layer::(denominators), } - // match layer { - // Layer::GrandProduct(col) => next_grand_product_layer(col), - // Layer::LogUpGeneric { - // numerators, - // denominators, - // } => next_logup_generic_layer::(numerators, denominators), - // Layer::LogUpMultiplicities { - // numerators, - // denominators, - // } => next_logup_multiplicities_layer::(numerators, denominators), - // Layer::LogUpSingles { denominators } => next_logup_singles_layer::(denominators), - // } } fn sum_as_poly_in_first_variable( @@ -72,33 +61,20 @@ impl GkrOps for CudaBackend { let y = eq_evals.y(); let lambda = h.lambda; - let (mut eval_at_0, mut eval_at_2) = { - if let Layer::GrandProduct(col) = &h.input_layer { - eval_grand_product_sum(eq_evals, col, n_terms) - } - else if let Layer::LogUpGeneric {numerators, denominators} = &h.input_layer { - eval_logup_generic_sum(eq_evals, numerators, denominators, n_terms, lambda) - } - else { - println!("not supposed to access"); - (SecureField::default(), SecureField::default()) + let (mut eval_at_0, mut eval_at_2) = match &h.input_layer { + Layer::GrandProduct(col) => eval_grand_product_sum(eq_evals, col, n_terms), + Layer::LogUpGeneric { + numerators, + denominators, + } => eval_logup_generic_sum(eq_evals, numerators, denominators, n_terms, lambda), + Layer::LogUpMultiplicities { + numerators, + denominators, + } => eval_logup_multiplicities_sum(eq_evals, numerators, denominators, n_terms, lambda), + Layer::LogUpSingles { denominators } => { + eval_logup_singles_sum(eq_evals, denominators, n_terms, lambda) } }; - - // let (mut eval_at_0, mut eval_at_2) = match &h.input_layer { - // Layer::GrandProduct(col) => eval_grand_product_sum(eq_evals, col, n_terms), - // Layer::LogUpGeneric { - // numerators, - // denominators, - // } => eval_logup_sum(eq_evals, numerators, denominators, n_terms, lambda), - // Layer::LogUpMultiplicities { - // numerators, - // denominators, - // } => eval_logup_sum(eq_evals, numerators, denominators, n_terms, lambda), - // Layer::LogUpSingles { denominators } => { - // eval_logup_singles_sum(eq_evals, denominators, n_terms, lambda) - // } - // }; eval_at_0 *= h.eq_fixed_var_correction; eval_at_2 *= h.eq_fixed_var_correction; @@ -136,27 +112,86 @@ fn eval_logup_generic_sum( let eval_at_0 = SecureFieldVec::new_uninitialized(1); let eval_at_2 = SecureFieldVec::new_uninitialized(1); - println!("n_terms: {}", n_terms); - println!("eq_evals: {:?}", eq_evals); - println!("mle evals: {:?}", &*eq_evals.to_cpu()); - println!("numerators: {:?}", numerators.clone().to_cpu()); - println!("denominators: {:?}", denominators.clone().to_cpu()); + // println!("n_terms: {}", n_terms); + // println!("eq_evals: {:?}", eq_evals); + // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); + // println!("numerators: {:?}", numerators.clone().to_cpu()); + // println!("denominators: {:?}", denominators.clone().to_cpu()); unsafe { bindings::eval_logup_generic_sum( eq_evals.device_ptr as *const CudaSecureField, + numerators.device_ptr as *const CudaSecureField, denominators.device_ptr as *const CudaSecureField, + n_terms, + CudaSecureField::from(lambda), + eval_at_0.device_ptr as *const CudaSecureField, + eval_at_2.device_ptr as *const CudaSecureField); + }; + // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + + (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) +} + +fn eval_logup_multiplicities_sum( + eq_evals: &EqEvals, + numerators: &Mle, + denominators: &Mle, + n_terms: usize, + lambda: SecureField, +) -> (SecureField, SecureField) { + let eval_at_0 = SecureFieldVec::new_uninitialized(1); + let eval_at_2 = SecureFieldVec::new_uninitialized(1); + + // println!("n_terms: {}", n_terms); + // println!("eq_evals: {:?}", eq_evals); + // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); + // println!("numerators: {:?}", numerators.clone().to_cpu()); + // println!("denominators: {:?}", denominators.clone().to_cpu()); + + unsafe { + bindings::eval_logup_multiplicities_sum( + eq_evals.device_ptr as *const CudaSecureField, + numerators.device_ptr as *const CudaBaseField, denominators.device_ptr as *const CudaSecureField, n_terms, CudaSecureField::from(lambda), eval_at_0.device_ptr as *const CudaSecureField, eval_at_2.device_ptr as *const CudaSecureField); }; - println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) } +fn eval_logup_singles_sum( + eq_evals: &EqEvals, + denominators: &Mle, + n_terms: usize, + lambda: SecureField, +) -> (SecureField, SecureField) { + let eval_at_0 = SecureFieldVec::new_uninitialized(1); + let eval_at_2 = SecureFieldVec::new_uninitialized(1); + + // println!("n_terms: {}", n_terms); + // println!("eq_evals: {:?}", eq_evals); + // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); + // println!("numerators: {:?}", numerators.clone().to_cpu()); + // println!("denominators: {:?}", denominators.clone().to_cpu()); + + unsafe { + bindings::eval_logup_singles_sum( + eq_evals.device_ptr as *const CudaSecureField, + denominators.device_ptr as *const CudaSecureField, + n_terms, + CudaSecureField::from(lambda), + eval_at_0.device_ptr as *const CudaSecureField, + eval_at_2.device_ptr as *const CudaSecureField); + }; + // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); + + (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) +} fn next_grand_product_layer(layer: &Mle) -> Layer { let next_layer_size = layer.size / 2; @@ -167,7 +202,8 @@ fn next_grand_product_layer(layer: &Mle) -> Layer( bindings::next_logup_generic_layer( numerators.device_ptr as *const CudaSecureField, denominators.device_ptr as *const CudaSecureField, - numerators.size, + denominators.size, next_numerators.device_ptr as *const CudaSecureField, next_denominators.device_ptr as *const CudaSecureField, - next_layer_len); + next_layer_len + ); }; Layer::LogUpGeneric { @@ -209,10 +246,11 @@ fn next_logup_multiplicities_layer( bindings::next_logup_multiplicities_layer( numerators.device_ptr as *const CudaBaseField, denominators.device_ptr as *const CudaSecureField, - numerators.size, + denominators.size, next_numerators.device_ptr as *const CudaSecureField, next_denominators.device_ptr as *const CudaSecureField, - next_layer_len); + next_layer_len + ); }; Layer::LogUpGeneric { @@ -234,7 +272,8 @@ fn next_logup_singles_layer( denominators.size, next_numerators.device_ptr as *const CudaSecureField, next_denominators.device_ptr as *const CudaSecureField, - next_layer_len); + next_layer_len + ); }; Layer::LogUpGeneric { @@ -246,23 +285,18 @@ fn next_logup_singles_layer( mod tests { use std::iter::zip; use itertools::Itertools; + use num_traits::One; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; + use stwo_prover::core::fields::{ExtensionOf, Field}; use crate::CudaBackend; use stwo_prover::core::backend::{Column, CpuBackend}; use stwo_prover::core::fields::m31::{BaseField, M31}; use stwo_prover::core::fields::qm31::SecureField; - use stwo_prover::core::lookups::gkr_prover::GkrOps; - - use stwo_prover::core::backend::simd::SimdBackend; - // use stwo_prover::core::backend::{Column, CpuBackend}; - // use stwo_prover::core::channel::Channel; - // use stwo_prover::core::fields::m31::BaseField; - // use stwo_prover::core::fields::qm31::SecureField; - use stwo_prover::core::lookups::gkr_prover::{prove_batch, Layer}; + use stwo_prover::core::lookups::gkr_prover::{prove_batch, GkrOps, Layer}; use stwo_prover::core::lookups::gkr_verifier::{partially_verify_batch, Gate, GkrArtifact, GkrError}; - use stwo_prover::core::lookups::mle::Mle; + use stwo_prover::core::lookups::mle::{Mle, MleOps}; use stwo_prover::core::lookups::utils::Fraction; use stwo_prover::core::channel::{Blake2sChannel, Channel}; @@ -306,10 +340,9 @@ mod tests { ); } - #[test] fn logup_with_generic_trace_works() { - const N: usize = 1 << 2; + const N: usize = 1 << 5; let mut rng = SmallRng::seed_from_u64(0); let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); @@ -348,7 +381,91 @@ mod tests { ); } - pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField { + #[test] + fn logup_with_multiplicities_trace_works() { + const N: usize = 1 << 5; + let mut rng = SmallRng::seed_from_u64(0); + let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); + let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); + let sum = zip(&numerator_values, &denominator_values) + .map(|(&n, &d)| Fraction::new(n.into(), d)) + .sum::>(); + let numerators = Mle::::new(numerator_values.clone().into_iter().collect()); + let denominators = Mle::::new(denominator_values.clone().into_iter().collect()); + let numerators_cpu = Mle::::new(numerator_values.into_iter().collect()); + let denominators_cpu = Mle::::new(denominator_values.into_iter().collect()); + + let top_layer = Layer::LogUpMultiplicities { + numerators: numerators.clone(), + denominators: denominators.clone(), + }; + let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![top_layer]); + + let GkrArtifact { + ood_point, + claims_to_verify_by_instance, + n_variables_by_instance: _, + } = partially_verify_batch(vec![Gate::LogUp], &proof, &mut Blake2sChannel::default()).unwrap(); + + assert_eq!(claims_to_verify_by_instance.len(), 1); + assert_eq!(proof.output_claims_by_instance.len(), 1); + assert_eq!( + claims_to_verify_by_instance[0], + [ + eval_at_point(&numerators_cpu.into(), &ood_point), + eval_at_point(&denominators_cpu, &ood_point) + ] + ); + assert_eq!( + proof.output_claims_by_instance[0], + [sum.numerator, sum.denominator] + ); + } + + #[test] + fn logup_with_singles_trace_works() { + const N: usize = 1 << 5; + let mut rng = SmallRng::seed_from_u64(0); + let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); + let sum = denominator_values + .iter() + .map(|&d| Fraction::new(SecureField::one(), d)) + .sum::>(); + let denominators = Mle::::new(denominator_values.clone().into_iter().collect()); + let denominators_cpu = Mle::::new(denominator_values.into_iter().collect()); + + let top_layer = Layer::LogUpSingles { + denominators: denominators.clone(), + }; + let (proof, _) = prove_batch(&mut Blake2sChannel::default(), vec![top_layer]); + + let GkrArtifact { + ood_point, + claims_to_verify_by_instance, + n_variables_by_instance: _, + } = partially_verify_batch(vec![Gate::LogUp], &proof, &mut Blake2sChannel::default()).unwrap(); + + assert_eq!(claims_to_verify_by_instance.len(), 1); + assert_eq!(proof.output_claims_by_instance.len(), 1); + assert_eq!( + claims_to_verify_by_instance[0], + [ + SecureField::one(), + eval_at_point(&denominators_cpu, &ood_point) + ] + ); + assert_eq!( + proof.output_claims_by_instance[0], + [sum.numerator, sum.denominator] + ); + } + + pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField + where + F: Field, + SecureField: ExtensionOf, + B: MleOps, + { pub fn eval(mle_evals: &[SecureField], p: &[SecureField]) -> SecureField { match p { [] => mle_evals[0], From 2db5e805632024625d2f3f0106b7678e9e2ad2c0 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 13:28:52 -0800 Subject: [PATCH 06/11] Fix: Post intra-block reduction --- stwo_gpu_backend/src/lookups/gkr.rs | 123 ++++++++++++---------------- 1 file changed, 52 insertions(+), 71 deletions(-) diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 16a2c45..1c692c6 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -1,15 +1,23 @@ use crate::CudaBackend; +use crate::cuda::{bindings, BaseFieldVec, SecureFieldVec}; +use crate::cuda::bindings::{CudaBaseField, CudaSecureField}; + use num_traits::Zero; + use stwo_prover::core::{ - backend::Column, fields::{m31::BaseField, qm31::SecureField}, lookups::{ + backend::Column, + fields::{ + m31::BaseField, + qm31::SecureField, + }, + lookups::{ gkr_prover::{correct_sum_as_poly_in_first_variable, EqEvals, GkrOps, Layer}, - mle::{Mle, MleOps}, sumcheck::MultivariatePolyOracle, - } -}; -use crate::cuda::{bindings, BaseFieldVec, SecureFieldVec}; -use crate::cuda::bindings::CudaSecureField; + mle::{Mle, MleOps}, + sumcheck::MultivariatePolyOracle, + },}; + -use self::bindings::CudaBaseField; +const SINGLE_EVALUATION_SIZE: usize = 1; impl GkrOps for CudaBackend { fn gen_eq_evals(y: &[SecureField], v: SecureField) -> Mle { @@ -33,16 +41,14 @@ impl GkrOps for CudaBackend { layer: &stwo_prover::core::lookups::gkr_prover::Layer, ) -> stwo_prover::core::lookups::gkr_prover::Layer { match layer { - Layer::GrandProduct(col) => next_grand_product_layer(col), - Layer::LogUpGeneric { - numerators, - denominators, - } => next_logup_generic_layer::(numerators, denominators), - Layer::LogUpMultiplicities { - numerators, - denominators, - } => next_logup_multiplicities_layer::(numerators, denominators), - Layer::LogUpSingles { denominators } => next_logup_singles_layer::(denominators), + Layer::GrandProduct(col) => + next_grand_product_layer(col), + Layer::LogUpGeneric {numerators, denominators} => + next_logup_generic_layer::(numerators, denominators), + Layer::LogUpMultiplicities {numerators,denominators} => + next_logup_multiplicities_layer::(numerators, denominators), + Layer::LogUpSingles { denominators } => + next_logup_singles_layer::(denominators), } } @@ -50,9 +56,6 @@ impl GkrOps for CudaBackend { h: &stwo_prover::core::lookups::gkr_prover::GkrMultivariatePolyOracle<'_, Self>, claim: SecureField, ) -> stwo_prover::core::lookups::utils::UnivariatePoly { - // 1. Create each layer sum check - // 2. Polynomial interpolation check - let n_variables = h.n_variables(); assert!(!n_variables.is_zero()); let n_terms = 1 << (n_variables - 1); @@ -62,20 +65,17 @@ impl GkrOps for CudaBackend { let lambda = h.lambda; let (mut eval_at_0, mut eval_at_2) = match &h.input_layer { - Layer::GrandProduct(col) => eval_grand_product_sum(eq_evals, col, n_terms), - Layer::LogUpGeneric { - numerators, - denominators, - } => eval_logup_generic_sum(eq_evals, numerators, denominators, n_terms, lambda), - Layer::LogUpMultiplicities { - numerators, - denominators, - } => eval_logup_multiplicities_sum(eq_evals, numerators, denominators, n_terms, lambda), - Layer::LogUpSingles { denominators } => { + Layer::GrandProduct(col) => + eval_grand_product_sum(eq_evals, col, n_terms), + Layer::LogUpGeneric {numerators, denominators} => + eval_logup_generic_sum(eq_evals, numerators, denominators, n_terms, lambda), + Layer::LogUpMultiplicities {numerators, denominators} => + eval_logup_multiplicities_sum(eq_evals, numerators, denominators, n_terms, lambda), + Layer::LogUpSingles { denominators } => eval_logup_singles_sum(eq_evals, denominators, n_terms, lambda) - } }; + // TODO(Daniel): Create polynomial interpolation kernel eval_at_0 *= h.eq_fixed_var_correction; eval_at_2 *= h.eq_fixed_var_correction; correct_sum_as_poly_in_first_variable(eval_at_0, eval_at_2, claim, y, n_variables) @@ -87,8 +87,8 @@ fn eval_grand_product_sum( input_layer: &Mle, n_terms: usize, ) -> (SecureField, SecureField) { - let eval_at_0 = SecureFieldVec::new_uninitialized(1); - let eval_at_2 = SecureFieldVec::new_uninitialized(1); + let eval_at_0 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); + let eval_at_2 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); unsafe { bindings::eval_grand_product_sum( @@ -109,14 +109,8 @@ fn eval_logup_generic_sum( n_terms: usize, lambda: SecureField, ) -> (SecureField, SecureField) { - let eval_at_0 = SecureFieldVec::new_uninitialized(1); - let eval_at_2 = SecureFieldVec::new_uninitialized(1); - - // println!("n_terms: {}", n_terms); - // println!("eq_evals: {:?}", eq_evals); - // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); - // println!("numerators: {:?}", numerators.clone().to_cpu()); - // println!("denominators: {:?}", denominators.clone().to_cpu()); + let eval_at_0 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); + let eval_at_2 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); unsafe { bindings::eval_logup_generic_sum( @@ -128,7 +122,6 @@ fn eval_logup_generic_sum( eval_at_0.device_ptr as *const CudaSecureField, eval_at_2.device_ptr as *const CudaSecureField); }; - // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) } @@ -140,14 +133,8 @@ fn eval_logup_multiplicities_sum( n_terms: usize, lambda: SecureField, ) -> (SecureField, SecureField) { - let eval_at_0 = SecureFieldVec::new_uninitialized(1); - let eval_at_2 = SecureFieldVec::new_uninitialized(1); - - // println!("n_terms: {}", n_terms); - // println!("eq_evals: {:?}", eq_evals); - // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); - // println!("numerators: {:?}", numerators.clone().to_cpu()); - // println!("denominators: {:?}", denominators.clone().to_cpu()); + let eval_at_0 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); + let eval_at_2 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); unsafe { bindings::eval_logup_multiplicities_sum( @@ -159,7 +146,6 @@ fn eval_logup_multiplicities_sum( eval_at_0.device_ptr as *const CudaSecureField, eval_at_2.device_ptr as *const CudaSecureField); }; - // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) } @@ -170,14 +156,8 @@ fn eval_logup_singles_sum( n_terms: usize, lambda: SecureField, ) -> (SecureField, SecureField) { - let eval_at_0 = SecureFieldVec::new_uninitialized(1); - let eval_at_2 = SecureFieldVec::new_uninitialized(1); - - // println!("n_terms: {}", n_terms); - // println!("eq_evals: {:?}", eq_evals); - // println!("mle evals: {:?}", &*eq_evals.clone().to_cpu()); - // println!("numerators: {:?}", numerators.clone().to_cpu()); - // println!("denominators: {:?}", denominators.clone().to_cpu()); + let eval_at_0 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); + let eval_at_2 = SecureFieldVec::new_uninitialized(SINGLE_EVALUATION_SIZE); unsafe { bindings::eval_logup_singles_sum( @@ -188,7 +168,6 @@ fn eval_logup_singles_sum( eval_at_0.device_ptr as *const CudaSecureField, eval_at_2.device_ptr as *const CudaSecureField); }; - // println!("here: {:?}, {:?}\n", eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()); (eval_at_0.to_cpu()[0].clone(), eval_at_2.to_cpu()[0].clone()) } @@ -284,21 +263,22 @@ fn next_logup_singles_layer( mod tests { use std::iter::zip; + use itertools::Itertools; use num_traits::One; - use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; - use stwo_prover::core::fields::{ExtensionOf, Field}; - use crate::CudaBackend; + use rand::rngs::SmallRng; + use stwo_prover::core::backend::{Column, CpuBackend}; - use stwo_prover::core::fields::m31::{BaseField, M31}; - use stwo_prover::core::fields::qm31::SecureField; - + use stwo_prover::core::channel::{Blake2sChannel, Channel}; + use stwo_prover::core::fields::{ExtensionOf, Field}; + use stwo_prover::core::fields::{m31::{BaseField, M31}, qm31::SecureField}; use stwo_prover::core::lookups::gkr_prover::{prove_batch, GkrOps, Layer}; use stwo_prover::core::lookups::gkr_verifier::{partially_verify_batch, Gate, GkrArtifact, GkrError}; use stwo_prover::core::lookups::mle::{Mle, MleOps}; use stwo_prover::core::lookups::utils::Fraction; - use stwo_prover::core::channel::{Blake2sChannel, Channel}; + + use crate::CudaBackend; #[test] fn gen_eq_evals_matches_cpu() { @@ -317,7 +297,7 @@ mod tests { #[test] fn grand_product_works() { - const N: usize = 1 << 5; + const N: usize = 1 << 12; let values = Blake2sChannel::default().draw_felts(N); let product = values.iter().product(); @@ -342,7 +322,7 @@ mod tests { #[test] fn logup_with_generic_trace_works() { - const N: usize = 1 << 5; + const N: usize = 1 << 12; let mut rng = SmallRng::seed_from_u64(0); let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); @@ -383,7 +363,7 @@ mod tests { #[test] fn logup_with_multiplicities_trace_works() { - const N: usize = 1 << 5; + const N: usize = 1 << 12; let mut rng = SmallRng::seed_from_u64(0); let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); @@ -424,7 +404,7 @@ mod tests { #[test] fn logup_with_singles_trace_works() { - const N: usize = 1 << 5; + const N: usize = 1 << 12; let mut rng = SmallRng::seed_from_u64(0); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); let sum = denominator_values @@ -460,6 +440,7 @@ mod tests { ); } + // CPU evaluation helper function pub(crate) fn eval_at_point(input: &Mle, point: &[SecureField]) -> SecureField where F: Field, From e5ba2e1162d1b9cb5b96a0b5b45d5de61962d35b Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 14:24:38 -0800 Subject: [PATCH 07/11] Clean --- stwo_gpu_backend/src/lookups/gkr.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stwo_gpu_backend/src/lookups/gkr.rs b/stwo_gpu_backend/src/lookups/gkr.rs index 1c692c6..5a56d17 100644 --- a/stwo_gpu_backend/src/lookups/gkr.rs +++ b/stwo_gpu_backend/src/lookups/gkr.rs @@ -297,7 +297,7 @@ mod tests { #[test] fn grand_product_works() { - const N: usize = 1 << 12; + const N: usize = 1 << 18; let values = Blake2sChannel::default().draw_felts(N); let product = values.iter().product(); @@ -322,7 +322,7 @@ mod tests { #[test] fn logup_with_generic_trace_works() { - const N: usize = 1 << 12; + const N: usize = 1 << 18; let mut rng = SmallRng::seed_from_u64(0); let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); @@ -363,7 +363,7 @@ mod tests { #[test] fn logup_with_multiplicities_trace_works() { - const N: usize = 1 << 12; + const N: usize = 1 << 18; let mut rng = SmallRng::seed_from_u64(0); let numerator_values = (0..N).map(|_| rng.gen()).collect::>(); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); @@ -404,7 +404,7 @@ mod tests { #[test] fn logup_with_singles_trace_works() { - const N: usize = 1 << 12; + const N: usize = 1 << 18; let mut rng = SmallRng::seed_from_u64(0); let denominator_values = (0..N).map(|_| rng.gen()).collect::>(); let sum = denominator_values From ea556f7424560c92a53b1b00e42f5af41dc72fdf Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 14:30:29 -0800 Subject: [PATCH 08/11] Revert mle testing --- stwo_gpu_backend/src/lookups/mle.rs | 31 +---------------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/stwo_gpu_backend/src/lookups/mle.rs b/stwo_gpu_backend/src/lookups/mle.rs index 42f129f..86453a6 100644 --- a/stwo_gpu_backend/src/lookups/mle.rs +++ b/stwo_gpu_backend/src/lookups/mle.rs @@ -49,7 +49,6 @@ impl MleOps for CudaBackend { result_evals.device_ptr, ) } - Mle::new(result_evals) } } @@ -78,7 +77,7 @@ mod tests { let mle_cpu = Mle::::new(values); let random_assignment = SecureField::from_u32_unchecked(7, 12, 3, 2); let mle_fixed_cpu = MleOps::::fix_first_variable(mle_cpu, random_assignment); - + let mle_fixed_simd = MleOps::::fix_first_variable(mle_cuda, random_assignment); assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) @@ -104,32 +103,4 @@ mod tests { assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) } - - - #[test] - fn fix_first_variable_with_secure_field_mle_matches_cpu_test() { - const N_VARIABLES: u32 = 8; - - let values = (0..(1 << N_VARIABLES) * 4) - .collect_vec() - .chunks(4) - .map(|a| SecureField::from_u32_unchecked(a[0], a[1], a[2], a[3])) - .collect_vec(); - let values = vec![ - SecureField::from_u32_unchecked(582331529, 613033660, 368449590, 1538560393), - SecureField::from_u32_unchecked(251659788, 78954062, 1801023691, 998786282), - SecureField::from_u32_unchecked(1184369939, 1548123522, 496550933, 2088405710), - SecureField::from_u32_unchecked(1158835792, 413758273, 961347360, 1684210072), - ]; - - let mle_cuda = - Mle::::new(SecureFieldVec::from_vec(values.clone())); - let mle_cpu = Mle::::new(values); - let random_assignment = SecureField::from_u32_unchecked(7, 12, 3, 2); - let mle_fixed_cpu = MleOps::::fix_first_variable(mle_cpu, random_assignment); - - let mle_fixed_simd = MleOps::::fix_first_variable(mle_cuda, random_assignment); - - assert_eq!(mle_fixed_simd.into_evals().to_cpu(), *mle_fixed_cpu) - } } From e0bf87bc0d1a8cbf6d77603e7752a5780ce6ecb8 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 14:31:00 -0800 Subject: [PATCH 09/11] Revert mle testing --- stwo_gpu_backend/src/lookups/mle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stwo_gpu_backend/src/lookups/mle.rs b/stwo_gpu_backend/src/lookups/mle.rs index 86453a6..8b2ea1a 100644 --- a/stwo_gpu_backend/src/lookups/mle.rs +++ b/stwo_gpu_backend/src/lookups/mle.rs @@ -16,7 +16,7 @@ impl MleOps for CudaBackend { ) -> Mle where Self: MleOps, - { + { let evals_size = mle.len(); let result_evals = SecureFieldVec::new_uninitialized(evals_size >> 1); unsafe { From 4b4526318bc66276908c2daf879b4b46efceecf3 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 14:31:16 -0800 Subject: [PATCH 10/11] Clean --- stwo_gpu_backend/src/examples/wide_fibonacci.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stwo_gpu_backend/src/examples/wide_fibonacci.rs b/stwo_gpu_backend/src/examples/wide_fibonacci.rs index b718a01..3941eac 100644 --- a/stwo_gpu_backend/src/examples/wide_fibonacci.rs +++ b/stwo_gpu_backend/src/examples/wide_fibonacci.rs @@ -200,7 +200,7 @@ mod test { } } - //#[test_log::test] + #[test_log::test] fn test_cuda_constraints_for_wide_fib_prove() { // Note: To see time measurement, run test with // RUST_LOG_SPAN_EVENTS=enter,close RUST_LOG=info RUST_BACKTRACE=1 From 1e9287989ec90bf86093e422a047a9a79a5c5dc4 Mon Sep 17 00:00:00 2001 From: danielntmd Date: Tue, 5 Nov 2024 14:32:49 -0800 Subject: [PATCH 11/11] Added: cuda files to commit --- cuda/src/gkr.cu | 427 ++++++++++++++++++++++++++---------------------- 1 file changed, 235 insertions(+), 192 deletions(-) diff --git a/cuda/src/gkr.cu b/cuda/src/gkr.cu index 17d95ee..88b98a1 100644 --- a/cuda/src/gkr.cu +++ b/cuda/src/gkr.cu @@ -55,6 +55,30 @@ __device__ Fraction add_reciprocal(Reciprocal lhs, Reciprocal return Fraction(add(lhs.x, rhs.x), mul(lhs.x, rhs.x)); } +// Function performs a tree-style reduction for efficient value aggregation +// Size of output is the number of blocks +__global__ void reduction_kernel(qm31 *input, uint32_t input_size, qm31 *output) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + + __shared__ qm31 shared_eval[1024]; + if (tid < input_size) { + shared_eval[threadIdx.x] = input[tid]; + } + else { + shared_eval[threadIdx.x] = {0, 0, 0, 0}; + } + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + shared_eval[threadIdx.x] = add(shared_eval[threadIdx.x], shared_eval[threadIdx.x + s]); + } + __syncthreads(); + } + + if (threadIdx.x == 0) output[blockIdx.x] = shared_eval[0]; +} + __global__ void next_grand_product_layer_kernel(qm31 *layer, uint32_t layer_size, qm31 *next_layer, uint32_t next_layer_size) { unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < next_layer_size) { @@ -69,7 +93,106 @@ void next_grand_product_layer(qm31 *layer, uint32_t layer_size, qm31 *next_layer cudaDeviceSynchronize(); } -// optimize(daniel): use uint4 built in? +__global__ void next_logup_generic_layer_kernel( + qm31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); + Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); + + Fraction res = add_fraction(a, b); + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +void next_logup_generic_layer( + qm31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_generic_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + +__global__ void next_logup_multiplicities_layer_kernel( + m31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); + Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); + + Fraction res = add_fraction(a, b); + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +void next_logup_multiplicities_layer( + m31 *numerators, + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_multiplicities_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + +__global__ void next_logup_singles_layer_kernel( + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < next_size) { + Reciprocal even = Reciprocal(denominators[tid * 2]); + Reciprocal odd = Reciprocal(denominators[tid * 2 + 1]); + Fraction res = add_reciprocal(even, odd); + + next_numerators[tid] = res.numerator; + next_denominators[tid] = res.denominator; + } +} + +void next_logup_singles_layer( + qm31 *denominators, + uint32_t size, + qm31 *next_numerators, + qm31 *next_denominators, + uint32_t next_size +) { + const unsigned int BLOCK_SIZE = 1024; + const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + + next_logup_singles_layer_kernel<<>>(denominators, size, next_numerators, next_denominators, next_size); + cudaDeviceSynchronize(); +} + __global__ void eval_grand_product_sum_kernel( qm31 *eq_evals, qm31 *input_layer, @@ -131,8 +254,6 @@ void eval_grand_product_sum( const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; // Arrays for intra-block reduction - qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); - qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); qm31 *eval_at_0_temp_d; qm31 *eval_at_2_temp_d; @@ -147,59 +268,46 @@ void eval_grand_product_sum( eval_at_0_temp_d, eval_at_2_temp_d ); - cudaDeviceSynchronize(); - // Post intra-block reduction - // todo(daniel): move to kernel - cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - - for (int i = 1; i < NUM_BLOCKS; ++i) { - eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); - eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "eval_grand_product_sum_kernel launch error: %s\n", cudaGetErrorString(err)); } - cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); - cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); - - free(eval_at_0_temp_h); - free(eval_at_2_temp_h); - cudaFree(eval_at_0_temp_d); - cudaFree(eval_at_2_temp_d); -} - -__global__ void next_logup_generic_layer_kernel( - qm31 *numerators, - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < next_size) { - Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); - Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); + // Synchronize to catch any runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "eval_grand_product_sum_kernel execution error: %s\n", cudaGetErrorString(err)); + } - Fraction res = add_fraction(a, b); - next_numerators[tid] = res.numerator; - next_denominators[tid] = res.denominator; + // Post intra-block reduction + const int BLOCK_REDUCTION_SIZE = 1024; + unsigned int input_size = NUM_BLOCKS; + while (input_size > 1) { + unsigned int num_blocks = (input_size + BLOCK_REDUCTION_SIZE - 1) / BLOCK_REDUCTION_SIZE; + + // eval 0 + qm31 *reduction_output_0; + cudaMalloc((void **)&reduction_output_0, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_0_temp_d, input_size, reduction_output_0); + cudaFree(eval_at_0_temp_d); + eval_at_0_temp_d = reduction_output_0; + + // eval 2 + qm31 *reduction_output_2; + cudaMalloc((void **)&reduction_output_2, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_2_temp_d, input_size, reduction_output_2); + cudaFree(eval_at_2_temp_d); + eval_at_2_temp_d = reduction_output_2; + + input_size = num_blocks; } -} -void next_logup_generic_layer( - qm31 *numerators, - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - const unsigned int BLOCK_SIZE = 1024; - const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; + cudaMemcpy(eval_at_0, eval_at_0_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); - next_logup_generic_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); - cudaDeviceSynchronize(); + cudaFree(eval_at_0_temp_d); + cudaFree(eval_at_2_temp_d); } __global__ void eval_logup_generic_sum_kernel( @@ -241,9 +349,6 @@ __global__ void eval_logup_generic_sum_kernel( shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); - - // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; - // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; } __syncthreads(); @@ -277,8 +382,6 @@ void eval_logup_generic_sum( const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; // Arrays for intra-block reduction - qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); - qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); qm31 *eval_at_0_temp_d; qm31 *eval_at_2_temp_d; @@ -294,38 +397,44 @@ void eval_logup_generic_sum( eval_at_0_temp_d, eval_at_2_temp_d ); - // printf("ASDA\n\n\n"); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - printf("eval_logup_generic_sum_kernel launch error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_generic_sum_kernel launch error: %s\n", cudaGetErrorString(err)); } // Synchronize to catch any runtime errors err = cudaDeviceSynchronize(); if (err != cudaSuccess) { - printf("eval_logup_generic_sum_kernel execution error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_generic_sum_kernel execution error: %s\n", cudaGetErrorString(err)); } - // Post intra-block reduction - cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - - // for (int i = 1; i < NUM_BLOCKS; ++i) { - // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); - // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); - // } - - // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); - // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); - // printf("ASDA\n\n\n"); + // Post intra-block reduction + const int BLOCK_REDUCTION_SIZE = 1024; + unsigned int input_size = NUM_BLOCKS; + while (input_size > 1) { + unsigned int num_blocks = (input_size + BLOCK_REDUCTION_SIZE - 1) / BLOCK_REDUCTION_SIZE; + + // eval 0 + qm31 *reduction_output_0; + cudaMalloc((void **)&reduction_output_0, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_0_temp_d, input_size, reduction_output_0); + cudaFree(eval_at_0_temp_d); + eval_at_0_temp_d = reduction_output_0; + + // eval 2 + qm31 *reduction_output_2; + cudaMalloc((void **)&reduction_output_2, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_2_temp_d, input_size, reduction_output_2); + cudaFree(eval_at_2_temp_d); + eval_at_2_temp_d = reduction_output_2; + + input_size = num_blocks; + } - - cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); - cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + cudaMemcpy(eval_at_0, eval_at_0_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); - free(eval_at_0_temp_h); - free(eval_at_2_temp_h); cudaFree(eval_at_0_temp_d); cudaFree(eval_at_2_temp_d); } @@ -369,9 +478,6 @@ __global__ void eval_logup_multiplicities_sum_kernel( shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); - - // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; - // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; } __syncthreads(); @@ -405,8 +511,6 @@ void eval_logup_multiplicities_sum( const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; // Arrays for intra-block reduction - qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); - qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); qm31 *eval_at_0_temp_d; qm31 *eval_at_2_temp_d; @@ -422,38 +526,43 @@ void eval_logup_multiplicities_sum( eval_at_0_temp_d, eval_at_2_temp_d ); - // printf("ASDA\n\n\n"); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - printf("eval_logup_multiplicities_sum launch error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_multiplicities_sum_kernel launch error: %s\n", cudaGetErrorString(err)); } // Synchronize to catch any runtime errors err = cudaDeviceSynchronize(); if (err != cudaSuccess) { - printf("eval_logup_multiplicities_sum execution error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_multiplicities_sum_kernel execution error: %s\n", cudaGetErrorString(err)); } - // Post intra-block reduction - cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - - // for (int i = 1; i < NUM_BLOCKS; ++i) { - // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); - // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); - // } - - // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); - // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); - // printf("ASDA\n\n\n"); - - - cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); - cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + // Post intra-block reduction + const int BLOCK_REDUCTION_SIZE = 1024; + unsigned int input_size = NUM_BLOCKS; + while (input_size > 1) { + unsigned int num_blocks = (input_size + BLOCK_REDUCTION_SIZE - 1) / BLOCK_REDUCTION_SIZE; + + // eval 0 + qm31 *reduction_output_0; + cudaMalloc((void **)&reduction_output_0, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_0_temp_d, input_size, reduction_output_0); + cudaFree(eval_at_0_temp_d); + eval_at_0_temp_d = reduction_output_0; + + // eval 2 + qm31 *reduction_output_2; + cudaMalloc((void **)&reduction_output_2, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_2_temp_d, input_size, reduction_output_2); + cudaFree(eval_at_2_temp_d); + eval_at_2_temp_d = reduction_output_2; + + input_size = num_blocks; + } - free(eval_at_0_temp_h); - free(eval_at_2_temp_h); + cudaMemcpy(eval_at_0, eval_at_0_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); cudaFree(eval_at_0_temp_d); cudaFree(eval_at_2_temp_d); } @@ -490,9 +599,6 @@ __global__ void eval_logup_singles_sum_kernel( shared_eval_0[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_0.numerator, mul(lambda, fraction_eval_0.denominator))); shared_eval_2[threadIdx.x] = mul(eq_evals[tid], add(fraction_eval_2.numerator, mul(lambda, fraction_eval_2.denominator))); - - // shared_eval_0[threadIdx.x] = inp_numer_at_r0i1; - // shared_eval_2[threadIdx.x] = inp_denom_at_r0i1; } __syncthreads(); @@ -525,8 +631,6 @@ void eval_logup_singles_sum( const unsigned int NUM_BLOCKS = (n_terms + BLOCK_SIZE - 1) / BLOCK_SIZE; // Arrays for intra-block reduction - qm31 *eval_at_0_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); - qm31 *eval_at_2_temp_h = (qm31 *)malloc(sizeof(qm31) * NUM_BLOCKS); qm31 *eval_at_0_temp_d; qm31 *eval_at_2_temp_d; @@ -541,105 +645,44 @@ void eval_logup_singles_sum( eval_at_0_temp_d, eval_at_2_temp_d ); - // printf("ASDA\n\n\n"); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - printf("eval_logup_singles_sum launch error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_singles_sum_kernel launch error: %s\n", cudaGetErrorString(err)); } // Synchronize to catch any runtime errors err = cudaDeviceSynchronize(); if (err != cudaSuccess) { - printf("eval_logup_singles_sum execution error: %s\n", cudaGetErrorString(err)); + fprintf(stderr, "eval_logup_singles_sum_kernel execution error: %s\n", cudaGetErrorString(err)); } - // Post intra-block reduction - cudaMemcpy(eval_at_0_temp_h, eval_at_0_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - cudaMemcpy(eval_at_2_temp_h, eval_at_2_temp_d, sizeof(qm31) * NUM_BLOCKS, cudaMemcpyDeviceToHost); - - // for (int i = 1; i < NUM_BLOCKS; ++i) { - // eval_at_0_temp_h[0] = add(eval_at_0_temp_h[0], eval_at_0_temp_h[i]); - // eval_at_2_temp_h[0] = add(eval_at_2_temp_h[0], eval_at_2_temp_h[i]); - // } - - // printf("num: %u, %u, %u, %u", eval_at_0_temp_h[0].a.a, eval_at_0_temp_h[0].a.b, eval_at_0_temp_h[0].b.a, eval_at_0_temp_h[0].b.b); - // printf("den: %u, %u, %u, %u", eval_at_2_temp_h[0].a.a, eval_at_2_temp_h[0].a.b, eval_at_2_temp_h[0].b.a, eval_at_2_temp_h[0].b.b); - // printf("ASDA\n\n\n"); - - - cudaMemcpy(eval_at_0, eval_at_0_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); - cudaMemcpy(eval_at_2, eval_at_2_temp_h, sizeof(qm31), cudaMemcpyHostToDevice); + // Post intra-block reduction + const int BLOCK_REDUCTION_SIZE = 1024; + unsigned int input_size = NUM_BLOCKS; + while (input_size > 1) { + unsigned int num_blocks = (input_size + BLOCK_REDUCTION_SIZE - 1) / BLOCK_REDUCTION_SIZE; + + // eval 0 + qm31 *reduction_output_0; + cudaMalloc((void **)&reduction_output_0, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_0_temp_d, input_size, reduction_output_0); + cudaFree(eval_at_0_temp_d); + eval_at_0_temp_d = reduction_output_0; + + // eval 2 + qm31 *reduction_output_2; + cudaMalloc((void **)&reduction_output_2, sizeof(qm31) * num_blocks); + reduction_kernel<<>>(eval_at_2_temp_d, input_size, reduction_output_2); + cudaFree(eval_at_2_temp_d); + eval_at_2_temp_d = reduction_output_2; + + input_size = num_blocks; + } - free(eval_at_0_temp_h); - free(eval_at_2_temp_h); + cudaMemcpy(eval_at_0, eval_at_0_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); + cudaMemcpy(eval_at_2, eval_at_2_temp_d, sizeof(qm31), cudaMemcpyDeviceToDevice); cudaFree(eval_at_0_temp_d); cudaFree(eval_at_2_temp_d); } -__global__ void next_logup_multiplicities_layer_kernel( - m31 *numerators, - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < next_size) { - Fraction a = Fraction(numerators[tid * 2], denominators[tid * 2]); - Fraction b = Fraction(numerators[tid * 2 + 1], denominators[tid * 2 + 1]); - - Fraction res = add_fraction(a, b); - next_numerators[tid] = res.numerator; - next_denominators[tid] = res.denominator; - } -} - -__global__ void next_logup_singles_layer_kernel( - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < next_size) { - Reciprocal even = Reciprocal(denominators[tid * 2]); - Reciprocal odd = Reciprocal(denominators[tid * 2 + 1]); - Fraction res = add_reciprocal(even, odd); - - next_numerators[tid] = res.numerator; - next_denominators[tid] = res.denominator; - } -} - -void next_logup_multiplicities_layer( - m31 *numerators, - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - const unsigned int BLOCK_SIZE = 1024; - const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - - next_logup_multiplicities_layer_kernel<<>>(numerators, denominators, size, next_numerators, next_denominators, next_size); - cudaDeviceSynchronize(); -} - -void next_logup_singles_layer( - qm31 *denominators, - uint32_t size, - qm31 *next_numerators, - qm31 *next_denominators, - uint32_t next_size -) { - const unsigned int BLOCK_SIZE = 1024; - const unsigned int NUM_BLOCKS = (next_size + BLOCK_SIZE - 1) / BLOCK_SIZE; - - next_logup_singles_layer_kernel<<>>(denominators, size, next_numerators, next_denominators, next_size); - cudaDeviceSynchronize(); -} -