diff --git a/cuda/include/poly/eval_at_point.cuh b/cuda/include/poly/eval_at_point.cuh index bf5a202..8a0b9ce 100644 --- a/cuda/include/poly/eval_at_point.cuh +++ b/cuda/include/poly/eval_at_point.cuh @@ -2,8 +2,15 @@ #define POLY_EVAL_AT_POINT_H #include "../fields.cuh" +#include "../utils.cuh" extern "C" qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y); +extern "C" +void evaluate_polynomials_out_of_domain( + qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials, + qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes +); + #endif // POLY_EVAL_AT_POINT_H \ No newline at end of file diff --git a/cuda/src/poly/eval_at_point.cu b/cuda/src/poly/eval_at_point.cu index 31faf15..96bbab6 100644 --- a/cuda/src/poly/eval_at_point.cu +++ b/cuda/src/poly/eval_at_point.cu @@ -48,7 +48,6 @@ __global__ void eval_at_point_first_pass(m31 *g_coeffs, qm31 *temp, qm31 *factor } factor_idx -= 1; level_size >>= 1; - } if (idx == 0) { @@ -154,3 +153,211 @@ qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y) { return result; } +/* Many polynomials */ + + +__global__ void eval_many_at_point_first_pass(m31 **g_coeffs, qm31 *temp, qm31 *factors, int coeffs_size, int factors_size, + int output_offset) { + int idx = threadIdx.x; + + qm31 *output = &temp[output_offset]; + + int coeffs_per_block = 2 * blockDim.x; + int blocks_in_poly = max(1, coeffs_size / coeffs_per_block); + // Thread syncing happens within a block. + // Split the problem to feed them to multiple blocks. + if (coeffs_size >= coeffs_per_block) { + coeffs_size = coeffs_per_block; + } + + extern __shared__ m31 s_coeffs[]; + extern __shared__ qm31 s_level[]; + + int poly_index = blockIdx.x / blocks_in_poly; + + // A % X == A & (X-1) when X is a power of two + s_coeffs[idx] = g_coeffs[poly_index][(blockIdx.x & (blocks_in_poly - 1)) * coeffs_size + idx]; + s_coeffs[idx + blockDim.x] = g_coeffs[poly_index][(blockIdx.x & (blocks_in_poly - 1)) * coeffs_size + idx + blockDim.x]; + __syncthreads(); + + int level_size = coeffs_size >> 1; + int factor_idx = factors_size - 1; + + if (idx < level_size) { + m31 alpha = s_coeffs[2 * idx]; + m31 beta = s_coeffs[2 * idx + 1]; + qm31 factor = factors[factor_idx]; + + qm31 result = { + {add(mul(beta, factor.a.a), alpha), mul(factor.a.b, beta)}, + {mul(beta, factor.b.a), mul(beta, factor.b.b)} + }; + s_level[idx] = result; + } + factor_idx -= 1; + level_size >>= 1; + + while (level_size > 0) { + if (idx < level_size) { + __syncthreads(); + qm31 a = s_level[2 * idx]; + qm31 b = s_level[2 * idx + 1]; + __syncthreads(); + s_level[idx] = add(a, mul(b, factors[factor_idx])); + } + factor_idx -= 1; + level_size >>= 1; + } + + if (idx == 0) { + output[blockIdx.x] = s_level[0]; + } +} + +__global__ +void eval_many_at_point_second_pass(qm31 *temp, qm31 *factors, int level_size, int factor_offset, int level_offset, + int output_offset, int results_per_block) { + int idx = threadIdx.x; + + qm31 *level = &temp[level_offset]; + qm31 *output = &temp[output_offset]; + + // Thread syncing happens within a block. + // Split the problem to feed them to multiple blocks. + if (level_size >= 2 * blockDim.x) { + level_size = 2 * blockDim.x; + } + + extern __shared__ qm31 s_level[]; + + s_level[idx] = level[2 * blockIdx.x * blockDim.x + idx]; + s_level[idx + blockDim.x] = level[2 * blockIdx.x * blockDim.x + idx + blockDim.x]; + + level_size >>= 1; + + int factor_idx = factor_offset; + + while (level_size >= results_per_block) { + if (idx < level_size) { + __syncthreads(); + qm31 a = s_level[2 * idx]; + qm31 b = s_level[2 * idx + 1]; + __syncthreads(); + s_level[idx] = add(a, mul(b, factors[factor_idx])); + } + factor_idx -= 1; + level_size >>= 1; + } + + if (idx < results_per_block) { + output[blockIdx.x * results_per_block + idx] = s_level[idx]; + } +} + +__global__ +void copy_result_for_polynomial(qm31 **result, qm31 *temp, int number_of_polynomials) { + int global_thread_index = blockIdx.x * blockDim.x + threadIdx.x; + + if (global_thread_index < number_of_polynomials) { + result[global_thread_index][0] = temp[global_thread_index]; + } +} + +void eval_polys_at_point( + qm31 **result, m31 **polynomials, int log_number_of_polynomials, int log_coeffs_size, qm31 point_x, qm31 point_y +) { + int coeffs_size = 1 << log_coeffs_size; + int block_dim = min(256, coeffs_size); + int coeffs_per_block = block_dim * 2; + + qm31 *host_mappings = (qm31 *) malloc(sizeof(qm31) * log_coeffs_size); + host_mappings[log_coeffs_size - 1] = point_y; + host_mappings[log_coeffs_size - 2] = point_x; + qm31 x = point_x; + for (int i = 2; i < log_coeffs_size; i += 1) { + x = sub(mul(qm31{cm31{2, 0}, cm31{0, 0}}, mul(x, x)), qm31{cm31{1, 0}, cm31{0, 0}}); + host_mappings[log_coeffs_size - 1 - i] = x; + } + + int number_of_polynomials = 1 << log_number_of_polynomials; + int total_number_of_coeffs = coeffs_size * number_of_polynomials; + int temp_memory_size = 0; + int size = total_number_of_coeffs; + while (size > number_of_polynomials) { + size = (size + coeffs_per_block - 1) / coeffs_per_block; + temp_memory_size += size; + } + + temp_memory_size = max(temp_memory_size, number_of_polynomials); + + qm31 *temp = cuda_malloc(temp_memory_size); + qm31 *device_mappings = clone_to_device(host_mappings, log_coeffs_size); + + free(host_mappings); + + // First pass + int num_blocks = max(number_of_polynomials, ((total_number_of_coeffs >> 1) + block_dim - 1) / block_dim); + int shared_memory_bytes = coeffs_per_block * 4 + coeffs_per_block * 8; + int output_offset = temp_memory_size - num_blocks; + + eval_many_at_point_first_pass<<>>(polynomials, temp, device_mappings, coeffs_size, + log_coeffs_size, output_offset); + + // Second pass + int mappings_offset = log_coeffs_size - 1; + int level_offset = output_offset; + while (num_blocks > number_of_polynomials) { + mappings_offset -= 9; + int new_num_blocks = ((num_blocks >> 1) + block_dim - 1) / block_dim; + int number_of_results = max(number_of_polynomials, new_num_blocks); + int results_per_block = number_of_results / new_num_blocks; + shared_memory_bytes = coeffs_per_block * 4 * 4; + output_offset = level_offset - new_num_blocks; + eval_many_at_point_second_pass<<>>(temp, device_mappings, num_blocks, + mappings_offset, level_offset, + output_offset, results_per_block); + num_blocks = new_num_blocks; + level_offset = output_offset; + } + + cudaDeviceSynchronize(); + + num_blocks = (number_of_polynomials + block_dim - 1) / block_dim; + copy_result_for_polynomial<<>>( + result, temp, number_of_polynomials + ); + + cuda_free_memory(temp); + cuda_free_memory(device_mappings); +} + +void eval_polys_at_points( + qm31 **result, m31 **polynomials, int log_polynomial_size, int log_number_of_polynomials, + qm31 *points_x, qm31 *points_y, int sample_size +) { + for (int point_index = 0; point_index < sample_size; point_index++) { + qm31 point_x = points_x[point_index]; + qm31 point_y = points_y[point_index]; + eval_polys_at_point(result, polynomials, log_number_of_polynomials, log_polynomial_size, point_x, point_y); + } +} + +void evaluate_polynomials_out_of_domain( + qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials, + qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes +) { + // In this iteration, we assume all polynomials are of equal size and will be evaluated in the same list of points + + qm31 **device_result = clone_to_device(result, number_of_polynomials); + m31 **device_polynomials = clone_to_device(polynomials, number_of_polynomials); + int log_polynomial_size = log_polynomial_sizes[0]; + int log_number_of_polynomials = log_2(number_of_polynomials); + + eval_polys_at_points( + device_result, device_polynomials, log_polynomial_size, log_number_of_polynomials, + out_of_domain_points_x[0], out_of_domain_points_y[0], sample_sizes[0] + ); + + cuda_free_memory(device_result); + cuda_free_memory(device_polynomials); +}; diff --git a/stwo_gpu_backend/Cargo.toml b/stwo_gpu_backend/Cargo.toml index 89e80e3..6800805 100644 --- a/stwo_gpu_backend/Cargo.toml +++ b/stwo_gpu_backend/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] cc = "1.0" -stwo-prover = { git = "https://github.com/starkware-libs/stwo", rev = "ce5b975" } +stwo-prover = { git = "https://github.com/jarnesino/stwo", rev = "fa16181" } itertools = "0.10.5" rand = "0.8.5" criterion = "0.4" @@ -35,5 +35,9 @@ name = "interpolate_columns" harness = false [[bench]] -name = "evaluate_columns" +name = "evaluate_polynomials" +harness = false + +[[bench]] +name = "evaluate_polynomials_out_of_domain" harness = false diff --git a/stwo_gpu_backend/benches/evaluate_columns.rs b/stwo_gpu_backend/benches/evaluate_polynomials.rs similarity index 84% rename from stwo_gpu_backend/benches/evaluate_columns.rs rename to stwo_gpu_backend/benches/evaluate_polynomials.rs index d63980f..d540a95 100644 --- a/stwo_gpu_backend/benches/evaluate_columns.rs +++ b/stwo_gpu_backend/benches/evaluate_polynomials.rs @@ -8,12 +8,12 @@ use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps}; use stwo_prover::core::poly::BitReversedOrder; use stwo_prover::core::ColumnVec; -const LOG_COLUMN_SIZE: u32 = 10; -const LOG_NUMBER_OF_COLUMNS: usize = 16; +const LOG_COLUMN_SIZE: u32 = 16; +const LOG_NUMBER_OF_COLUMNS: usize = 10; const LOG_BLOWUP_FACTOR: u32 = 2; -pub fn simd_evaluate_columns(c: &mut Criterion) { - let mut group = c.benchmark_group("evaluate_columns"); +pub fn simd_evaluate_polynomials(c: &mut Criterion) { + let mut group = c.benchmark_group("evaluate_polynomials"); let coset = CanonicCoset::new(LOG_COLUMN_SIZE); let values = (0..coset.size()).map(BaseField::from).collect(); @@ -37,8 +37,8 @@ pub fn simd_evaluate_columns(c: &mut Criterion) { }); } -pub fn gpu_evaluate_columns(c: &mut Criterion) { - let mut group = c.benchmark_group("evaluate_columns"); +pub fn gpu_evaluate_polynomials(c: &mut Criterion) { + let mut group = c.benchmark_group("evaluate_polynomials"); let coset = CanonicCoset::new(LOG_COLUMN_SIZE); let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec()); @@ -63,7 +63,7 @@ pub fn gpu_evaluate_columns(c: &mut Criterion) { } criterion_group!( - name = interpolate_columns; + name = evaluate_polynomials; config = Criterion::default().sample_size(10); - targets = simd_evaluate_columns, gpu_evaluate_columns); -criterion_main!(interpolate_columns); + targets = simd_evaluate_polynomials, gpu_evaluate_polynomials); +criterion_main!(evaluate_polynomials); diff --git a/stwo_gpu_backend/benches/evaluate_polynomials_out_of_domain.rs b/stwo_gpu_backend/benches/evaluate_polynomials_out_of_domain.rs new file mode 100644 index 0000000..0b3b566 --- /dev/null +++ b/stwo_gpu_backend/benches/evaluate_polynomials_out_of_domain.rs @@ -0,0 +1,100 @@ +use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; +use itertools::Itertools; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; +use stwo_gpu_backend::cuda::BaseFieldVec; +use stwo_gpu_backend::CudaBackend; +use stwo_prover::core::air::mask::fixed_mask_points; +use stwo_prover::core::backend::simd::SimdBackend; +use stwo_prover::core::backend::{Backend, ColumnOps}; +use stwo_prover::core::circle::CirclePoint; +use stwo_prover::core::fields::m31::BaseField; +use stwo_prover::core::fields::qm31::SecureField; +use stwo_prover::core::pcs::TreeVec; +use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation}; +use stwo_prover::core::poly::BitReversedOrder; +use stwo_prover::core::ColumnVec; + +const LOG_COLUMN_SIZE: u32 = 16; +const LOG_NUMBER_OF_COLUMNS: usize = 10; +const LOG_BLOWUP_FACTOR: u32 = 2; + +fn generate_random_point() -> CirclePoint { + let mut rng = SmallRng::seed_from_u64(0); + let x = rng.gen(); + let y = rng.gen(); + CirclePoint { x, y } +} + +fn mask_points( + point: CirclePoint, + number_of_columns: usize, +) -> TreeVec>>> { + TreeVec(vec![fixed_mask_points( + &vec![vec![0_usize]; number_of_columns], + point, + )]) +} + +fn benchmark_evaluate_polynomials_out_of_domain( + criterion: &mut Criterion, + coset: CanonicCoset, + values: >::Column, + benchmark_id: &str, +) { + let mut group = criterion.benchmark_group("evaluate_polynomials_out_of_domain"); + let number_of_columns = 1 << LOG_NUMBER_OF_COLUMNS; + + let circle_evaluation: CircleEvaluation = + B::new_canonical_ordered(coset, values); + + let interpolation_coset = CanonicCoset::new(LOG_COLUMN_SIZE + LOG_BLOWUP_FACTOR); + let twiddle_tree = B::precompute_twiddles(interpolation_coset.half_coset()); + + let polynomial = B::interpolate(circle_evaluation, &twiddle_tree); + let polynomials = ColumnVec::from(vec![&polynomial; number_of_columns]); + + let point = generate_random_point(); + let sample_points = mask_points(point, number_of_columns); + + group.bench_function(BenchmarkId::new(benchmark_id, LOG_COLUMN_SIZE), |b| { + b.iter_batched( + || (polynomials.clone(), sample_points.clone()), + |(polys, sample)| { + B::evaluate_polynomials_out_of_domain(TreeVec::new(vec![polys]), sample) + }, + BatchSize::LargeInput, + ) + }); +} + +pub fn simd_evaluate_polynomials_out_of_domain(c: &mut Criterion) { + let coset = CanonicCoset::new(LOG_COLUMN_SIZE); + let values = (0..coset.size()).map(BaseField::from).collect(); + + benchmark_evaluate_polynomials_out_of_domain::( + c, + coset, + values, + "simd evaluate polynomials out of domain", + ); +} + +pub fn gpu_evaluate_polynomials_out_of_domain(c: &mut Criterion) { + let coset = CanonicCoset::new(LOG_COLUMN_SIZE); + let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec()); + + benchmark_evaluate_polynomials_out_of_domain::( + c, + coset, + values, + "gpu evaluate polynomials out of domain", + ); +} + +criterion_group!( + name = evaluate_polynomials_out_of_domain; + config = Criterion::default().sample_size(10); + targets = simd_evaluate_polynomials_out_of_domain, gpu_evaluate_polynomials_out_of_domain +); +criterion_main!(evaluate_polynomials_out_of_domain); diff --git a/stwo_gpu_backend/benches/interpolate_columns.rs b/stwo_gpu_backend/benches/interpolate_columns.rs index 92a3685..9ec4577 100644 --- a/stwo_gpu_backend/benches/interpolate_columns.rs +++ b/stwo_gpu_backend/benches/interpolate_columns.rs @@ -7,8 +7,8 @@ use stwo_prover::core::fields::m31::BaseField; use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps}; use stwo_prover::core::poly::BitReversedOrder; -const LOG_COLUMN_SIZE: u32 = 10; -const LOG_NUMBER_OF_COLUMNS: usize = 16; +const LOG_COLUMN_SIZE: u32 = 16; +const LOG_NUMBER_OF_COLUMNS: usize = 10; pub fn simd_interpolate_columns(c: &mut Criterion) { let mut group = c.benchmark_group("interpolate_columns"); diff --git a/stwo_gpu_backend/src/cuda/bindings.rs b/stwo_gpu_backend/src/cuda/bindings.rs index 9d5b096..b6517e1 100644 --- a/stwo_gpu_backend/src/cuda/bindings.rs +++ b/stwo_gpu_backend/src/cuda/bindings.rs @@ -158,6 +158,16 @@ extern "C" { point_y: CudaSecureField, ) -> CudaSecureField; + pub fn evaluate_polynomials_out_of_domain( + result: *const *const u32, + polynomials: *const *const u32, + log_polynomial_sizes: *const u32, + number_of_polynomials: u32, + out_of_domain_points_x: *const *const u32, + out_of_domain_points_y: *const *const u32, + sample_sizes: *const u32, + ); + pub fn fold_line( gpu_domain: *const u32, twiddle_offset: usize, diff --git a/stwo_gpu_backend/src/examples/mod.rs b/stwo_gpu_backend/src/examples/mod.rs index 13283a4..016edda 100644 --- a/stwo_gpu_backend/src/examples/mod.rs +++ b/stwo_gpu_backend/src/examples/mod.rs @@ -83,9 +83,7 @@ mod tests { use crate::CudaBackend; use super::{generate_trace, FibInput, WideFibonacciEvalCuda}; - use stwo_prover::constraint_framework::{ - assert_constraints, AssertEvaluator, FrameworkEval, - }; + use stwo_prover::constraint_framework::{assert_constraints, AssertEvaluator, FrameworkEval}; use stwo_prover::core::backend::simd::m31::{PackedBaseField, LOG_N_LANES}; use stwo_prover::core::backend::Column; use stwo_prover::core::fields::m31::BaseField; diff --git a/stwo_gpu_backend/src/poly.rs b/stwo_gpu_backend/src/poly.rs index 72863bd..1d20cda 100644 --- a/stwo_gpu_backend/src/poly.rs +++ b/stwo_gpu_backend/src/poly.rs @@ -1,9 +1,12 @@ use crate::cuda::bindings::CudaSecureField; +use crate::cuda::SecureFieldVec; use crate::{ backend::CudaBackend, cuda::{self}, }; use itertools::Itertools; +use stwo_prover::core::pcs::quotients::PointSample; +use stwo_prover::core::pcs::TreeVec; use stwo_prover::core::{ backend::{Col, Column}, circle::{CirclePoint, Coset}, @@ -15,7 +18,6 @@ use stwo_prover::core::{ }, ColumnVec, }; -use tracing::{span, Level}; impl PolyOps for CudaBackend { type Twiddles = cuda::BaseFieldVec; @@ -93,6 +95,99 @@ impl PolyOps for CudaBackend { } } + fn evaluate_polynomials_out_of_domain( + polynomials: TreeVec>>, + points: TreeVec>>>, + ) -> TreeVec>> { + TreeVec::new( + polynomials + .0 + .into_iter() + .enumerate() + .map(|(index, column)| { + let log_polynomial_sizes: Vec = column + .iter() + .map(|polynomial| polynomial.log_size()) + .collect(); + + let polynomial_coefficients: Vec<*const u32> = column + .into_iter() + .map(|polynomial| polynomial.coeffs.device_ptr) + .collect(); + + let out_of_domain_points = &points.0[index]; + let points_x = out_of_domain_points + .iter() + .map(|points_x_y| points_x_y.iter().map(|point| point.x).collect_vec()) + .collect_vec(); + let points_y = out_of_domain_points + .iter() + .map(|points_x_y| points_x_y.iter().map(|point| point.y).collect_vec()) + .collect_vec(); + let points_x_pointers = points_x + .iter() + .map(|points_x_for_polynomial| { + points_x_for_polynomial.as_ptr() as *const u32 + }) + .collect_vec(); + let points_y_pointers = points_y + .iter() + .map(|points_y_for_polynomial| { + points_y_for_polynomial.as_ptr() as *const u32 + }) + .collect_vec(); + + let sample_sizes = out_of_domain_points + .iter() + .map(|points_x_y| points_x_y.len() as u32) + .collect_vec(); + + let evaluations: Vec = (0..polynomial_coefficients.len()) + .map(|index| { + SecureFieldVec::new_uninitialized(sample_sizes[index] as usize) + }) + .collect(); + let evaluation_pointers = evaluations + .iter() + .map(|evaluation_vector| evaluation_vector.device_ptr) + .collect_vec(); + + // In this iteration, we assume all polynomials are of equal size and will be evaluated in the same list of points + let points_for_first_poly = &out_of_domain_points[0]; + assert!(out_of_domain_points.iter().all(|points| points == points_for_first_poly)); + + unsafe { + cuda::bindings::evaluate_polynomials_out_of_domain( + evaluation_pointers.as_ptr(), + polynomial_coefficients.as_ptr(), + log_polynomial_sizes.as_ptr(), + polynomial_coefficients.len() as u32, + points_x_pointers.as_ptr(), + points_y_pointers.as_ptr(), + sample_sizes.as_ptr(), + ); + } + + evaluations + .into_iter() + .zip(out_of_domain_points) + .map(|(values, evaluated_points)| { + values + .to_vec() + .into_iter() + .zip(evaluated_points) + .map(|(value, point)| PointSample { + point: *point, + value, + }) + .collect() + }) + .collect() + }) + .collect(), + ) + } + fn extend(poly: &CirclePoly, log_size: u32) -> CirclePoly { let new_size = 1 << log_size; assert!( @@ -195,6 +290,11 @@ impl PolyOps for CudaBackend { #[cfg(test)] mod tests { use itertools::Itertools; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use stwo_prover::core::air::mask::fixed_mask_points; + use stwo_prover::core::fields::qm31::SecureField; + use stwo_prover::core::pcs::TreeVec; use stwo_prover::core::poly::circle::{ CanonicCoset, CircleDomain, CircleEvaluation, CirclePoly, PolyOps, }; @@ -205,7 +305,6 @@ mod tests { fields::m31::BaseField, ColumnVec, }; - use test_log::test; use crate::{ backend::CudaBackend, @@ -613,7 +712,7 @@ mod tests { assert_eq!(expected_result.coeffs, result.coeffs.to_cpu()); } - #[test_log::test] + #[test] fn test_interpolate_columns() { let log_size = 9; let log_number_of_columns = 8; @@ -653,8 +752,8 @@ mod tests { assert_eq!(coeffs, expected_coeffs); } - #[test_log::test] - fn test_evaluate_columns() { + #[test] + fn test_evaluate_polynomials() { let log_size = 9; let log_number_of_columns = 8; let log_blowup_factor = 2; @@ -689,7 +788,8 @@ mod tests { .collect_vec(), ); - let result = CudaBackend::evaluate_polynomials(&mut gpu_columns, log_blowup_factor, &gpu_twiddles); + let result = + CudaBackend::evaluate_polynomials(&mut gpu_columns, log_blowup_factor, &gpu_twiddles); let expected_result = CpuBackend::evaluate_polynomials(&mut cpu_columns, log_blowup_factor, &cpu_twiddles); @@ -704,4 +804,79 @@ mod tests { assert_eq!(values, expected_values); } + + fn generate_random_point() -> CirclePoint { + let mut rng = SmallRng::seed_from_u64(0); + let x = rng.gen(); + let y = rng.gen(); + CirclePoint { x, y } + } + + fn mask_points( + point: CirclePoint, + number_of_columns: usize, + ) -> TreeVec>>> { + TreeVec(vec![fixed_mask_points( + &vec![vec![0_usize]; number_of_columns], + point, + )]) + } + + #[test] + fn test_evaluate_polynomials_out_of_domain() { + for log_size in [3, 9, 14] { + let log_number_of_columns = 8; + let log_blowup_factor = 2; + + let size = 1 << log_size; + let number_of_columns = 1 << log_number_of_columns; + + let cpu_values = (1..(size + 1) as u32) + .map(BaseField::from) + .collect::>(); + let gpu_values = cuda::BaseFieldVec::from_vec(cpu_values.clone()); + + let trace_coset = CanonicCoset::new(log_size); + let cpu_evaluations = CpuBackend::new_canonical_ordered(trace_coset, cpu_values); + let gpu_evaluations = CudaBackend::new_canonical_ordered(trace_coset, gpu_values); + + let interpolation_coset = CanonicCoset::new(log_size + log_blowup_factor); + let cpu_twiddles = CpuBackend::precompute_twiddles(interpolation_coset.half_coset()); + let gpu_twiddles = CudaBackend::precompute_twiddles(interpolation_coset.half_coset()); + + let cpu_poly = CpuBackend::interpolate(cpu_evaluations, &cpu_twiddles); + let gpu_poly = CudaBackend::interpolate(gpu_evaluations, &gpu_twiddles); + + let cpu_polynomials = + ColumnVec::from((0..number_of_columns).map(|_index| &cpu_poly).collect_vec()); + let gpu_polynomials = + ColumnVec::from((0..number_of_columns).map(|_index| &gpu_poly).collect_vec()); + + let point = generate_random_point(); + let sample_points = mask_points(point, number_of_columns); + + let result = CudaBackend::evaluate_polynomials_out_of_domain( + TreeVec::new(vec![gpu_polynomials]), + sample_points.clone(), + ); + let expected_result = CpuBackend::evaluate_polynomials_out_of_domain( + TreeVec::new(vec![cpu_polynomials]), + sample_points.clone(), + ); + + let flattened_result = result.flatten_cols(); + let flattened_expected_result = expected_result.flatten_cols(); + + let values = flattened_result + .iter() + .map(|point_sample| (point_sample.point, point_sample.value)) + .collect_vec(); + let expected_values = flattened_expected_result + .iter() + .map(|point_sample| (point_sample.point, point_sample.value)) + .collect_vec(); + + assert_eq!(values, expected_values); + } + } }