From 1e1ea3433368794b31d1cfcec87c7f3889ecce2c Mon Sep 17 00:00:00 2001 From: DoHoonKim8 Date: Mon, 8 Jul 2024 17:26:25 +0000 Subject: [PATCH] Add kernel for evaluating multilinear polynomial by evaluation form (only works for single thread block) --- sumcheck/src/cuda/kernels/multilinear.cu | 47 ++++++++------- sumcheck/src/lib.rs | 74 +++++++++++++++++++----- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/sumcheck/src/cuda/kernels/multilinear.cu b/sumcheck/src/cuda/kernels/multilinear.cu index 7d8b128..a5a71c5 100644 --- a/sumcheck/src/cuda/kernels/multilinear.cu +++ b/sumcheck/src/cuda/kernels/multilinear.cu @@ -3,31 +3,15 @@ using namespace bb; -extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - fr coeff = coeffs[index].to_montgomery_form(); - if (coeff == fr::zero()) { - monomial_evals[index] = fr::zero(); - } else { - monomial_evals[index] = coeff; - for (int i = 0; i < num_vars; i++) { - if (((index >> i) & 1) == 1) { - monomial_evals[index] *= point[i]; - } - } - } - return; -} - -extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) { +extern "C" __global__ void eval_by_coeff(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) { const int tid = threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x; auto step_size = 1; - int number_of_threads = blockDim.x >> 1; + int num_threads = blockDim.x >> 1; bool evaluated = false; - while (number_of_threads > 0) + while (num_threads > 0) { if (!evaluated) { fr coeff = coeffs[idx].to_montgomery_form(); @@ -40,7 +24,7 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num continue; } - if (tid < number_of_threads) // still alive? + if (tid < num_threads) // still alive? { const auto fst = blockIdx.x * blockDim.x + tid * step_size * 2; const auto snd = fst + step_size; @@ -48,10 +32,31 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num } step_size <<= 1; - number_of_threads >>= 1; + num_threads >>= 1; __syncthreads(); } if (tid == 0) { monomial_evals[idx].self_from_montgomery_form(); } } + +extern "C" __global__ void eval(fr* evals, fr* point, uint8_t num_vars, fr* buf) { + const int tid = threadIdx.x; + auto i = 0; + auto num_threads = 1 << (num_vars - 1); + while (num_threads > 0) { + if (tid < num_threads) { + buf[tid] = point[i] * (evals[2 * tid + 1] - evals[2 * tid]) + evals[2 * tid]; + } + __syncthreads(); + if (tid == 0) { + memcpy(evals, buf, num_threads * 32); + } + i++; + num_threads >>= 1; + __syncthreads(); + } + if (tid == 0) { + buf[0].self_from_montgomery_form(); + } +} diff --git a/sumcheck/src/lib.rs b/sumcheck/src/lib.rs index 8a8acc8..c576ed3 100644 --- a/sumcheck/src/lib.rs +++ b/sumcheck/src/lib.rs @@ -60,6 +60,44 @@ impl + ToFieldBinding> GPUApiWrapper { Ok(()) } + pub fn eval(&self, num_vars: usize, evals: &[F], point: &[F]) -> Result { + let now = Instant::now(); + let point = point + .into_iter() + .map(|f| F::to_montgomery_form(*f)) + .collect_vec(); + + // copy to GPU + let evals = self.gpu.htod_copy( + evals + .into_par_iter() + .map(|&eval| F::to_montgomery_form(eval)) + .collect(), + )?; + let gpu_eval_point = self.gpu.htod_copy(point)?; + let buf = self + .gpu + .htod_copy(vec![FieldBinding::default(); 1 << (num_vars - 1)])?; + println!("Time taken to initialise data: {:.2?}", now.elapsed()); + + let now = Instant::now(); + let eval = self.gpu.get_func("multilinear", "eval").unwrap(); + + unsafe { + eval.launch( + LaunchConfig::for_num_elems(1 << (num_vars - 1) as u32), + (&evals, &gpu_eval_point, num_vars, &buf), + )?; + }; + println!("Time taken to call kernel: {:.2?}", now.elapsed()); + + let now = Instant::now(); + let buf = self.gpu.sync_reclaim(buf)?; + println!("Time taken to synchronize: {:.2?}", now.elapsed()); + + Ok(F::from_canonical_form(buf[0])) + } + pub fn eval_by_coeff( &self, num_vars: usize, @@ -86,13 +124,10 @@ impl + ToFieldBinding> GPUApiWrapper { println!("Time taken to initialise data: {:.2?}", now.elapsed()); let now = Instant::now(); - let evaluate_optimized = self - .gpu - .get_func("multilinear", "evaluate_optimized") - .unwrap(); + let eval_by_coeff = self.gpu.get_func("multilinear", "eval_by_coeff").unwrap(); unsafe { - evaluate_optimized.launch( + eval_by_coeff.launch( LaunchConfig::for_num_elems(1 << num_vars as u32), (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals), )?; @@ -153,10 +188,6 @@ mod tests { use super::GPUApiWrapper; - fn eval_cpu(evals: &[F], x: &[F]) -> F { - cpu::multilinear::evaluate(evals, x) - } - fn eval_by_coeff_cpu(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F { poly_coeffs .par_iter() @@ -178,7 +209,24 @@ mod tests { #[test] fn test_eval() -> Result<(), DriverError> { - todo!() + let num_vars = 10; + let rng = OsRng::default(); + let evals = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec(); + let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec(); + + let gpu_api_wrapper = GPUApiWrapper::::setup()?; + gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval"])?; + + let now = Instant::now(); + let eval_poly_result_by_cpu = cpu::multilinear::evaluate(&evals, &point); + println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed()); + + let now = Instant::now(); + let eval_poly_result_by_gpu = gpu_api_wrapper.eval(num_vars, &evals, &point)?; + println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed()); + + assert_eq!(eval_poly_result_by_cpu, eval_poly_result_by_gpu); + Ok(()) } #[test] @@ -189,11 +237,7 @@ mod tests { let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec(); let gpu_api_wrapper = GPUApiWrapper::::setup()?; - gpu_api_wrapper.load_ptx( - MULTILINEAR_POLY_KERNEL, - "multilinear", - &["evaluate_optimized"], - )?; + gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval_by_coeff"])?; let now = Instant::now(); let eval_poly_result_by_cpu = eval_by_coeff_cpu(&poly_coeffs, &point, num_vars);