From 1e1ea3433368794b31d1cfcec87c7f3889ecce2c Mon Sep 17 00:00:00 2001
From: DoHoonKim8 <dohoon1097819@gmail.com>
Date: Mon, 8 Jul 2024 17:26:25 +0000
Subject: [PATCH] Add kernel for evaluating multilinear polynomial by
 evaluation form (only works for single thread block)

---
 sumcheck/src/cuda/kernels/multilinear.cu | 47 ++++++++-------
 sumcheck/src/lib.rs                      | 74 +++++++++++++++++++-----
 2 files changed, 85 insertions(+), 36 deletions(-)
diff --git a/sumcheck/src/cuda/kernels/multilinear.cu b/sumcheck/src/cuda/kernels/multilinear.cu
index 7d8b128..a5a71c5 100644
--- a/sumcheck/src/cuda/kernels/multilinear.cu
+++ b/sumcheck/src/cuda/kernels/multilinear.cu
@@ -3,31 +3,15 @@
 
 using namespace bb;
 
-extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    fr coeff = coeffs[index].to_montgomery_form();
-    if (coeff == fr::zero()) {
-        monomial_evals[index] = fr::zero();
-    } else {
-        monomial_evals[index] = coeff;
-        for (int i = 0; i < num_vars; i++) {
-            if (((index >> i) & 1) == 1) {
-                monomial_evals[index] *= point[i];
-            }
-        }
-    }
-    return;
-}
-
-extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
+extern "C" __global__ void eval_by_coeff(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
     const int tid = threadIdx.x;
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
 	auto step_size = 1;
-	int number_of_threads = blockDim.x >> 1;
+	int num_threads = blockDim.x >> 1;
     bool evaluated = false;
 
-	while (number_of_threads > 0)
+	while (num_threads > 0)
 	{
         if (!evaluated) {
             fr coeff = coeffs[idx].to_montgomery_form();
@@ -40,7 +24,7 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
             continue;
         }
 
-		if (tid < number_of_threads) // still alive?
+		if (tid < num_threads) // still alive?
 		{
 			const auto fst = blockIdx.x * blockDim.x + tid * step_size * 2;
 			const auto snd = fst + step_size;
@@ -48,10 +32,31 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
 		}
 
 		step_size <<= 1;
-		number_of_threads >>= 1;
+		num_threads >>= 1;
         __syncthreads();
 	}
     if (tid == 0) {
         monomial_evals[idx].self_from_montgomery_form();
     }
 }
+
+extern "C" __global__ void eval(fr* evals, fr* point, uint8_t num_vars, fr* buf) {
+    const int tid = threadIdx.x;
+    auto i = 0;
+    auto num_threads = 1 << (num_vars - 1);
+    while (num_threads > 0) {
+        if (tid < num_threads) {
+            buf[tid] = point[i] * (evals[2 * tid + 1] - evals[2 * tid]) + evals[2 * tid];
+        }
+        __syncthreads();
+        if (tid == 0) {
+            memcpy(evals, buf, num_threads * 32);
+        }
+        i++;
+        num_threads >>= 1;
+        __syncthreads();
+    }
+    if (tid == 0) {
+        buf[0].self_from_montgomery_form();
+    }
+}
diff --git a/sumcheck/src/lib.rs b/sumcheck/src/lib.rs
index 8a8acc8..c576ed3 100644
--- a/sumcheck/src/lib.rs
+++ b/sumcheck/src/lib.rs
@@ -60,6 +60,44 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
         Ok(())
     }
 
+    pub fn eval(&self, num_vars: usize, evals: &[F], point: &[F]) -> Result<F, DriverError> {
+        let now = Instant::now();
+        let point = point
+            .into_iter()
+            .map(|f| F::to_montgomery_form(*f))
+            .collect_vec();
+
+        // copy to GPU
+        let evals = self.gpu.htod_copy(
+            evals
+                .into_par_iter()
+                .map(|&eval| F::to_montgomery_form(eval))
+                .collect(),
+        )?;
+        let gpu_eval_point = self.gpu.htod_copy(point)?;
+        let buf = self
+            .gpu
+            .htod_copy(vec![FieldBinding::default(); 1 << (num_vars - 1)])?;
+        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let eval = self.gpu.get_func("multilinear", "eval").unwrap();
+
+        unsafe {
+            eval.launch(
+                LaunchConfig::for_num_elems(1 << (num_vars - 1) as u32),
+                (&evals, &gpu_eval_point, num_vars, &buf),
+            )?;
+        };
+        println!("Time taken to call kernel: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let buf = self.gpu.sync_reclaim(buf)?;
+        println!("Time taken to synchronize: {:.2?}", now.elapsed());
+
+        Ok(F::from_canonical_form(buf[0]))
+    }
+
     pub fn eval_by_coeff(
         &self,
         num_vars: usize,
@@ -86,13 +124,10 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
         println!("Time taken to initialise data: {:.2?}", now.elapsed());
 
         let now = Instant::now();
-        let evaluate_optimized = self
-            .gpu
-            .get_func("multilinear", "evaluate_optimized")
-            .unwrap();
+        let eval_by_coeff = self.gpu.get_func("multilinear", "eval_by_coeff").unwrap();
 
         unsafe {
-            evaluate_optimized.launch(
+            eval_by_coeff.launch(
                 LaunchConfig::for_num_elems(1 << num_vars as u32),
                 (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
             )?;
@@ -153,10 +188,6 @@ mod tests {
 
     use super::GPUApiWrapper;
 
-    fn eval_cpu<F: PrimeField>(evals: &[F], x: &[F]) -> F {
-        cpu::multilinear::evaluate(evals, x)
-    }
-
     fn eval_by_coeff_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
         poly_coeffs
             .par_iter()
@@ -178,7 +209,24 @@ mod tests {
 
     #[test]
     fn test_eval() -> Result<(), DriverError> {
-        todo!()
+        let num_vars = 10;
+        let rng = OsRng::default();
+        let evals = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
+        let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
+
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval"])?;
+
+        let now = Instant::now();
+        let eval_poly_result_by_cpu = cpu::multilinear::evaluate(&evals, &point);
+        println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let eval_poly_result_by_gpu = gpu_api_wrapper.eval(num_vars, &evals, &point)?;
+        println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());
+
+        assert_eq!(eval_poly_result_by_cpu, eval_poly_result_by_gpu);
+        Ok(())
     }
 
     #[test]
@@ -189,11 +237,7 @@ mod tests {
         let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
 
         let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
-        gpu_api_wrapper.load_ptx(
-            MULTILINEAR_POLY_KERNEL,
-            "multilinear",
-            &["evaluate_optimized"],
-        )?;
+        gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval_by_coeff"])?;
 
         let now = Instant::now();
         let eval_poly_result_by_cpu = eval_by_coeff_cpu(&poly_coeffs, &point, num_vars);