Add kernel for evaluating multilinear polynomial by evaluation form (only works for single thread block)

2026-01-08 23:18:00 -05:00 · 2024-07-08 17:26:25 +00:00
parent cd02649d1f
commit 1e1ea34333
2 changed files with 85 additions and 36 deletions
--- a/sumcheck/src/cuda/kernels/multilinear.cu
+++ b/sumcheck/src/cuda/kernels/multilinear.cu
@@ -3,31 +3,15 @@

 using namespace bb;

-extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    fr coeff = coeffs[index].to_montgomery_form();
-    if (coeff == fr::zero()) {
-        monomial_evals[index] = fr::zero();
-    } else {
-        monomial_evals[index] = coeff;
-        for (int i = 0; i < num_vars; i++) {
-            if (((index >> i) & 1) == 1) {
-                monomial_evals[index] *= point[i];
-            }
-        }
-    }
-    return;
-}
-
-extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
+extern "C" __global__ void eval_by_coeff(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
    const int tid = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

 	auto step_size = 1;
-	int number_of_threads = blockDim.x >> 1;
+	int num_threads = blockDim.x >> 1;
    bool evaluated = false;

-	while (number_of_threads > 0)
+	while (num_threads > 0)
 	{
        if (!evaluated) {
            fr coeff = coeffs[idx].to_montgomery_form();
@@ -40,7 +24,7 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
            continue;
        }

-		if (tid < number_of_threads) // still alive?
+		if (tid < num_threads) // still alive?
 		{
 			const auto fst = blockIdx.x * blockDim.x + tid * step_size * 2;
 			const auto snd = fst + step_size;
@@ -48,10 +32,31 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
 		}

 		step_size <<= 1;
-		number_of_threads >>= 1;
+		num_threads >>= 1;
        __syncthreads();
 	}
    if (tid == 0) {
        monomial_evals[idx].self_from_montgomery_form();
    }
 }
+
+extern "C" __global__ void eval(fr* evals, fr* point, uint8_t num_vars, fr* buf) {
+    const int tid = threadIdx.x;
+    auto i = 0;
+    auto num_threads = 1 << (num_vars - 1);
+    while (num_threads > 0) {
+        if (tid < num_threads) {
+            buf[tid] = point[i] * (evals[2 * tid + 1] - evals[2 * tid]) + evals[2 * tid];
+        }
+        __syncthreads();
+        if (tid == 0) {
+            memcpy(evals, buf, num_threads * 32);
+        }
+        i++;
+        num_threads >>= 1;
+        __syncthreads();
+    }
+    if (tid == 0) {
+        buf[0].self_from_montgomery_form();
+    }
+}
--- a/sumcheck/src/lib.rs
+++ b/sumcheck/src/lib.rs
@@ -60,6 +60,44 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
        Ok(())
    }

+    pub fn eval(&self, num_vars: usize, evals: &[F], point: &[F]) -> Result<F, DriverError> {
+        let now = Instant::now();
+        let point = point
+            .into_iter()
+            .map(|f| F::to_montgomery_form(*f))
+            .collect_vec();
+
+        // copy to GPU
+        let evals = self.gpu.htod_copy(
+            evals
+                .into_par_iter()
+                .map(|&eval| F::to_montgomery_form(eval))
+                .collect(),
+        )?;
+        let gpu_eval_point = self.gpu.htod_copy(point)?;
+        let buf = self
+            .gpu
+            .htod_copy(vec![FieldBinding::default(); 1 << (num_vars - 1)])?;
+        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let eval = self.gpu.get_func("multilinear", "eval").unwrap();
+
+        unsafe {
+            eval.launch(
+                LaunchConfig::for_num_elems(1 << (num_vars - 1) as u32),
+                (&evals, &gpu_eval_point, num_vars, &buf),
+            )?;
+        };
+        println!("Time taken to call kernel: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let buf = self.gpu.sync_reclaim(buf)?;
+        println!("Time taken to synchronize: {:.2?}", now.elapsed());
+
+        Ok(F::from_canonical_form(buf[0]))
+    }
+
    pub fn eval_by_coeff(
        &self,
        num_vars: usize,
@@ -86,13 +124,10 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
        println!("Time taken to initialise data: {:.2?}", now.elapsed());

        let now = Instant::now();
-        let evaluate_optimized = self
-            .gpu
-            .get_func("multilinear", "evaluate_optimized")
-            .unwrap();
+        let eval_by_coeff = self.gpu.get_func("multilinear", "eval_by_coeff").unwrap();

        unsafe {
-            evaluate_optimized.launch(
+            eval_by_coeff.launch(
                LaunchConfig::for_num_elems(1 << num_vars as u32),
                (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
            )?;
@@ -153,10 +188,6 @@ mod tests {

    use super::GPUApiWrapper;

-    fn eval_cpu<F: PrimeField>(evals: &[F], x: &[F]) -> F {
-        cpu::multilinear::evaluate(evals, x)
-    }
-
    fn eval_by_coeff_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
        poly_coeffs
            .par_iter()
@@ -178,7 +209,24 @@ mod tests {

    #[test]
    fn test_eval() -> Result<(), DriverError> {
-        todo!()
+        let num_vars = 10;
+        let rng = OsRng::default();
+        let evals = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
+        let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
+
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval"])?;
+
+        let now = Instant::now();
+        let eval_poly_result_by_cpu = cpu::multilinear::evaluate(&evals, &point);
+        println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let eval_poly_result_by_gpu = gpu_api_wrapper.eval(num_vars, &evals, &point)?;
+        println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());
+
+        assert_eq!(eval_poly_result_by_cpu, eval_poly_result_by_gpu);
+        Ok(())
    }

    #[test]
@@ -189,11 +237,7 @@ mod tests {
        let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();

        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
-        gpu_api_wrapper.load_ptx(
-            MULTILINEAR_POLY_KERNEL,
-            "multilinear",
-            &["evaluate_optimized"],
-        )?;
+        gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval_by_coeff"])?;

        let now = Instant::now();
        let eval_poly_result_by_cpu = eval_by_coeff_cpu(&poly_coeffs, &point, num_vars);