Cleanup

2026-01-08 23:18:00 -05:00 · 2024-06-28 12:16:21 +00:00
parent 1663d6c6f9
commit 0c1e9396cc
5 changed files with 119 additions and 145 deletions
--- a/sumcheck/build.rs
+++ b/sumcheck/build.rs
@@ -10,6 +10,7 @@ fn main() {
    // Tell cargo to invalidate the built crate whenever files of interest changes.
    println!("cargo:rerun-if-changed=src/cuda/kernels/multilinear.cu");
    println!("cargo:rerun-if-changed=src/cuda/kernels/sumcheck.cu");
+    println!("cargo:rerun-if-changed=src/cuda/kernels/scalar_multiplication.cu");

    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());

@@ -20,27 +21,33 @@ fn main() {
    let language_std = "c++20"; // Language standard in which device functions are written

    // build the cuda kernels
-    let cuda_src = PathBuf::from("src/cuda/kernels/multilinear.cu");
-    let ptx_file = out_dir.join("multilinear.ptx");
+    let cuda_src = [
+        "src/cuda/kernels/multilinear.cu",
+        "src/cuda/kernels/scalar_multiplication.cu",
+    ]
+    .map(|path| PathBuf::from(path));
+    let ptx_file = ["multilinear.ptx", "scalar_multiplication.ptx"].map(|file| out_dir.join(file));

-    let nvcc_status = Command::new("nvcc")
-        .arg("-ptx")
-        .arg("-o")
-        .arg(&ptx_file)
-        .arg(&cuda_src)
-        .arg(format!("-arch={}", arch))
-        .arg(format!("-code={}", code))
-        .arg(format!("-ccbin={}", compiler))
-        .arg(format!("-std={}", language_std))
-        .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
-        .arg("--expt-relaxed-constexpr")
-        .status()
-        .unwrap();
+    for (cuda_src, ptx_file) in cuda_src.into_iter().zip(ptx_file) {
+        let nvcc_status = Command::new("nvcc")
+            .arg("-ptx")
+            .arg("-o")
+            .arg(&ptx_file)
+            .arg(&cuda_src)
+            .arg(format!("-arch={}", arch))
+            .arg(format!("-code={}", code))
+            .arg(format!("-ccbin={}", compiler))
+            .arg(format!("-std={}", language_std))
+            .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
+            .arg("--expt-relaxed-constexpr")
+            .status()
+            .unwrap();

-    assert!(
-        nvcc_status.success(),
-        "Failed to compile CUDA source to PTX."
-    );
+        assert!(
+            nvcc_status.success(),
+            "Failed to compile CUDA source to PTX."
+        );
+    }

    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
--- a/sumcheck/src/cuda/includes/test.cpp
+++ b/sumcheck/src/cuda/includes/test.cpp
@@ -1,25 +0,0 @@
-#include <cassert>
-#include <iostream>
-#include <cstring>
-
-#include "prime_field.h"
-#include "./barretenberg/ecc/curves/bn254/fq.hpp"
-
-int main() {
-    bb::fq a = bb::fq(1UL);
-    bb::fq b = bb::fq(2U);
-    bb::fq c = a + b;
-    assert(c == bb::fq(3U));
-    c = a * b;
-    assert(c == bb::fq(2U));
-    // memory layout test
-    struct Field val_c { { 1UL, 2UL, 3UL, 4UL } };
-    bb::fq val;
-    std::memcpy(&val, &val_c, 32);
-    assert(val.data[0] == 1UL);
-    assert(val.data[1] == 2UL);
-    assert(val.data[2] == 3UL);
-    assert(val.data[3] == 4UL);
-    std::cout << "test ended" << std::endl;
-    return 0;
-}
--- a/sumcheck/src/cuda/kernels/multilinear.cu
+++ b/sumcheck/src/cuda/kernels/multilinear.cu
@@ -19,7 +19,7 @@ extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr*
    return;
 }

-extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals, fr* result, int* mutex) {
+extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
    const int tid = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

--- a/sumcheck/src/cuda/kernels/scalar_multiplication.cu
+++ b/sumcheck/src/cuda/kernels/scalar_multiplication.cu
@@ -3,8 +3,6 @@
 using namespace bb;

 extern "C" __global__ void mul(fr* elems, fr* results) {
-    elems[0].self_to_montgomery_form();
-    elems[1].self_to_montgomery_form();
    fr temp = elems[0] * elems[1];
    results[threadIdx.x] = temp.from_montgomery_form();
    return;
--- a/sumcheck/src/lib.rs
+++ b/sumcheck/src/lib.rs
@@ -5,6 +5,7 @@ use field::{FromFieldBinding, ToFieldBinding};
 use itertools::Itertools;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::marker::PhantomData;
+use std::sync::Arc;
 use std::time::Instant;

 pub mod field;
@@ -20,33 +21,47 @@ impl Default for FieldBinding {

 // include the compiled PTX code as string
 const MULTILINEAR_POLY_KERNEL: &str = include_str!(concat!(env!("OUT_DIR"), "/multilinear.ptx"));
+const SCALAR_MULTIPLICATION_KERNEL: &str =
+    include_str!(concat!(env!("OUT_DIR"), "/scalar_multiplication.ptx"));

 /// Wrapper struct for APIs using GPU
-#[derive(Default)]
-pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>>(PhantomData<F>);
+pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> {
+    gpu: Arc<CudaDevice>,
+    _marker: PhantomData<F>,
+}

 impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
+    pub fn setup() -> Result<Self, DriverError> {
+        // setup GPU device
+        let now = Instant::now();
+        let gpu = CudaDevice::new(0)?;
+        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
+        Ok(Self {
+            gpu,
+            _marker: PhantomData,
+        })
+    }
+
+    pub fn load_ptx(
+        &self,
+        ptx: &str,
+        module_name: &str,
+        func_names: &[&'static str],
+    ) -> Result<(), DriverError> {
+        // compile ptx
+        let now = Instant::now();
+        let ptx = Ptx::from_src(ptx);
+        self.gpu.load_ptx(ptx, module_name, &func_names)?;
+        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
+        Ok(())
+    }
+
    pub fn evaluate_poly(
        &self,
        num_vars: usize,
        poly_coeffs: &[F],
        point: &[F],
    ) -> Result<F, DriverError> {
-        // setup GPU device
-        let now = Instant::now();
-
-        let gpu = CudaDevice::new(0)?;
-
-        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
-
-        // compile ptx
-        let now = Instant::now();
-
-        let ptx = Ptx::from_src(MULTILINEAR_POLY_KERNEL);
-        gpu.load_ptx(ptx, "multilinear", &["evaluate", "evaluate_optimized"])?;
-
-        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
-
        let now = Instant::now();
        let point_montgomery = point
            .into_iter()
@@ -54,41 +69,34 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
            .collect_vec();

        // copy to GPU
-        let gpu_coeffs = gpu.htod_copy(
+        let gpu_coeffs = self.gpu.htod_copy(
            poly_coeffs
                .into_par_iter()
                .map(|&coeff| F::to_canonical_form(coeff))
                .collect(),
        )?;
-        let gpu_eval_point = gpu.htod_copy(point_montgomery)?;
-        let monomial_evals = gpu.htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
-        let mutex = gpu.alloc_zeros::<u32>(1)?;
-        let result = gpu.htod_copy(vec![FieldBinding::default(); 1])?;
-
+        let gpu_eval_point = self.gpu.htod_copy(point_montgomery)?;
+        let monomial_evals = self
+            .gpu
+            .htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
        println!("Time taken to initialise data: {:.2?}", now.elapsed());

        let now = Instant::now();
-
-        let evaluate_optimized = gpu.get_func("multilinear", "evaluate_optimized").unwrap();
+        let evaluate_optimized = self
+            .gpu
+            .get_func("multilinear", "evaluate_optimized")
+            .unwrap();

        unsafe {
            evaluate_optimized.launch(
                LaunchConfig::for_num_elems(1 << num_vars as u32),
-                (
-                    &gpu_coeffs,
-                    &gpu_eval_point,
-                    num_vars,
-                    &monomial_evals,
-                    &result,
-                    &mutex,
-                ),
+                (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
            )?;
        };
-
        println!("Time taken to call kernel: {:.2?}", now.elapsed());

        let now = Instant::now();
-        let monomial_evals = gpu.sync_reclaim(monomial_evals)?;
+        let monomial_evals = self.gpu.sync_reclaim(monomial_evals)?;
        println!("Time taken to synchronize: {:.2?}", now.elapsed());

        let now = Instant::now();
@@ -100,27 +108,48 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
        println!("Time taken to calculate sum: {:.2?}", now.elapsed());
        Ok(result)
    }
+
+    pub fn mul(&self, values: &[F; 2]) -> Result<F, DriverError> {
+        let now = Instant::now();
+        let gpu_values = self
+            .gpu
+            .htod_copy(values.map(|v| F::to_montgomery_form(v)).to_vec())?;
+        let results = self.gpu.htod_copy(vec![FieldBinding::default(); 1])?;
+        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let mul = self.gpu.get_func("scalar_multiplication", "mul").unwrap();
+        unsafe {
+            mul.launch(
+                LaunchConfig::for_num_elems(1 as u32),
+                (&gpu_values, &results)
+            )?;
+        }
+        println!("Time taken to call kernel: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let results = self.gpu.sync_reclaim(results)?;
+        println!("Time taken to synchronize: {:.2?}", now.elapsed());
+        Ok(F::from_canonical_form(results[0]))
+    }
 }

 #[cfg(test)]
 mod tests {
-    use std::{cmp::Ordering, default, fmt::Error, time::Instant};
+    use std::time::Instant;

-    use cudarc::{
-        driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
-        nvrtc::Ptx,
-    };
+    use cudarc::driver::DriverError;
    use ff::{Field, PrimeField};
    use halo2curves::bn256::Fr;
    use itertools::Itertools;
    use rand::rngs::OsRng;
    use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

-    use crate::{field::{FromFieldBinding, ToFieldBinding}, FieldBinding, MULTILINEAR_POLY_KERNEL};
+    use crate::{MULTILINEAR_POLY_KERNEL, SCALAR_MULTIPLICATION_KERNEL};

    use super::GPUApiWrapper;

-    fn evaluate_poly_cpu<F: Field>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
+    fn evaluate_poly_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
        poly_coeffs
            .par_iter()
            .enumerate()
@@ -141,11 +170,18 @@ mod tests {

    #[test]
    fn test_evaluate_poly() -> Result<(), DriverError> {
-        let num_vars = 22;
+        let num_vars = 18;
        let rng = OsRng::default();
        let poly_coeffs = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
        let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
-        let gpu_api_wrapper = GPUApiWrapper::<Fr>::default();
+
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(
+            MULTILINEAR_POLY_KERNEL,
+            "multilinear",
+            &["evaluate_optimized"],
+        )?;
+
        let now = Instant::now();
        let eval_poly_result_by_cpu = evaluate_poly_cpu(&poly_coeffs, &point, num_vars);
        println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
@@ -161,64 +197,22 @@ mod tests {

    #[test]
    fn test_scalar_multiplication() -> Result<(), DriverError> {
-        // setup GPU device
-        let now = Instant::now();
+        let rng = OsRng::default();
+        let values = [(); 2].map(|_| Fr::random(rng));
+        let expected = values[0] * values[1];

-        let gpu = CudaDevice::new(0)?;
-
-        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
-
-        // compile ptx
-        let now = Instant::now();
-
-        let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
-        gpu.load_ptx(ptx, "my_module", &["mul"])?;
-
-        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
-
-        let a = Fr::from(2);
-        let b = Fr::TWO_INV;
-
-        println!("a * b : {:?}", a * b);
-
-        let a_data = FieldBinding { data: [2, 0, 0, 0] };
-
-        let b_data = FieldBinding {
-            data: [
-                0xa1f0fac9f8000001,
-                0x9419f4243cdcb848,
-                0xdc2822db40c0ac2e,
-                0x183227397098d014,
-            ],
-        };
-
-        // copy to GPU
-        let gpu_field_structs = gpu.htod_copy(vec![a_data, b_data])?;
-        let results = gpu.htod_copy(vec![FieldBinding::default(); 1024])?;
-
-        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(
+            SCALAR_MULTIPLICATION_KERNEL,
+            "scalar_multiplication",
+            &["mul"],
+        )?;

        let now = Instant::now();
+        let actual = gpu_api_wrapper.mul(&values)?;
+        println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());

-        let f = gpu.get_func("my_module", "mul").unwrap();
-
-        unsafe {
-            f.launch(
-                LaunchConfig::for_num_elems(1024 as u32),
-                (&gpu_field_structs, &results),
-            )
-        }?;
-
-        println!("Time taken to call kernel: {:.2?}", now.elapsed());
-
-        let results = gpu.sync_reclaim(results)?;
-
-        results.iter().for_each(|result| {
-            assert_eq!(result.data[0], 1);
-            assert_eq!(result.data[1], 0);
-            assert_eq!(result.data[1], 0);
-            assert_eq!(result.data[1], 0);
-        });
+        assert_eq!(actual, expected);
        Ok(())
    }
 }