diff --git a/sumcheck/build.rs b/sumcheck/build.rs
index a1be0d5..01f4fde 100644
--- a/sumcheck/build.rs
+++ b/sumcheck/build.rs
@@ -10,6 +10,7 @@ fn main() {
     // Tell cargo to invalidate the built crate whenever files of interest changes.
     println!("cargo:rerun-if-changed=src/cuda/kernels/multilinear.cu");
     println!("cargo:rerun-if-changed=src/cuda/kernels/sumcheck.cu");
+    println!("cargo:rerun-if-changed=src/cuda/kernels/scalar_multiplication.cu");
 
     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
 
@@ -20,27 +21,33 @@ fn main() {
     let language_std = "c++20"; // Language standard in which device functions are written
 
     // build the cuda kernels
-    let cuda_src = PathBuf::from("src/cuda/kernels/multilinear.cu");
-    let ptx_file = out_dir.join("multilinear.ptx");
+    let cuda_src = [
+        "src/cuda/kernels/multilinear.cu",
+        "src/cuda/kernels/scalar_multiplication.cu",
+    ]
+    .map(|path| PathBuf::from(path));
+    let ptx_file = ["multilinear.ptx", "scalar_multiplication.ptx"].map(|file| out_dir.join(file));
 
-    let nvcc_status = Command::new("nvcc")
-        .arg("-ptx")
-        .arg("-o")
-        .arg(&ptx_file)
-        .arg(&cuda_src)
-        .arg(format!("-arch={}", arch))
-        .arg(format!("-code={}", code))
-        .arg(format!("-ccbin={}", compiler))
-        .arg(format!("-std={}", language_std))
-        .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
-        .arg("--expt-relaxed-constexpr")
-        .status()
-        .unwrap();
+    for (cuda_src, ptx_file) in cuda_src.into_iter().zip(ptx_file) {
+        let nvcc_status = Command::new("nvcc")
+            .arg("-ptx")
+            .arg("-o")
+            .arg(&ptx_file)
+            .arg(&cuda_src)
+            .arg(format!("-arch={}", arch))
+            .arg(format!("-code={}", code))
+            .arg(format!("-ccbin={}", compiler))
+            .arg(format!("-std={}", language_std))
+            .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
+            .arg("--expt-relaxed-constexpr")
+            .status()
+            .unwrap();
 
-    assert!(
-        nvcc_status.success(),
-        "Failed to compile CUDA source to PTX."
-    );
+        assert!(
+            nvcc_status.success(),
+            "Failed to compile CUDA source to PTX."
+        );
+    }
 
     // The bindgen::Builder is the main entry point
     // to bindgen, and lets you build up options for
diff --git a/sumcheck/src/cuda/includes/test.cpp b/sumcheck/src/cuda/includes/test.cpp
deleted file mode 100644
index dfdfd1b..0000000
--- a/sumcheck/src/cuda/includes/test.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <cassert>
-#include <cstring>
-#include <iostream>
-
-#include "prime_field.h"
-#include "./barretenberg/ecc/curves/bn254/fq.hpp"
-
-int main() {
-    bb::fq a = bb::fq(1UL);
-    bb::fq b = bb::fq(2U);
-    bb::fq c = a + b;
-    assert(c == bb::fq(3U));
-    c = a * b;
-    assert(c == bb::fq(2U));
-    // memory layout test
-    struct Field val_c { { 1UL, 2UL, 3UL, 4UL } };
-    bb::fq val;
-    std::memcpy(&val, &val_c, 32);
-    assert(val.data[0] == 1UL);
-    assert(val.data[1] == 2UL);
-    assert(val.data[2] == 3UL);
-    assert(val.data[3] == 4UL);
-    std::cout << "test ended" << std::endl;
-    return 0;
-}
diff --git a/sumcheck/src/cuda/kernels/multilinear.cu b/sumcheck/src/cuda/kernels/multilinear.cu
index cfdcb76..7d8b128 100644
--- a/sumcheck/src/cuda/kernels/multilinear.cu
+++ b/sumcheck/src/cuda/kernels/multilinear.cu
@@ -19,7 +19,7 @@ extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr*
     return;
 }
 
-extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals, fr* result, int* mutex) {
+extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
     const int tid = threadIdx.x;
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
diff --git a/sumcheck/src/cuda/kernels/scalar_multiplication.cu b/sumcheck/src/cuda/kernels/scalar_multiplication.cu
index 58530ca..0236703 100644
--- a/sumcheck/src/cuda/kernels/scalar_multiplication.cu
+++ b/sumcheck/src/cuda/kernels/scalar_multiplication.cu
@@ -3,8 +3,6 @@
 using namespace bb;
 
 extern "C" __global__ void mul(fr* elems, fr* results) {
-    elems[0].self_to_montgomery_form();
-    elems[1].self_to_montgomery_form();
     fr temp = elems[0] * elems[1];
     results[threadIdx.x] = temp.from_montgomery_form();
     return;
diff --git a/sumcheck/src/lib.rs b/sumcheck/src/lib.rs
index b278880..6027a52 100644
--- a/sumcheck/src/lib.rs
+++ b/sumcheck/src/lib.rs
@@ -5,6 +5,7 @@ use field::{FromFieldBinding, ToFieldBinding};
 use itertools::Itertools;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::marker::PhantomData;
+use std::sync::Arc;
 use std::time::Instant;
 
 pub mod field;
@@ -20,33 +21,47 @@ impl Default for FieldBinding {
 
 // include the compiled PTX code as string
 const MULTILINEAR_POLY_KERNEL: &str = include_str!(concat!(env!("OUT_DIR"), "/multilinear.ptx"));
+const SCALAR_MULTIPLICATION_KERNEL: &str =
+    include_str!(concat!(env!("OUT_DIR"), "/scalar_multiplication.ptx"));
 
 /// Wrapper struct for APIs using GPU
-#[derive(Default)]
-pub struct GPUApiWrapper<F: FromFieldBinding<FieldBinding> + ToFieldBinding<FieldBinding>>(PhantomData<F>);
+pub struct GPUApiWrapper<F: FromFieldBinding<FieldBinding> + ToFieldBinding<FieldBinding>> {
+    gpu: Arc<CudaDevice>,
+    _marker: PhantomData<F>,
+}
 
 impl<F: FromFieldBinding<FieldBinding> + ToFieldBinding<FieldBinding>> GPUApiWrapper<F> {
+    pub fn setup() -> Result<Self, DriverError> {
+        // setup GPU device
+        let now = Instant::now();
+        let gpu = CudaDevice::new(0)?;
+        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
+        Ok(Self {
+            gpu,
+            _marker: PhantomData,
+        })
+    }
+
+    pub fn load_ptx(
+        &self,
+        ptx: &str,
+        module_name: &str,
+        func_names: &[&'static str],
+    ) -> Result<(), DriverError> {
+        // compile ptx
+        let now = Instant::now();
+        let ptx = Ptx::from_src(ptx);
+        self.gpu.load_ptx(ptx, module_name, &func_names)?;
+        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
+        Ok(())
+    }
+
     pub fn evaluate_poly(
         &self,
         num_vars: usize,
         poly_coeffs: &[F],
         point: &[F],
     ) -> Result<F, DriverError> {
-        // setup GPU device
-        let now = Instant::now();
-
-        let gpu = CudaDevice::new(0)?;
-
-        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
-
-        // compile ptx
-        let now = Instant::now();
-
-        let ptx = Ptx::from_src(MULTILINEAR_POLY_KERNEL);
-        gpu.load_ptx(ptx, "multilinear", &["evaluate", "evaluate_optimized"])?;
-
-        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
-
         let now = Instant::now();
         let point_montgomery = point
             .into_iter()
@@ -54,41 +69,34 @@ impl<F: FromFieldBinding<FieldBinding> + ToFieldBinding<FieldBinding>> GPUApiWrapper<F> {
             .collect_vec();
 
         // copy to GPU
-        let gpu_coeffs = gpu.htod_copy(
+        let gpu_coeffs = self.gpu.htod_copy(
             poly_coeffs
                 .into_par_iter()
                 .map(|&coeff| F::to_canonical_form(coeff))
                 .collect(),
         )?;
 
-        let gpu_eval_point = gpu.htod_copy(point_montgomery)?;
-        let monomial_evals = gpu.htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
-        let mutex = gpu.alloc_zeros::<i32>(1)?;
-        let result = gpu.htod_copy(vec![FieldBinding::default(); 1])?;
-
+        let gpu_eval_point = self.gpu.htod_copy(point_montgomery)?;
+        let monomial_evals = self
+            .gpu
+            .htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
         println!("Time taken to initialise data: {:.2?}", now.elapsed());
 
         let now = Instant::now();
-
-        let evaluate_optimized = gpu.get_func("multilinear", "evaluate_optimized").unwrap();
+        let evaluate_optimized = self
+            .gpu
+            .get_func("multilinear", "evaluate_optimized")
+            .unwrap();
         unsafe {
             evaluate_optimized.launch(
                 LaunchConfig::for_num_elems(1 << num_vars as u32),
-                (
-                    &gpu_coeffs,
-                    &gpu_eval_point,
-                    num_vars,
-                    &monomial_evals,
-                    &result,
-                    &mutex,
-                ),
+                (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
             )?;
         };
-
         println!("Time taken to call kernel: {:.2?}", now.elapsed());
 
         let now = Instant::now();
-        let monomial_evals = gpu.sync_reclaim(monomial_evals)?;
+        let monomial_evals = self.gpu.sync_reclaim(monomial_evals)?;
         println!("Time taken to synchronize: {:.2?}", now.elapsed());
 
         let now = Instant::now();
@@ -100,27 +108,48 @@ impl<F: FromFieldBinding<FieldBinding> + ToFieldBinding<FieldBinding>> GPUApiWrapper<F> {
         println!("Time taken to calculate sum: {:.2?}", now.elapsed());
         Ok(result)
     }
+
+    pub fn mul(&self, values: &[F; 2]) -> Result<F, DriverError> {
+        let now = Instant::now();
+        let gpu_values = self
+            .gpu
+            .htod_copy(values.map(|v| F::to_montgomery_form(v)).to_vec())?;
+        let results = self.gpu.htod_copy(vec![FieldBinding::default(); 1])?;
+        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let mul = self.gpu.get_func("scalar_multiplication", "mul").unwrap();
+        unsafe {
+            mul.launch(
+                LaunchConfig::for_num_elems(1 as u32),
+                (&gpu_values, &results)
+            )?;
+        }
+        println!("Time taken to call kernel: {:.2?}", now.elapsed());
+
+        let now = Instant::now();
+        let results = self.gpu.sync_reclaim(results)?;
+        println!("Time taken to synchronize: {:.2?}", now.elapsed());
+        Ok(F::from_canonical_form(results[0]))
+    }
 }
 
 #[cfg(test)]
 mod tests {
-    use std::{cmp::Ordering, default, fmt::Error, time::Instant};
+    use std::time::Instant;
 
-    use cudarc::{
-        driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
-        nvrtc::Ptx,
-    };
+    use cudarc::driver::DriverError;
     use ff::{Field, PrimeField};
     use halo2curves::bn256::Fr;
     use itertools::Itertools;
     use rand::rngs::OsRng;
     use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 
-    use crate::{field::{FromFieldBinding, ToFieldBinding}, FieldBinding, MULTILINEAR_POLY_KERNEL};
+    use crate::{MULTILINEAR_POLY_KERNEL, SCALAR_MULTIPLICATION_KERNEL};
 
     use super::GPUApiWrapper;
 
-    fn evaluate_poly_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
+    fn evaluate_poly_cpu<F: Field>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
         poly_coeffs
             .par_iter()
             .enumerate()
@@ -141,11 +170,18 @@ mod tests {
 
     #[test]
    fn test_evaluate_poly() -> Result<(), DriverError> {
-        let num_vars = 22;
+        let num_vars = 18;
         let rng = OsRng::default();
         let poly_coeffs = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
         let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
-        let gpu_api_wrapper = GPUApiWrapper::<Fr>::default();
+
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(
+            MULTILINEAR_POLY_KERNEL,
+            "multilinear",
+            &["evaluate_optimized"],
+        )?;
+
         let now = Instant::now();
         let eval_poly_result_by_cpu = evaluate_poly_cpu(&poly_coeffs, &point, num_vars);
         println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
@@ -161,64 +197,22 @@ mod tests {
 
     #[test]
     fn test_scalar_multiplication() -> Result<(), DriverError> {
-        // setup GPU device
-        let now = Instant::now();
+        let rng = OsRng::default();
+        let values = [(); 2].map(|_| Fr::random(rng));
+        let expected = values[0] * values[1];
 
-        let gpu = CudaDevice::new(0)?;
-
-        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
-
-        // compile ptx
-        let now = Instant::now();
-
-        let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
-        gpu.load_ptx(ptx, "my_module", &["mul"])?;
-
-        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
-
-        let a = Fr::from(2);
-        let b = Fr::TWO_INV;
-
-        println!("a * b : {:?}", a * b);
-
-        let a_data = FieldBinding { data: [2, 0, 0, 0] };
-
-        let b_data = FieldBinding {
-            data: [
-                0xa1f0fac9f8000001,
-                0x9419f4243cdcb848,
-                0xdc2822db40c0ac2e,
-                0x183227397098d014,
-            ],
-        };
-
-        // copy to GPU
-        let gpu_field_structs = gpu.htod_copy(vec![a_data, b_data])?;
-        let results = gpu.htod_copy(vec![FieldBinding::default(); 1024])?;
-
-        println!("Time taken to initialise data: {:.2?}", now.elapsed());
+        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
+        gpu_api_wrapper.load_ptx(
+            SCALAR_MULTIPLICATION_KERNEL,
+            "scalar_multiplication",
+            &["mul"],
+        )?;
 
         let now = Instant::now();
+        let actual = gpu_api_wrapper.mul(&values)?;
+        println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());
 
-        let f = gpu.get_func("my_module", "mul").unwrap();
-
-        unsafe {
-            f.launch(
-                LaunchConfig::for_num_elems(1024 as u32),
-                (&gpu_field_structs, &results),
-            )
-        }?;
-
-        println!("Time taken to call kernel: {:.2?}", now.elapsed());
-
-        let results = gpu.sync_reclaim(results)?;
-
-        results.iter().for_each(|result| {
-            assert_eq!(result.data[0], 1);
-            assert_eq!(result.data[1], 0);
-            assert_eq!(result.data[1], 0);
-            assert_eq!(result.data[1], 0);
-        });
+        assert_eq!(actual, expected);
         Ok(())
     }
 }