This commit is contained in:
DoHoonKim8
2024-06-28 12:16:21 +00:00
committed by DoHoon Kim
parent 1663d6c6f9
commit 0c1e9396cc
5 changed files with 119 additions and 145 deletions

View File

@@ -10,6 +10,7 @@ fn main() {
// Tell cargo to invalidate the built crate whenever files of interest changes.
println!("cargo:rerun-if-changed=src/cuda/kernels/multilinear.cu");
println!("cargo:rerun-if-changed=src/cuda/kernels/sumcheck.cu");
println!("cargo:rerun-if-changed=src/cuda/kernels/scalar_multiplication.cu");
let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
@@ -20,27 +21,33 @@ fn main() {
let language_std = "c++20"; // Language standard in which device functions are written
// build the cuda kernels
let cuda_src = PathBuf::from("src/cuda/kernels/multilinear.cu");
let ptx_file = out_dir.join("multilinear.ptx");
let cuda_src = [
"src/cuda/kernels/multilinear.cu",
"src/cuda/kernels/scalar_multiplication.cu",
]
.map(|path| PathBuf::from(path));
let ptx_file = ["multilinear.ptx", "scalar_multiplication.ptx"].map(|file| out_dir.join(file));
let nvcc_status = Command::new("nvcc")
.arg("-ptx")
.arg("-o")
.arg(&ptx_file)
.arg(&cuda_src)
.arg(format!("-arch={}", arch))
.arg(format!("-code={}", code))
.arg(format!("-ccbin={}", compiler))
.arg(format!("-std={}", language_std))
.arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
.arg("--expt-relaxed-constexpr")
.status()
.unwrap();
for (cuda_src, ptx_file) in cuda_src.into_iter().zip(ptx_file) {
let nvcc_status = Command::new("nvcc")
.arg("-ptx")
.arg("-o")
.arg(&ptx_file)
.arg(&cuda_src)
.arg(format!("-arch={}", arch))
.arg(format!("-code={}", code))
.arg(format!("-ccbin={}", compiler))
.arg(format!("-std={}", language_std))
.arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
.arg("--expt-relaxed-constexpr")
.status()
.unwrap();
assert!(
nvcc_status.success(),
"Failed to compile CUDA source to PTX."
);
assert!(
nvcc_status.success(),
"Failed to compile CUDA source to PTX."
);
}
// The bindgen::Builder is the main entry point
// to bindgen, and lets you build up options for

View File

@@ -1,25 +0,0 @@
#include <cassert>
#include <iostream>
#include <cstring>
#include "prime_field.h"
#include "./barretenberg/ecc/curves/bn254/fq.hpp"
int main() {
bb::fq a = bb::fq(1UL);
bb::fq b = bb::fq(2U);
bb::fq c = a + b;
assert(c == bb::fq(3U));
c = a * b;
assert(c == bb::fq(2U));
// memory layout test
struct Field val_c { { 1UL, 2UL, 3UL, 4UL } };
bb::fq val;
std::memcpy(&val, &val_c, 32);
assert(val.data[0] == 1UL);
assert(val.data[1] == 2UL);
assert(val.data[2] == 3UL);
assert(val.data[3] == 4UL);
std::cout << "test ended" << std::endl;
return 0;
}

View File

@@ -19,7 +19,7 @@ extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr*
return;
}
extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals, fr* result, int* mutex) {
extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
const int tid = threadIdx.x;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -3,8 +3,6 @@
using namespace bb;
extern "C" __global__ void mul(fr* elems, fr* results) {
elems[0].self_to_montgomery_form();
elems[1].self_to_montgomery_form();
fr temp = elems[0] * elems[1];
results[threadIdx.x] = temp.from_montgomery_form();
return;

View File

@@ -5,6 +5,7 @@ use field::{FromFieldBinding, ToFieldBinding};
use itertools::Itertools;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Instant;
pub mod field;
@@ -20,33 +21,47 @@ impl Default for FieldBinding {
// include the compiled PTX code as string
const MULTILINEAR_POLY_KERNEL: &str = include_str!(concat!(env!("OUT_DIR"), "/multilinear.ptx"));
const SCALAR_MULTIPLICATION_KERNEL: &str =
include_str!(concat!(env!("OUT_DIR"), "/scalar_multiplication.ptx"));
/// Wrapper struct for APIs using GPU
#[derive(Default)]
pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>>(PhantomData<F>);
pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> {
gpu: Arc<CudaDevice>,
_marker: PhantomData<F>,
}
impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
pub fn setup() -> Result<Self, DriverError> {
// setup GPU device
let now = Instant::now();
let gpu = CudaDevice::new(0)?;
println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
Ok(Self {
gpu,
_marker: PhantomData,
})
}
pub fn load_ptx(
&self,
ptx: &str,
module_name: &str,
func_names: &[&'static str],
) -> Result<(), DriverError> {
// compile ptx
let now = Instant::now();
let ptx = Ptx::from_src(ptx);
self.gpu.load_ptx(ptx, module_name, &func_names)?;
println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
Ok(())
}
pub fn evaluate_poly(
&self,
num_vars: usize,
poly_coeffs: &[F],
point: &[F],
) -> Result<F, DriverError> {
// setup GPU device
let now = Instant::now();
let gpu = CudaDevice::new(0)?;
println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
// compile ptx
let now = Instant::now();
let ptx = Ptx::from_src(MULTILINEAR_POLY_KERNEL);
gpu.load_ptx(ptx, "multilinear", &["evaluate", "evaluate_optimized"])?;
println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
let now = Instant::now();
let point_montgomery = point
.into_iter()
@@ -54,41 +69,34 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
.collect_vec();
// copy to GPU
let gpu_coeffs = gpu.htod_copy(
let gpu_coeffs = self.gpu.htod_copy(
poly_coeffs
.into_par_iter()
.map(|&coeff| F::to_canonical_form(coeff))
.collect(),
)?;
let gpu_eval_point = gpu.htod_copy(point_montgomery)?;
let monomial_evals = gpu.htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
let mutex = gpu.alloc_zeros::<u32>(1)?;
let result = gpu.htod_copy(vec![FieldBinding::default(); 1])?;
let gpu_eval_point = self.gpu.htod_copy(point_montgomery)?;
let monomial_evals = self
.gpu
.htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
println!("Time taken to initialise data: {:.2?}", now.elapsed());
let now = Instant::now();
let evaluate_optimized = gpu.get_func("multilinear", "evaluate_optimized").unwrap();
let evaluate_optimized = self
.gpu
.get_func("multilinear", "evaluate_optimized")
.unwrap();
unsafe {
evaluate_optimized.launch(
LaunchConfig::for_num_elems(1 << num_vars as u32),
(
&gpu_coeffs,
&gpu_eval_point,
num_vars,
&monomial_evals,
&result,
&mutex,
),
(&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
)?;
};
println!("Time taken to call kernel: {:.2?}", now.elapsed());
let now = Instant::now();
let monomial_evals = gpu.sync_reclaim(monomial_evals)?;
let monomial_evals = self.gpu.sync_reclaim(monomial_evals)?;
println!("Time taken to synchronize: {:.2?}", now.elapsed());
let now = Instant::now();
@@ -100,27 +108,48 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
println!("Time taken to calculate sum: {:.2?}", now.elapsed());
Ok(result)
}
pub fn mul(&self, values: &[F; 2]) -> Result<F, DriverError> {
let now = Instant::now();
let gpu_values = self
.gpu
.htod_copy(values.map(|v| F::to_montgomery_form(v)).to_vec())?;
let results = self.gpu.htod_copy(vec![FieldBinding::default(); 1])?;
println!("Time taken to initialise data: {:.2?}", now.elapsed());
let now = Instant::now();
let mul = self.gpu.get_func("scalar_multiplication", "mul").unwrap();
unsafe {
mul.launch(
LaunchConfig::for_num_elems(1 as u32),
(&gpu_values, &results)
)?;
}
println!("Time taken to call kernel: {:.2?}", now.elapsed());
let now = Instant::now();
let results = self.gpu.sync_reclaim(results)?;
println!("Time taken to synchronize: {:.2?}", now.elapsed());
Ok(F::from_canonical_form(results[0]))
}
}
#[cfg(test)]
mod tests {
use std::{cmp::Ordering, default, fmt::Error, time::Instant};
use std::time::Instant;
use cudarc::{
driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
nvrtc::Ptx,
};
use cudarc::driver::DriverError;
use ff::{Field, PrimeField};
use halo2curves::bn256::Fr;
use itertools::Itertools;
use rand::rngs::OsRng;
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
use crate::{field::{FromFieldBinding, ToFieldBinding}, FieldBinding, MULTILINEAR_POLY_KERNEL};
use crate::{MULTILINEAR_POLY_KERNEL, SCALAR_MULTIPLICATION_KERNEL};
use super::GPUApiWrapper;
fn evaluate_poly_cpu<F: Field>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
fn evaluate_poly_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
poly_coeffs
.par_iter()
.enumerate()
@@ -141,11 +170,18 @@ mod tests {
#[test]
fn test_evaluate_poly() -> Result<(), DriverError> {
let num_vars = 22;
let num_vars = 18;
let rng = OsRng::default();
let poly_coeffs = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
let gpu_api_wrapper = GPUApiWrapper::<Fr>::default();
let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
gpu_api_wrapper.load_ptx(
MULTILINEAR_POLY_KERNEL,
"multilinear",
&["evaluate_optimized"],
)?;
let now = Instant::now();
let eval_poly_result_by_cpu = evaluate_poly_cpu(&poly_coeffs, &point, num_vars);
println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
@@ -161,64 +197,22 @@ mod tests {
#[test]
fn test_scalar_multiplication() -> Result<(), DriverError> {
// setup GPU device
let now = Instant::now();
let rng = OsRng::default();
let values = [(); 2].map(|_| Fr::random(rng));
let expected = values[0] * values[1];
let gpu = CudaDevice::new(0)?;
println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
// compile ptx
let now = Instant::now();
let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
gpu.load_ptx(ptx, "my_module", &["mul"])?;
println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
let a = Fr::from(2);
let b = Fr::TWO_INV;
println!("a * b : {:?}", a * b);
let a_data = FieldBinding { data: [2, 0, 0, 0] };
let b_data = FieldBinding {
data: [
0xa1f0fac9f8000001,
0x9419f4243cdcb848,
0xdc2822db40c0ac2e,
0x183227397098d014,
],
};
// copy to GPU
let gpu_field_structs = gpu.htod_copy(vec![a_data, b_data])?;
let results = gpu.htod_copy(vec![FieldBinding::default(); 1024])?;
println!("Time taken to initialise data: {:.2?}", now.elapsed());
let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
gpu_api_wrapper.load_ptx(
SCALAR_MULTIPLICATION_KERNEL,
"scalar_multiplication",
&["mul"],
)?;
let now = Instant::now();
let actual = gpu_api_wrapper.mul(&values)?;
println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());
let f = gpu.get_func("my_module", "mul").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1024 as u32),
(&gpu_field_structs, &results),
)
}?;
println!("Time taken to call kernel: {:.2?}", now.elapsed());
let results = gpu.sync_reclaim(results)?;
results.iter().for_each(|result| {
assert_eq!(result.data[0], 1);
assert_eq!(result.data[1], 0);
assert_eq!(result.data[1], 0);
assert_eq!(result.data[1], 0);
});
assert_eq!(actual, expected);
Ok(())
}
}