mirror of
https://github.com/pseXperiments/cuda-sumcheck.git
synced 2026-01-08 23:18:00 -05:00
Cleanup
This commit is contained in:
@@ -10,6 +10,7 @@ fn main() {
    // Tell cargo to invalidate the built crate whenever files of interest change.
    println!("cargo:rerun-if-changed=src/cuda/kernels/multilinear.cu");
    println!("cargo:rerun-if-changed=src/cuda/kernels/sumcheck.cu");
    println!("cargo:rerun-if-changed=src/cuda/kernels/scalar_multiplication.cu");

    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
@@ -20,27 +21,33 @@ fn main() {
    let language_std = "c++20"; // Language standard in which device functions are written

    // build the cuda kernels
    let cuda_src = PathBuf::from("src/cuda/kernels/multilinear.cu");
    let ptx_file = out_dir.join("multilinear.ptx");
    let cuda_src = [
        "src/cuda/kernels/multilinear.cu",
        "src/cuda/kernels/scalar_multiplication.cu",
    ]
    .map(|path| PathBuf::from(path));
    let ptx_file = ["multilinear.ptx", "scalar_multiplication.ptx"].map(|file| out_dir.join(file));

    let nvcc_status = Command::new("nvcc")
        .arg("-ptx")
        .arg("-o")
        .arg(&ptx_file)
        .arg(&cuda_src)
        .arg(format!("-arch={}", arch))
        .arg(format!("-code={}", code))
        .arg(format!("-ccbin={}", compiler))
        .arg(format!("-std={}", language_std))
        .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
        .arg("--expt-relaxed-constexpr")
        .status()
        .unwrap();
    for (cuda_src, ptx_file) in cuda_src.into_iter().zip(ptx_file) {
        let nvcc_status = Command::new("nvcc")
            .arg("-ptx")
            .arg("-o")
            .arg(&ptx_file)
            .arg(&cuda_src)
            .arg(format!("-arch={}", arch))
            .arg(format!("-code={}", code))
            .arg(format!("-ccbin={}", compiler))
            .arg(format!("-std={}", language_std))
            .arg("-allow-unsupported-compiler") // workaround to use clang-16 compiler with nvcc
            .arg("--expt-relaxed-constexpr")
            .status()
            .unwrap();

    assert!(
        nvcc_status.success(),
        "Failed to compile CUDA source to PTX."
    );
        assert!(
            nvcc_status.success(),
            "Failed to compile CUDA source to PTX."
        );
    }

    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
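The truncated comment above is the usual lead-in to a bindgen::Builder call in a build script. For reference, a typical invocation looks roughly like the sketch below; the wrapper header path and the callback choice are illustrative assumptions, not the options this repo actually uses (those fall outside the hunk shown here).

    // Illustrative bindgen build-script pattern (header path is hypothetical).
    let bindings = bindgen::Builder::default()
        .header("src/cuda/wrapper.h")
        .parse_callbacks(Box::new(bindgen::CargoCallbacks))
        .generate()
        .expect("unable to generate bindings");
    bindings
        .write_to_file(out_dir.join("bindings.rs"))
        .expect("couldn't write bindings");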
@@ -1,25 +0,0 @@
#include <cassert>
#include <iostream>
#include <cstring>

#include "prime_field.h"
#include "./barretenberg/ecc/curves/bn254/fq.hpp"

int main() {
    bb::fq a = bb::fq(1UL);
    bb::fq b = bb::fq(2U);
    bb::fq c = a + b;
    assert(c == bb::fq(3U));
    c = a * b;
    assert(c == bb::fq(2U));
    // memory layout test
    struct Field val_c { { 1UL, 2UL, 3UL, 4UL } };
    bb::fq val;
    std::memcpy(&val, &val_c, 32);
    assert(val.data[0] == 1UL);
    assert(val.data[1] == 2UL);
    assert(val.data[2] == 3UL);
    assert(val.data[3] == 4UL);
    std::cout << "test ended" << std::endl;
    return 0;
}
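The file deleted above was a standalone check that the plain C Field struct and bb::fq agree on a four-limb, 64-bit, no-padding layout. The Rust side depends on the same property whenever FieldBinding values are copied to the device, so an equivalent check can live on the host instead; the sketch below assumes FieldBinding is a #[repr(C)] struct with a data: [u64; 4] field, which is how it appears to be used elsewhere in this diff.

    // Sketch of a layout check mirroring the removed C++ test: a field element
    // binding should be exactly four 64-bit limbs (32 bytes) with no padding.
    #[repr(C)]
    #[derive(Clone, Copy, Default)]
    pub struct FieldBinding {
        pub data: [u64; 4],
    }

    #[test]
    fn field_binding_layout() {
        assert_eq!(std::mem::size_of::<FieldBinding>(), 32);
        assert_eq!(std::mem::align_of::<FieldBinding>(), 8);
    }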
@@ -19,7 +19,7 @@ extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr*
    return;
}

extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals, fr* result, int* mutex) {
extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
    const int tid = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
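The new signature drops the result and mutex parameters, so evaluate_optimized now only fills monomial_evals and the final accumulation moves to the host after sync_reclaim (see the lib.rs hunks below). A minimal sketch of that host-side sum, assuming the kernel leaves each entry in canonical form, could be:

    // Hypothetical host-side reduction over the reclaimed per-monomial evaluations.
    let result = monomial_evals
        .into_iter()
        .map(|eval| F::from_canonical_form(eval))
        .fold(F::from(0u64), |acc, eval| acc + eval);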
@@ -3,8 +3,6 @@
using namespace bb;

extern "C" __global__ void mul(fr* elems, fr* results) {
    elems[0].self_to_montgomery_form();
    elems[1].self_to_montgomery_form();
    fr temp = elems[0] * elems[1];
    results[threadIdx.x] = temp.from_montgomery_form();
    return;
@@ -5,6 +5,7 @@ use field::{FromFieldBinding, ToFieldBinding};
use itertools::Itertools;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Instant;

pub mod field;
@@ -20,33 +21,47 @@ impl Default for FieldBinding {

// include the compiled PTX code as string
const MULTILINEAR_POLY_KERNEL: &str = include_str!(concat!(env!("OUT_DIR"), "/multilinear.ptx"));
const SCALAR_MULTIPLICATION_KERNEL: &str =
    include_str!(concat!(env!("OUT_DIR"), "/scalar_multiplication.ptx"));

/// Wrapper struct for APIs using GPU
#[derive(Default)]
pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>>(PhantomData<F>);
pub struct GPUApiWrapper<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> {
    gpu: Arc<CudaDevice>,
    _marker: PhantomData<F>,
}

impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
    pub fn setup() -> Result<Self, DriverError> {
        // setup GPU device
        let now = Instant::now();
        let gpu = CudaDevice::new(0)?;
        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());
        Ok(Self {
            gpu,
            _marker: PhantomData,
        })
    }

    pub fn load_ptx(
        &self,
        ptx: &str,
        module_name: &str,
        func_names: &[&'static str],
    ) -> Result<(), DriverError> {
        // compile ptx
        let now = Instant::now();
        let ptx = Ptx::from_src(ptx);
        self.gpu.load_ptx(ptx, module_name, &func_names)?;
        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());
        Ok(())
    }

    pub fn evaluate_poly(
        &self,
        num_vars: usize,
        poly_coeffs: &[F],
        point: &[F],
    ) -> Result<F, DriverError> {
        // setup GPU device
        let now = Instant::now();

        let gpu = CudaDevice::new(0)?;

        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());

        // compile ptx
        let now = Instant::now();

        let ptx = Ptx::from_src(MULTILINEAR_POLY_KERNEL);
        gpu.load_ptx(ptx, "multilinear", &["evaluate", "evaluate_optimized"])?;

        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());

        let now = Instant::now();
        let point_montgomery = point
            .into_iter()
@@ -54,41 +69,34 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
            .collect_vec();

        // copy to GPU
        let gpu_coeffs = gpu.htod_copy(
        let gpu_coeffs = self.gpu.htod_copy(
            poly_coeffs
                .into_par_iter()
                .map(|&coeff| F::to_canonical_form(coeff))
                .collect(),
        )?;
        let gpu_eval_point = gpu.htod_copy(point_montgomery)?;
        let monomial_evals = gpu.htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
        let mutex = gpu.alloc_zeros::<u32>(1)?;
        let result = gpu.htod_copy(vec![FieldBinding::default(); 1])?;

        let gpu_eval_point = self.gpu.htod_copy(point_montgomery)?;
        let monomial_evals = self
            .gpu
            .htod_copy(vec![FieldBinding::default(); 1 << num_vars])?;
        println!("Time taken to initialise data: {:.2?}", now.elapsed());

        let now = Instant::now();

        let evaluate_optimized = gpu.get_func("multilinear", "evaluate_optimized").unwrap();
        let evaluate_optimized = self
            .gpu
            .get_func("multilinear", "evaluate_optimized")
            .unwrap();

        unsafe {
            evaluate_optimized.launch(
                LaunchConfig::for_num_elems(1 << num_vars as u32),
                (
                    &gpu_coeffs,
                    &gpu_eval_point,
                    num_vars,
                    &monomial_evals,
                    &result,
                    &mutex,
                ),
                (&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
            )?;
        };

        println!("Time taken to call kernel: {:.2?}", now.elapsed());

        let now = Instant::now();
        let monomial_evals = gpu.sync_reclaim(monomial_evals)?;
        let monomial_evals = self.gpu.sync_reclaim(monomial_evals)?;
        println!("Time taken to synchronize: {:.2?}", now.elapsed());

        let now = Instant::now();
@@ -100,27 +108,48 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
        println!("Time taken to calculate sum: {:.2?}", now.elapsed());
        Ok(result)
    }

    pub fn mul(&self, values: &[F; 2]) -> Result<F, DriverError> {
        let now = Instant::now();
        let gpu_values = self
            .gpu
            .htod_copy(values.map(|v| F::to_montgomery_form(v)).to_vec())?;
        let results = self.gpu.htod_copy(vec![FieldBinding::default(); 1])?;
        println!("Time taken to initialise data: {:.2?}", now.elapsed());

        let now = Instant::now();
        let mul = self.gpu.get_func("scalar_multiplication", "mul").unwrap();
        unsafe {
            mul.launch(
                LaunchConfig::for_num_elems(1 as u32),
                (&gpu_values, &results)
            )?;
        }
        println!("Time taken to call kernel: {:.2?}", now.elapsed());

        let now = Instant::now();
        let results = self.gpu.sync_reclaim(results)?;
        println!("Time taken to synchronize: {:.2?}", now.elapsed());
        Ok(F::from_canonical_form(results[0]))
    }
}
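With device setup and PTX loading split out of evaluate_poly, a caller now initialises the device once and reuses it across calls. A rough usage sketch, assuming the test module's imports (Fr, OsRng, Itertools) are in scope, is:

    // Hypothetical end-to-end usage of the refactored wrapper.
    fn demo() -> Result<(), DriverError> {
        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
        gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["evaluate_optimized"])?;
        gpu_api_wrapper.load_ptx(SCALAR_MULTIPLICATION_KERNEL, "scalar_multiplication", &["mul"])?;

        let num_vars = 10;
        let coeffs = (0..1 << num_vars).map(|_| Fr::random(OsRng)).collect_vec();
        let point = (0..num_vars).map(|_| Fr::random(OsRng)).collect_vec();
        let eval = gpu_api_wrapper.evaluate_poly(num_vars, &coeffs, &point)?;
        println!("evaluation: {:?}", eval);

        let product = gpu_api_wrapper.mul(&[Fr::from(2u64), Fr::from(3u64)])?;
        assert_eq!(product, Fr::from(6u64));
        Ok(())
    }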
#[cfg(test)]
mod tests {
    use std::{cmp::Ordering, default, fmt::Error, time::Instant};
    use std::time::Instant;

    use cudarc::{
        driver::{CudaDevice, DriverError, LaunchAsync, LaunchConfig},
        nvrtc::Ptx,
    };
    use cudarc::driver::DriverError;
    use ff::{Field, PrimeField};
    use halo2curves::bn256::Fr;
    use itertools::Itertools;
    use rand::rngs::OsRng;
    use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

    use crate::{field::{FromFieldBinding, ToFieldBinding}, FieldBinding, MULTILINEAR_POLY_KERNEL};
    use crate::{MULTILINEAR_POLY_KERNEL, SCALAR_MULTIPLICATION_KERNEL};

    use super::GPUApiWrapper;

    fn evaluate_poly_cpu<F: Field>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
    fn evaluate_poly_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
        poly_coeffs
            .par_iter()
            .enumerate()
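Only the opening of evaluate_poly_cpu is visible in this hunk. For context, a self-contained reference implementation of monomial-basis multilinear evaluation (a sketch of the same computation, not necessarily the exact body used here) is:

    // Evaluate a multilinear polynomial given by monomial-basis coefficients:
    // coefficient i is weighted by the product of point[j] over the bits j set in i.
    fn evaluate_poly_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
        poly_coeffs
            .iter()
            .enumerate()
            .map(|(i, coeff)| {
                (0..num_vars)
                    .filter(|j| (i >> j) & 1 == 1)
                    .fold(*coeff, |acc, j| acc * point[j])
            })
            .fold(F::from(0u64), |acc, term| acc + term)
    }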
@@ -141,11 +170,18 @@ mod tests {

    #[test]
    fn test_evaluate_poly() -> Result<(), DriverError> {
        let num_vars = 22;
        let num_vars = 18;
        let rng = OsRng::default();
        let poly_coeffs = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
        let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
        let gpu_api_wrapper = GPUApiWrapper::<Fr>::default();

        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
        gpu_api_wrapper.load_ptx(
            MULTILINEAR_POLY_KERNEL,
            "multilinear",
            &["evaluate_optimized"],
        )?;

        let now = Instant::now();
        let eval_poly_result_by_cpu = evaluate_poly_cpu(&poly_coeffs, &point, num_vars);
        println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
@@ -161,64 +197,22 @@ mod tests {

    #[test]
    fn test_scalar_multiplication() -> Result<(), DriverError> {
        // setup GPU device
        let now = Instant::now();
        let rng = OsRng::default();
        let values = [(); 2].map(|_| Fr::random(rng));
        let expected = values[0] * values[1];

        let gpu = CudaDevice::new(0)?;

        println!("Time taken to initialise CUDA: {:.2?}", now.elapsed());

        // compile ptx
        let now = Instant::now();

        let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
        gpu.load_ptx(ptx, "my_module", &["mul"])?;

        println!("Time taken to compile and load PTX: {:.2?}", now.elapsed());

        let a = Fr::from(2);
        let b = Fr::TWO_INV;

        println!("a * b : {:?}", a * b);

        let a_data = FieldBinding { data: [2, 0, 0, 0] };

        let b_data = FieldBinding {
            data: [
                0xa1f0fac9f8000001,
                0x9419f4243cdcb848,
                0xdc2822db40c0ac2e,
                0x183227397098d014,
            ],
        };

        // copy to GPU
        let gpu_field_structs = gpu.htod_copy(vec![a_data, b_data])?;
        let results = gpu.htod_copy(vec![FieldBinding::default(); 1024])?;

        println!("Time taken to initialise data: {:.2?}", now.elapsed());
        let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
        gpu_api_wrapper.load_ptx(
            SCALAR_MULTIPLICATION_KERNEL,
            "scalar_multiplication",
            &["mul"],
        )?;

        let now = Instant::now();
        let actual = gpu_api_wrapper.mul(&values)?;
        println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());

        let f = gpu.get_func("my_module", "mul").unwrap();

        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1024 as u32),
                (&gpu_field_structs, &results),
            )
        }?;

        println!("Time taken to call kernel: {:.2?}", now.elapsed());

        let results = gpu.sync_reclaim(results)?;

        results.iter().for_each(|result| {
            assert_eq!(result.data[0], 1);
            assert_eq!(result.data[1], 0);
            assert_eq!(result.data[1], 0);
            assert_eq!(result.data[1], 0);
        });
        assert_eq!(actual, expected);
        Ok(())
    }
}