Mirror of https://github.com/pseXperiments/cuda-sumcheck.git
Cleanup
@@ -39,11 +39,11 @@ extern "C" __global__ void combine_and_sum(
 extern "C" __global__ void fold_into_half(
     unsigned int num_vars, unsigned int initial_poly_size, unsigned int num_blocks_per_poly, fr* polys, fr* buf, fr* challenge
 ) {
-    int tid = threadIdx.x;
+    int tid = (blockIdx.x % num_blocks_per_poly) * blockDim.x + threadIdx.x;
     const int stride = 1 << (num_vars - 1);
-    const int buf_offset = (blockIdx.x / num_blocks_per_poly) * stride + (blockIdx.x % num_blocks_per_poly) * blockDim.x;
-    const int poly_offset = (blockIdx.x / num_blocks_per_poly) * initial_poly_size + (blockIdx.x % num_blocks_per_poly) * blockDim.x;
-    while (tid < stride && buf_offset < (blockIdx.x / num_blocks_per_poly + 1) * stride) {
+    const int buf_offset = (blockIdx.x / num_blocks_per_poly) * stride;
+    const int poly_offset = (blockIdx.x / num_blocks_per_poly) * initial_poly_size;
+    while (tid < stride) {
         buf[buf_offset + tid] = (*challenge) * (polys[poly_offset + tid + stride] - polys[poly_offset + tid]) + polys[poly_offset + tid];
         tid += blockDim.x * num_blocks_per_poly;
     }
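For orientation, a minimal, self-contained sketch of the new indexing scheme follows, with plain float standing in for the repo's fr field type; the kernel body mirrors the + lines above, while the sizes and the main() harness are illustrative, not taken from the repository.

// fold_indexing_demo.cu — sketch only; float in place of fr, toy sizes.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fold_into_half_sketch(
    unsigned int num_vars, unsigned int initial_poly_size,
    unsigned int num_blocks_per_poly, float* polys, float* buf, float* challenge
) {
    // Starting index within this polynomial's half-sized output range:
    // blockIdx.x % num_blocks_per_poly is the block's rank inside the group
    // of blocks assigned to one polynomial.
    int tid = (blockIdx.x % num_blocks_per_poly) * blockDim.x + threadIdx.x;
    const int stride = 1 << (num_vars - 1);
    // blockIdx.x / num_blocks_per_poly selects which polynomial this block serves.
    const int buf_offset = (blockIdx.x / num_blocks_per_poly) * stride;
    const int poly_offset = (blockIdx.x / num_blocks_per_poly) * initial_poly_size;
    while (tid < stride) {
        buf[buf_offset + tid] = (*challenge) * (polys[poly_offset + tid + stride] - polys[poly_offset + tid]) + polys[poly_offset + tid];
        // Grid-stride step over the group's combined thread count.
        tid += blockDim.x * num_blocks_per_poly;
    }
}

int main() {
    const unsigned int num_vars = 10, num_polys = 2;
    const unsigned int poly_size = 1u << num_vars, half = poly_size >> 1;
    const unsigned int num_blocks_per_poly = 4, threads = 32;
    float *polys, *buf, *challenge;
    cudaMallocManaged(&polys, num_polys * poly_size * sizeof(float));
    cudaMallocManaged(&buf, num_polys * half * sizeof(float));
    cudaMallocManaged(&challenge, sizeof(float));
    *challenge = 0.5f;
    for (unsigned int i = 0; i < num_polys * poly_size; i++) polys[i] = (float)i;
    fold_into_half_sketch<<<num_blocks_per_poly * num_polys, threads>>>(
        num_vars, poly_size, num_blocks_per_poly, polys, buf, challenge);
    cudaDeviceSynchronize();
    // buf[0] = polys[0] + 0.5 * (polys[512] - polys[0]) = 256
    printf("buf[0] = %.1f (expected 256.0)\n", buf[0]);
    return 0;
}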
@@ -52,9 +52,9 @@ extern "C" __global__ void fold_into_half(
 extern "C" __global__ void fold_into_half_in_place(
     unsigned int num_vars, unsigned int initial_poly_size, unsigned int num_blocks_per_poly, fr* polys, fr* challenge
 ) {
-    int tid = threadIdx.x;
+    int tid = (blockIdx.x % num_blocks_per_poly) * blockDim.x + threadIdx.x;
     const int stride = 1 << (num_vars - 1);
-    const int offset = (blockIdx.x / num_blocks_per_poly) * initial_poly_size + (blockIdx.x % num_blocks_per_poly) * blockDim.x;
+    const int offset = (blockIdx.x / num_blocks_per_poly) * initial_poly_size;
     while (tid < stride) {
         int idx = offset + tid;
         polys[idx] = (*challenge) * (polys[idx + stride] - polys[idx]) + polys[idx];
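Both kernels compute one sumcheck fold: writing r for the challenge, each output entry is p[i] + r * (p[i + stride] - p[i]), i.e. the multilinear interpolation (1 - r) * p(0, x) + r * p(1, x), which halves the evaluation table from 2^num_vars to 2^(num_vars - 1) entries. The in-place variant overwrites the low half of polys instead of writing to buf.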
@@ -55,7 +55,8 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
         mut buf: RefMut<CudaViewMut<FieldBinding>>,
         mut round_evals: RefMut<CudaViewMut<FieldBinding>>,
     ) -> Result<(), DriverError> {
-        let num_blocks_per_poly = self.max_blocks_per_sm()?;
+        let num_blocks_per_poly = self.max_blocks_per_sm()? / num_polys * self.num_sm()?;
+        let num_threads_per_block = 1024;
         for k in 0..max_degree + 1 {
             let device_k = self
                 .gpu
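The old value occupied at most one SM's worth of blocks per polynomial; the new one spreads each polynomial across the whole device: max_blocks_per_sm / num_polys resident blocks per SM for each polynomial, times the SM count. The expression evaluates left to right, so on a hypothetical device with max_blocks_per_sm = 16 and num_sm = 46, four polynomials get (16 / 4) * 46 = 184 blocks each, 736 blocks in total; note the integer division truncates when num_polys does not divide max_blocks_per_sm.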
@@ -63,7 +64,7 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
             let fold_into_half = self.gpu.get_func("sumcheck", "fold_into_half").unwrap();
             let launch_config = LaunchConfig {
                 grid_dim: ((num_blocks_per_poly * num_polys) as u32, 1, 1),
-                block_dim: (1024, 1, 1),
+                block_dim: (num_threads_per_block as u32, 1, 1),
                 shared_mem_bytes: 0,
             };
             unsafe {
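This grid shape is exactly what the kernel's index arithmetic assumes: with grid_dim = num_blocks_per_poly * num_polys, blockIdx.x / num_blocks_per_poly recovers the polynomial a block serves and blockIdx.x % num_blocks_per_poly its rank within that polynomial's group. The block_dim now names the same 1024 through num_threads_per_block instead of repeating the literal.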
@@ -130,10 +131,11 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
             .gpu
             .get_func("sumcheck", "fold_into_half_in_place")
             .unwrap();
-        let num_blocks_per_poly = self.max_blocks_per_sm()?;
+        let num_blocks_per_poly = self.max_blocks_per_sm()? / num_polys * self.num_sm()?;
+        let num_threads_per_block = 1024;
         let launch_config = LaunchConfig {
             grid_dim: ((num_blocks_per_poly * num_polys) as u32, 1, 1),
-            block_dim: (1024, 1, 1),
+            block_dim: (num_threads_per_block as u32, 1, 1),
             shared_mem_bytes: 0,
         };
         unsafe {
@@ -171,7 +173,7 @@ mod tests {
         let max_degree = 4;
         let rng = OsRng::default();

-        let combine_function = |args: &Vec<Fr>| args.iter().product();
+        let combine_function = |args: &Vec<Fr>| args.iter().sum();

         let polys = (0..num_polys)
             .map(|_| (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec())
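The combiner receives the evaluations of all num_polys polynomials at one point and merges them into a single field element; this test now exercises the sum combiner, sum_j p_j(x), in place of the degree-num_polys product prod_j p_j(x).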
@@ -207,10 +209,9 @@ mod tests {

         // copy polynomials to device
         let gpu_polys = gpu_api_wrapper.copy_to_device(&polys.concat())?;
-        let mut buf = gpu_api_wrapper.copy_to_device(&vec![Fr::ZERO; num_polys << num_vars])?;
+        let mut buf = gpu_api_wrapper.malloc_on_device(num_polys << (num_vars - 1))?;
         let buf_view = RefCell::new(buf.slice_mut(..));
-        let mut round_evals =
-            gpu_api_wrapper.copy_to_device(&vec![Fr::ZERO; max_degree as usize + 1])?;
+        let mut round_evals = gpu_api_wrapper.malloc_on_device(max_degree as usize + 1)?;
         let round_evals_view = RefCell::new(round_evals.slice_mut(..));
         let round = 0;
         let now = Instant::now();
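Both scratch buffers are now sized exactly and left uninitialized on the device: a fold consumes 2^num_vars entries per polynomial but produces only 2^(num_vars - 1), so buf needs num_polys << (num_vars - 1) slots, and round_evals holds one value per evaluation point k = 0..max_degree. Using malloc_on_device also skips the host-to-device copy of a zeroed vector.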
@@ -228,7 +229,7 @@ mod tests {
             "Time taken to eval_at_k_and_combine on gpu: {:.2?}",
             now.elapsed()
         );
-        let gpu_round_evals = gpu_api_wrapper.dtoh_sync_copy(round_evals.slice(..), true)?;
+        let gpu_round_evals = gpu_api_wrapper.dtoh_sync_copy(round_evals.slice(0..(max_degree + 1) as usize), true)?;
         cpu_round_evals
             .iter()
             .zip_eq(gpu_round_evals.iter())
@@ -241,7 +242,7 @@ mod tests {

     #[test]
     fn test_fold_into_half_in_place() -> Result<(), DriverError> {
-        let num_vars = 20;
+        let num_vars = 15;
         let num_polys = 4;

         let rng = OsRng::default();
@@ -310,8 +311,8 @@ mod tests {
     #[test]
     fn test_prove_sumcheck() -> Result<(), DriverError> {
         let num_vars = 19;
-        let num_polys = 9;
-        let max_degree = 1;
+        let num_polys = 4;
+        let max_degree = 4;

         let rng = OsRng::default();
         let polys = (0..num_polys)
@@ -109,6 +109,10 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
         Ok(self.gpu.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR)? as usize)
     }

+    pub fn num_sm(&self) -> Result<usize, DriverError> {
+        Ok(self.gpu.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)? as usize)
+    }
+
     pub fn max_threads_per_sm(&self) -> Result<usize, DriverError> {
         Ok(self.gpu.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR)? as usize)
     }
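For reference, a sketch of what these cudarc driver-attribute queries correspond to in plain CUDA runtime C++ (device 0 assumed; the program and its output wording are illustrative, not from the repository):

// device_attrs.cu — sketch of the equivalent runtime-API queries.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int num_sm = 0, max_blocks_per_sm = 0, max_threads_per_sm = 0;
    cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0);
    cudaDeviceGetAttribute(&max_blocks_per_sm, cudaDevAttrMaxBlocksPerMultiprocessor, 0);
    cudaDeviceGetAttribute(&max_threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, 0);
    printf("SMs: %d, max blocks/SM: %d, max threads/SM: %d\n",
           num_sm, max_blocks_per_sm, max_threads_per_sm);
    return 0;
}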