mirror of
https://github.com/pseXperiments/cuda-sumcheck.git
synced 2026-01-08 23:18:00 -05:00
Add kernel for evaluating multilinear polynomial by evaluation form (only works for single thread block)
This commit is contained in:
@@ -3,31 +3,15 @@
|
||||
|
||||
using namespace bb;
|
||||
|
||||
extern "C" __global__ void evaluate(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
fr coeff = coeffs[index].to_montgomery_form();
|
||||
if (coeff == fr::zero()) {
|
||||
monomial_evals[index] = fr::zero();
|
||||
} else {
|
||||
monomial_evals[index] = coeff;
|
||||
for (int i = 0; i < num_vars; i++) {
|
||||
if (((index >> i) & 1) == 1) {
|
||||
monomial_evals[index] *= point[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
|
||||
extern "C" __global__ void eval_by_coeff(fr* coeffs, fr* point, uint8_t num_vars, fr* monomial_evals) {
|
||||
const int tid = threadIdx.x;
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
auto step_size = 1;
|
||||
int number_of_threads = blockDim.x >> 1;
|
||||
int num_threads = blockDim.x >> 1;
|
||||
bool evaluated = false;
|
||||
|
||||
while (number_of_threads > 0)
|
||||
while (num_threads > 0)
|
||||
{
|
||||
if (!evaluated) {
|
||||
fr coeff = coeffs[idx].to_montgomery_form();
|
||||
@@ -40,7 +24,7 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tid < number_of_threads) // still alive?
|
||||
if (tid < num_threads) // still alive?
|
||||
{
|
||||
const auto fst = blockIdx.x * blockDim.x + tid * step_size * 2;
|
||||
const auto snd = fst + step_size;
|
||||
@@ -48,10 +32,31 @@ extern "C" __global__ void evaluate_optimized(fr* coeffs, fr* point, uint8_t num
|
||||
}
|
||||
|
||||
step_size <<= 1;
|
||||
number_of_threads >>= 1;
|
||||
num_threads >>= 1;
|
||||
__syncthreads();
|
||||
}
|
||||
if (tid == 0) {
|
||||
monomial_evals[idx].self_from_montgomery_form();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void eval(fr* evals, fr* point, uint8_t num_vars, fr* buf) {
|
||||
const int tid = threadIdx.x;
|
||||
auto i = 0;
|
||||
auto num_threads = 1 << (num_vars - 1);
|
||||
while (num_threads > 0) {
|
||||
if (tid < num_threads) {
|
||||
buf[tid] = point[i] * (evals[2 * tid + 1] - evals[2 * tid]) + evals[2 * tid];
|
||||
}
|
||||
__syncthreads();
|
||||
if (tid == 0) {
|
||||
memcpy(evals, buf, num_threads * 32);
|
||||
}
|
||||
i++;
|
||||
num_threads >>= 1;
|
||||
__syncthreads();
|
||||
}
|
||||
if (tid == 0) {
|
||||
buf[0].self_from_montgomery_form();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +60,44 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn eval(&self, num_vars: usize, evals: &[F], point: &[F]) -> Result<F, DriverError> {
|
||||
let now = Instant::now();
|
||||
let point = point
|
||||
.into_iter()
|
||||
.map(|f| F::to_montgomery_form(*f))
|
||||
.collect_vec();
|
||||
|
||||
// copy to GPU
|
||||
let evals = self.gpu.htod_copy(
|
||||
evals
|
||||
.into_par_iter()
|
||||
.map(|&eval| F::to_montgomery_form(eval))
|
||||
.collect(),
|
||||
)?;
|
||||
let gpu_eval_point = self.gpu.htod_copy(point)?;
|
||||
let buf = self
|
||||
.gpu
|
||||
.htod_copy(vec![FieldBinding::default(); 1 << (num_vars - 1)])?;
|
||||
println!("Time taken to initialise data: {:.2?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let eval = self.gpu.get_func("multilinear", "eval").unwrap();
|
||||
|
||||
unsafe {
|
||||
eval.launch(
|
||||
LaunchConfig::for_num_elems(1 << (num_vars - 1) as u32),
|
||||
(&evals, &gpu_eval_point, num_vars, &buf),
|
||||
)?;
|
||||
};
|
||||
println!("Time taken to call kernel: {:.2?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let buf = self.gpu.sync_reclaim(buf)?;
|
||||
println!("Time taken to synchronize: {:.2?}", now.elapsed());
|
||||
|
||||
Ok(F::from_canonical_form(buf[0]))
|
||||
}
|
||||
|
||||
pub fn eval_by_coeff(
|
||||
&self,
|
||||
num_vars: usize,
|
||||
@@ -86,13 +124,10 @@ impl<F: PrimeField + FromFieldBinding<F> + ToFieldBinding<F>> GPUApiWrapper<F> {
|
||||
println!("Time taken to initialise data: {:.2?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let evaluate_optimized = self
|
||||
.gpu
|
||||
.get_func("multilinear", "evaluate_optimized")
|
||||
.unwrap();
|
||||
let eval_by_coeff = self.gpu.get_func("multilinear", "eval_by_coeff").unwrap();
|
||||
|
||||
unsafe {
|
||||
evaluate_optimized.launch(
|
||||
eval_by_coeff.launch(
|
||||
LaunchConfig::for_num_elems(1 << num_vars as u32),
|
||||
(&gpu_coeffs, &gpu_eval_point, num_vars, &monomial_evals),
|
||||
)?;
|
||||
@@ -153,10 +188,6 @@ mod tests {
|
||||
|
||||
use super::GPUApiWrapper;
|
||||
|
||||
fn eval_cpu<F: PrimeField>(evals: &[F], x: &[F]) -> F {
|
||||
cpu::multilinear::evaluate(evals, x)
|
||||
}
|
||||
|
||||
fn eval_by_coeff_cpu<F: PrimeField>(poly_coeffs: &[F], point: &[F], num_vars: usize) -> F {
|
||||
poly_coeffs
|
||||
.par_iter()
|
||||
@@ -178,7 +209,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_eval() -> Result<(), DriverError> {
|
||||
todo!()
|
||||
let num_vars = 10;
|
||||
let rng = OsRng::default();
|
||||
let evals = (0..1 << num_vars).map(|_| Fr::random(rng)).collect_vec();
|
||||
let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
|
||||
|
||||
let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
|
||||
gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval"])?;
|
||||
|
||||
let now = Instant::now();
|
||||
let eval_poly_result_by_cpu = cpu::multilinear::evaluate(&evals, &point);
|
||||
println!("Time taken to evaluate on cpu: {:.2?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let eval_poly_result_by_gpu = gpu_api_wrapper.eval(num_vars, &evals, &point)?;
|
||||
println!("Time taken to evaluate on gpu: {:.2?}", now.elapsed());
|
||||
|
||||
assert_eq!(eval_poly_result_by_cpu, eval_poly_result_by_gpu);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -189,11 +237,7 @@ mod tests {
|
||||
let point = (0..num_vars).map(|_| Fr::random(rng)).collect_vec();
|
||||
|
||||
let gpu_api_wrapper = GPUApiWrapper::<Fr>::setup()?;
|
||||
gpu_api_wrapper.load_ptx(
|
||||
MULTILINEAR_POLY_KERNEL,
|
||||
"multilinear",
|
||||
&["evaluate_optimized"],
|
||||
)?;
|
||||
gpu_api_wrapper.load_ptx(MULTILINEAR_POLY_KERNEL, "multilinear", &["eval_by_coeff"])?;
|
||||
|
||||
let now = Instant::now();
|
||||
let eval_poly_result_by_cpu = eval_by_coeff_cpu(&poly_coeffs, &point, num_vars);
|
||||
|
||||
Reference in New Issue
Block a user