Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-01-07 22:53:56 -05:00)
Add streams capability (#89)
@@ -4,4 +4,4 @@ This PR...

## Linked Issues

Closes #
Resolves #
1  .github/workflows/build.yml  vendored
@@ -12,6 +12,7 @@ on:
 env:
   CARGO_TERM_COLOR: always
   ARCH_TYPE: sm_70
+  DEFAULT_STREAM: per-thread

 jobs:
   build-linux:
4  build.rs
@@ -8,9 +8,12 @@ fn main() {
     println!("cargo:rerun-if-changed=./icicle");

     let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
+    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));

     let mut arch = String::from("-arch=");
     arch.push_str(&arch_type);
+    let mut stream = String::from("-default-stream=");
+    stream.push_str(&stream_type);

     let mut nvcc = cc::Build::new();

@@ -22,6 +25,7 @@ fn main() {
     nvcc.cuda(true);
     nvcc.debug(false);
     nvcc.flag(&arch);
+    nvcc.flag(&stream);
     nvcc.files([
         "./icicle/curves/index.cu",
     ]);
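Context for the new flag (an aside, not part of the diff): nvcc's -default-stream option controls where API calls that name no stream are queued. Under legacy they go to the global NULL stream, which synchronizes with every other stream on the device; under per-thread, each host thread gets its own implicit stream, so unannotated launches from different threads can overlap. A minimal sketch, assuming compilation with -default-stream=per-thread:

// Sketch only: the launch below names no stream. Compiled with
// -default-stream=per-thread, two host threads calling
// scale_on_default_stream() concurrently are not serialized by the
// legacy NULL stream's device-wide synchronization.
#include <cuda_runtime.h>

__global__ void scale(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;
}

void scale_on_default_stream(float *d_x, int n) {
  scale<<<(n + 255) / 256, 256>>>(d_x, n);     // implicit per-thread stream
  cudaStreamSynchronize(cudaStreamPerThread);  // waits on this thread's stream only
}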
@@ -88,7 +88,7 @@ template <typename P, typename A>
 __global__ void accumulate_buckets_kernel(P *buckets, unsigned *bucket_offsets, unsigned *bucket_sizes, unsigned *single_bucket_indices, unsigned *point_indices, A *points, unsigned nof_buckets, unsigned *nof_buckets_to_compute, unsigned msm_idx_shift){

   unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (tid>=*nof_buckets_to_compute){
+  if (tid >= *nof_buckets_to_compute){
     return;
   }
   unsigned msm_index = single_bucket_indices[tid]>>msm_idx_shift;
@@ -106,7 +106,7 @@ template <typename P>
 __global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c){

   unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (tid>=nof_bms) return;
+  if (tid >= nof_bms) return;
   P line_sum = buckets[(tid+1)*(1<<c)-1];
   final_sums[tid] = line_sum;
   for (unsigned i = (1<<c)-2; i >0; i--)
@@ -152,16 +152,16 @@ __global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsig

 //this function computes msm using the bucket method
 template <typename S, typename P, typename A>
-void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device) {
+void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream) {

   S *d_scalars;
   A *d_points;
   if (!on_device) {
     //copy scalars and point to gpu
-    cudaMalloc(&d_scalars, sizeof(S) * size);
-    cudaMalloc(&d_points, sizeof(A) * size);
-    cudaMemcpy(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice);
+    cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
+    cudaMallocAsync(&d_points, sizeof(A) * size, stream);
+    cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
   }
   else {
     d_scalars = scalars;
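The replacement pattern above is CUDA's stream-ordered memory API: allocation, copy and free are enqueued on the stream instead of blocking the device. A self-contained sketch of the idiom, assuming CUDA 11.2+ (where cudaMallocAsync/cudaFreeAsync were introduced):

#include <cuda_runtime.h>
#include <cstdlib>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t n = 1 << 20;
  float *h = (float*)malloc(n * sizeof(float));
  for (size_t i = 0; i < n; i++) h[i] = 1.0f;

  // Alloc, copy and free are all ordered on `stream`; none of them
  // implicitly synchronizes other streams or the whole device.
  float *d;
  cudaMallocAsync(&d, n * sizeof(float), stream);
  cudaMemcpyAsync(d, h, n * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaFreeAsync(d, stream);

  // The host may only reuse the buffers once the stream has drained.
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  free(h);
  return 0;
}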
@@ -178,135 +178,140 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi
     nof_bms++;
   }
   unsigned nof_buckets = nof_bms<<c;
-  cudaMalloc(&buckets, sizeof(P) * nof_buckets);
+  cudaMallocAsync(&buckets, sizeof(P) * nof_buckets, stream);

   // launch the bucket initialization kernel with maximum threads
   unsigned NUM_THREADS = 1 << 10;
   unsigned NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, nof_buckets);
+  initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, nof_buckets);

   unsigned *bucket_indices;
   unsigned *point_indices;
-  cudaMalloc(&bucket_indices, sizeof(unsigned) * size * (nof_bms+1));
-  cudaMalloc(&point_indices, sizeof(unsigned) * size * (nof_bms+1));
+  cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms+1), stream);
+  cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms+1), stream);

   //split scalars into digits
   NUM_THREADS = 1 << 10;
   NUM_BLOCKS = (size * (nof_bms+1) + NUM_THREADS - 1) / NUM_THREADS;
-  split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size,
+  split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size,
                        nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later

   //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
   unsigned *sort_indices_temp_storage{};
   size_t sort_indices_temp_storage_bytes;
+  // The second to last parameter is the default value supplied explicitly to allow passing the stream
+  // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
   cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices,
-                                  point_indices + size, point_indices, size);
-  cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes);
+                                  point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream);
+  cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
   for (unsigned i = 0; i < nof_bms; i++) {
     unsigned offset_out = i * size;
     unsigned offset_in = offset_out + size;
-    cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in,
-                                    bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, size);
+    // The second to last parameter is the default value supplied explicitly to allow passing the stream
+    // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
+    cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, bucket_indices + offset_out,
+                                    point_indices + offset_in, point_indices + offset_out, size, 0, sizeof(unsigned) * 8, stream);
   }
-  cudaFree(sort_indices_temp_storage);
+  cudaFreeAsync(sort_indices_temp_storage, stream);
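A note on the CUB calls above: each primitive runs twice, first with a null temp-storage pointer so CUB only reports the bytes it needs, then for real; begin_bit/end_bit are passed their default values (0 and sizeof(unsigned)*8) purely so the stream argument can be supplied positionally. A standalone sketch of the pattern:

// Two-phase temp-storage idiom for cub::DeviceRadixSort::SortPairs on a
// stream (CUB as shipped with the CUDA toolkit).
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void sort_pairs_on_stream(unsigned *d_keys_in, unsigned *d_keys_out,
                          unsigned *d_vals_in, unsigned *d_vals_out,
                          int n, cudaStream_t stream) {
  void *d_temp = nullptr;
  size_t temp_bytes = 0;
  // First call: d_temp == nullptr, so CUB only computes temp_bytes.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out,
                                  d_vals_in, d_vals_out, n,
                                  0, sizeof(unsigned) * 8, stream);
  cudaMallocAsync(&d_temp, temp_bytes, stream);
  // Second call: performs the sort, enqueued on `stream`.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out,
                                  d_vals_in, d_vals_out, n,
                                  0, sizeof(unsigned) * 8, stream);
  cudaFreeAsync(d_temp, stream);
}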
   //find bucket_sizes
   unsigned *single_bucket_indices;
   unsigned *bucket_sizes;
   unsigned *nof_buckets_to_compute;
-  cudaMalloc(&single_bucket_indices, sizeof(unsigned)*nof_buckets);
-  cudaMalloc(&bucket_sizes, sizeof(unsigned)*nof_buckets);
-  cudaMalloc(&nof_buckets_to_compute, sizeof(unsigned));
+  cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*nof_buckets, stream);
+  cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*nof_buckets, stream);
+  cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream);
   unsigned *encode_temp_storage{};
   size_t encode_temp_storage_bytes = 0;
   cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
-                                     nof_buckets_to_compute, nof_bms*size);
-  cudaMalloc(&encode_temp_storage, encode_temp_storage_bytes);
+                                     nof_buckets_to_compute, nof_bms*size, stream);
+  cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
   cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
-                                     nof_buckets_to_compute, nof_bms*size);
-  cudaFree(encode_temp_storage);
+                                     nof_buckets_to_compute, nof_bms*size, stream);
+  cudaFreeAsync(encode_temp_storage, stream);
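The run-length encode step turns the sorted bucket indices into (unique index, count) pairs, which is exactly the per-bucket size table; it uses the same two-phase temp-storage idiom. A compact standalone sketch:

// Run-length encoding sorted keys to obtain unique bucket ids and sizes.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void bucket_sizes_on_stream(unsigned *d_sorted_keys, unsigned *d_unique_out,
                            unsigned *d_counts_out, unsigned *d_num_runs_out,
                            int n, cudaStream_t stream) {
  void *d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceRunLengthEncode::Encode(d_temp, temp_bytes, d_sorted_keys,
                                     d_unique_out, d_counts_out,
                                     d_num_runs_out, n, stream);
  cudaMallocAsync(&d_temp, temp_bytes, stream);
  cub::DeviceRunLengthEncode::Encode(d_temp, temp_bytes, d_sorted_keys,
                                     d_unique_out, d_counts_out,
                                     d_num_runs_out, n, stream);
  cudaFreeAsync(d_temp, stream);
}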
   //get offsets - where does each new bucket begin
   unsigned* bucket_offsets;
-  cudaMalloc(&bucket_offsets, sizeof(unsigned)*nof_buckets);
+  cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*nof_buckets, stream);
   unsigned* offsets_temp_storage{};
   size_t offsets_temp_storage_bytes = 0;
-  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets);
-  cudaMalloc(&offsets_temp_storage, offsets_temp_storage_bytes);
-  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets);
-  cudaFree(offsets_temp_storage);
+  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
+  cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
+  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
+  cudaFreeAsync(offsets_temp_storage, stream);

   //launch the accumulation kernel with maximum threads
   NUM_THREADS = 1 << 8;
   NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices,
-                            d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize);
+  accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices,
+                            d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize);

   #ifdef SSM_SUM
   //sum each bucket
   NUM_THREADS = 1 << 10;
   NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  ssm_buckets_kernel<fake_point, fake_scalar><<<NUM_BLOCKS, NUM_THREADS>>>(buckets, single_bucket_indices, nof_buckets, c);
+  ssm_buckets_kernel<fake_point, fake_scalar><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);

   //sum each bucket module
   P* final_results;
-  cudaMalloc(&final_results, sizeof(P) * nof_bms);
+  cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
   NUM_THREADS = 1<<c;
   NUM_BLOCKS = nof_bms;
-  sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(buckets, final_results);
+  sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(buckets, final_results);
   #endif

   #ifdef BIG_TRIANGLE
   P* final_results;
-  cudaMalloc(&final_results, sizeof(P) * nof_bms);
+  cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
   //launch the bucket module sum kernel - a thread for each bucket module
   NUM_THREADS = nof_bms;
   NUM_BLOCKS = 1;
-  big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, final_results, nof_bms, c);
+  big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c);
   #endif

   P* d_final_result;
   if (!on_device)
-    cudaMalloc(&d_final_result, sizeof(P));
+    cudaMallocAsync(&d_final_result, sizeof(P), stream);

   //launch the double and add kernel, a single thread
-  final_accumulation_kernel<P, S><<<1,1>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c);
+  final_accumulation_kernel<P, S><<<1,1,0,stream>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c);

   //copy final result to host
-  cudaDeviceSynchronize();
+  cudaStreamSynchronize(stream);
   if (!on_device)
-    cudaMemcpy(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost);
+    cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);

   //free memory
   if (!on_device) {
-    cudaFree(d_points);
-    cudaFree(d_scalars);
-    cudaFree(d_final_result);
+    cudaFreeAsync(d_points, stream);
+    cudaFreeAsync(d_scalars, stream);
+    cudaFreeAsync(d_final_result, stream);
   }
-  cudaFree(buckets);
-  cudaFree(bucket_indices);
-  cudaFree(point_indices);
-  cudaFree(single_bucket_indices);
-  cudaFree(bucket_sizes);
-  cudaFree(nof_buckets_to_compute);
-  cudaFree(bucket_offsets);
-  cudaFree(final_results);
+  cudaFreeAsync(buckets, stream);
+  cudaFreeAsync(bucket_indices, stream);
+  cudaFreeAsync(point_indices, stream);
+  cudaFreeAsync(single_bucket_indices, stream);
+  cudaFreeAsync(bucket_sizes, stream);
+  cudaFreeAsync(nof_buckets_to_compute, stream);
+  cudaFreeAsync(bucket_offsets, stream);
+  cudaFreeAsync(final_results, stream);
+
+  cudaStreamSynchronize(stream);
 }
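Worth spelling out: the function now ends with cudaStreamSynchronize rather than cudaDeviceSynchronize, so waiting for this MSM no longer stalls work queued on other streams. A toy contrast (the kernel is a stand-in, not from the PR):

#include <cuda_runtime.h>

__global__ void work(int *x) { if (x) atomicAdd(x, 1); }

void two_streams(int *d_a, int *d_b) {
  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);
  work<<<1, 1, 0, s1>>>(d_a);
  work<<<1, 1, 0, s2>>>(d_b);
  cudaStreamSynchronize(s1);   // waits only for s1; s2's kernel keeps running
  // cudaDeviceSynchronize() here would instead block until both finish.
  cudaStreamSynchronize(s2);
  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
}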
 //this function computes msm using the bucket method
 template <typename S, typename P, typename A>
-void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device){
+void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream){

   unsigned total_size = batch_size * msm_size;
   S *d_scalars;
   A *d_points;
   if (!on_device) {
     //copy scalars and point to gpu
-    cudaMalloc(&d_scalars, sizeof(S) * total_size);
-    cudaMalloc(&d_points, sizeof(A) * total_size);
-    cudaMemcpy(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice);
+    cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream);
+    cudaMallocAsync(&d_points, sizeof(A) * total_size, stream);
+    cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice, stream);
   }
   else {
     d_scalars = scalars;
@@ -323,125 +328,129 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin
   unsigned bm_bitsize = ceil(log2(nof_bms));
   unsigned nof_buckets = (nof_bms<<c);
   unsigned total_nof_buckets = nof_buckets*batch_size;
-  cudaMalloc(&buckets, sizeof(P) * total_nof_buckets);
+  cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream);

   //lanch the bucket initialization kernel with maximum threads
   unsigned NUM_THREADS = 1 << 10;
   unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, total_nof_buckets);
+  initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);

   unsigned *bucket_indices;
   unsigned *point_indices;
-  cudaMalloc(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size));
-  cudaMalloc(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size));
+  cudaMallocAsync(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
+  cudaMallocAsync(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);

   //split scalars into digits
   NUM_THREADS = 1 << 8;
   NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS;
-  split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size,
+  split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size,
                        msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later

   //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
   unsigned *sorted_bucket_indices;
   unsigned *sorted_point_indices;
-  cudaMalloc(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms));
-  cudaMalloc(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms));
+  cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
+  cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream);

   unsigned *sort_indices_temp_storage{};
   size_t sort_indices_temp_storage_bytes;
+  // The second to last parameter is the default value supplied explicitly to allow passing the stream
+  // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
   cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
-                                  point_indices + msm_size, sorted_point_indices, total_size * nof_bms);
-  cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes);
+                                  point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream);
+  cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
+  // The second to last parameter is the default value supplied explicitly to allow passing the stream
+  // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
   cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
-                                  point_indices + msm_size, sorted_point_indices, total_size * nof_bms);
-  cudaFree(sort_indices_temp_storage);
+                                  point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream);
+  cudaFreeAsync(sort_indices_temp_storage, stream);

   //find bucket_sizes
   unsigned *single_bucket_indices;
   unsigned *bucket_sizes;
   unsigned *total_nof_buckets_to_compute;
-  cudaMalloc(&single_bucket_indices, sizeof(unsigned)*total_nof_buckets);
-  cudaMalloc(&bucket_sizes, sizeof(unsigned)*total_nof_buckets);
-  cudaMalloc(&total_nof_buckets_to_compute, sizeof(unsigned));
+  cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*total_nof_buckets, stream);
+  cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*total_nof_buckets, stream);
+  cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream);
   unsigned *encode_temp_storage{};
   size_t encode_temp_storage_bytes = 0;
   cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
-                                     total_nof_buckets_to_compute, nof_bms*total_size);
-  cudaMalloc(&encode_temp_storage, encode_temp_storage_bytes);
+                                     total_nof_buckets_to_compute, nof_bms*total_size, stream);
+  cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
   cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
-                                     total_nof_buckets_to_compute, nof_bms*total_size);
-  cudaFree(encode_temp_storage);
+                                     total_nof_buckets_to_compute, nof_bms*total_size, stream);
+  cudaFreeAsync(encode_temp_storage, stream);

   //get offsets - where does each new bucket begin
   unsigned* bucket_offsets;
-  cudaMalloc(&bucket_offsets, sizeof(unsigned)*total_nof_buckets);
+  cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*total_nof_buckets, stream);
   unsigned* offsets_temp_storage{};
   size_t offsets_temp_storage_bytes = 0;
-  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets);
-  cudaMalloc(&offsets_temp_storage, offsets_temp_storage_bytes);
-  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets);
-  cudaFree(offsets_temp_storage);
+  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
+  cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
+  cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
+  cudaFreeAsync(offsets_temp_storage, stream);

   //launch the accumulation kernel with maximum threads
   NUM_THREADS = 1 << 8;
   NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices,
+  accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices,
                             d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize);

   #ifdef SSM_SUM
   //sum each bucket
   NUM_THREADS = 1 << 10;
   NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
-  ssm_buckets_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS>>>(buckets, single_bucket_indices, nof_buckets, c);
+  ssm_buckets_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);

   //sum each bucket module
   P* final_results;
-  cudaMalloc(&final_results, sizeof(P) * nof_bms);
+  cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
   NUM_THREADS = 1<<c;
   NUM_BLOCKS = nof_bms;
-  sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(buckets, final_results);
+  sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(buckets, final_results);
   #endif

   #ifdef BIG_TRIANGLE
   P* bm_sums;
-  cudaMalloc(&bm_sums, sizeof(P) * nof_bms * batch_size);
+  cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream);
   //launch the bucket module sum kernel - a thread for each bucket module
   NUM_THREADS = 1<<8;
   NUM_BLOCKS = (nof_bms*batch_size + NUM_THREADS - 1) / NUM_THREADS;
-  big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(buckets, bm_sums, nof_bms*batch_size, c);
+  big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bm_sums, nof_bms*batch_size, c);
   #endif

   P* d_final_results;
   if (!on_device)
-    cudaMalloc(&d_final_results, sizeof(P)*batch_size);
+    cudaMallocAsync(&d_final_results, sizeof(P)*batch_size, stream);

   //launch the double and add kernel, a single thread for each msm
   NUM_THREADS = 1<<8;
   NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
-  final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS>>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c);
+  final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c);

   //copy final result to host
-  cudaDeviceSynchronize();
   if (!on_device)
-    cudaMemcpy(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost);
+    cudaMemcpyAsync(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost, stream);

   //free memory
   if (!on_device) {
-    cudaFree(d_points);
-    cudaFree(d_scalars);
-    cudaFree(d_final_results);
+    cudaFreeAsync(d_points, stream);
+    cudaFreeAsync(d_scalars, stream);
+    cudaFreeAsync(d_final_results, stream);
   }
-  cudaFree(buckets);
-  cudaFree(bucket_indices);
-  cudaFree(point_indices);
-  cudaFree(sorted_bucket_indices);
-  cudaFree(sorted_point_indices);
-  cudaFree(single_bucket_indices);
-  cudaFree(bucket_sizes);
-  cudaFree(total_nof_buckets_to_compute);
-  cudaFree(bucket_offsets);
-  cudaFree(bm_sums);
+  cudaFreeAsync(buckets, stream);
+  cudaFreeAsync(bucket_indices, stream);
+  cudaFreeAsync(point_indices, stream);
+  cudaFreeAsync(sorted_bucket_indices, stream);
+  cudaFreeAsync(sorted_point_indices, stream);
+  cudaFreeAsync(single_bucket_indices, stream);
+  cudaFreeAsync(bucket_sizes, stream);
+  cudaFreeAsync(total_nof_buckets_to_compute, stream);
+  cudaFreeAsync(bucket_offsets, stream);
+  cudaFreeAsync(bm_sums, stream);
+
+  cudaStreamSynchronize(stream);
 }
@@ -456,44 +465,44 @@ __global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N){

 //the function computes msm using ssm
 template <typename S, typename P, typename A>
-void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result){ //works up to 2^8
+void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream){ //works up to 2^8
   S *scalars;
   A *a_points;
   P *p_points;
   P *results;

-  cudaMalloc(&scalars, sizeof(S) * size);
-  cudaMalloc(&a_points, sizeof(A) * size);
-  cudaMalloc(&p_points, sizeof(P) * size);
-  cudaMalloc(&results, sizeof(P) * size);
+  cudaMallocAsync(&scalars, sizeof(S) * size, stream);
+  cudaMallocAsync(&a_points, sizeof(A) * size, stream);
+  cudaMallocAsync(&p_points, sizeof(P) * size, stream);
+  cudaMallocAsync(&results, sizeof(P) * size, stream);

   //copy inputs to device
-  cudaMemcpy(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice);
-  cudaMemcpy(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice);
+  cudaMemcpyAsync(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
+  cudaMemcpyAsync(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);

   //convert to projective representation and multiply each point by its scalar using single scalar multiplication
   unsigned NUM_THREADS = size;
-  to_proj_kernel<<<1,NUM_THREADS>>>(a_points, p_points, size);
-  ssm_kernel<<<1,NUM_THREADS>>>(scalars, p_points, results, size);
+  to_proj_kernel<<<1,NUM_THREADS, 0, stream>>>(a_points, p_points, size);
+  ssm_kernel<<<1,NUM_THREADS, 0, stream>>>(scalars, p_points, results, size);

   P *final_result;
-  cudaMalloc(&final_result, sizeof(P));
+  cudaMallocAsync(&final_result, sizeof(P), stream);

   //assuming msm size is a power of 2
   //sum all the ssm results
   NUM_THREADS = size;
-  sum_reduction_kernel<<<1,NUM_THREADS>>>(results, final_result);
+  sum_reduction_kernel<<<1,NUM_THREADS, 0, stream>>>(results, final_result);

   //copy result to host
-  cudaDeviceSynchronize();
-  cudaMemcpy(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost);
+  cudaStreamSynchronize(stream);
+  cudaMemcpyAsync(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);

   //free memory
-  cudaFree(scalars);
-  cudaFree(a_points);
-  cudaFree(p_points);
-  cudaFree(results);
-  cudaFree(final_result);
+  cudaFreeAsync(scalars, stream);
+  cudaFreeAsync(a_points, stream);
+  cudaFreeAsync(p_points, stream);
+  cudaFreeAsync(results, stream);
+  cudaFreeAsync(final_result, stream);

 }
@@ -529,21 +538,21 @@ unsigned get_optimal_c(const unsigned size) {

 //this function is used to compute msms of size larger than 256
 template <typename S, typename P, typename A>
-void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device){
+void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream){
   unsigned c = get_optimal_c(size);
   // unsigned c = 6;
   // unsigned bitsize = 32;
   unsigned bitsize = 255;
-  bucket_method_msm(bitsize, c, scalars, points, size, result, on_device);
+  bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, stream);
 }

 // this function is used to compute a batches of msms of size larger than 256
 template <typename S, typename P, typename A>
-void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device){
+void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream){
   unsigned c = get_optimal_c(msm_size);
   // unsigned c = 6;
   // unsigned bitsize = 32;
   unsigned bitsize = 255;
-  batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device);
+  batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream);
 }
 #endif
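With the stream threaded through, independent MSMs can be issued on separate streams. A hedged caller-side sketch; scalar_t / affine_t / projective_t are placeholders for the concrete curve types icicle instantiates these templates with:

// Hypothetical usage: two MSMs on two streams. Note that per the code above
// each call ends with cudaStreamSynchronize on its own stream, so overlapping
// them requires issuing the calls from separate host threads.
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);

projective_t r1, r2;
large_msm<scalar_t, projective_t, affine_t>(scalars1, points1, n, &r1, false, s1);
large_msm<scalar_t, projective_t, affine_t>(scalars2, points2, n, &r2, false, s2);

cudaStreamDestroy(s1);
cudaStreamDestroy(s2);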
@@ -3,19 +3,19 @@
 #pragma once

 template <typename S, typename P, typename A>
-void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device);
+void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device);
+void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device);
+void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device);
+void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device);
+void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);

 template <typename A, typename S, typename P>
 void reference_msm(S* scalars, A* a_points, unsigned size);
@@ -15,19 +15,20 @@
  * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
  * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
  */
-template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) {
+template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(n) / log(2));
-  cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice);
+  cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream);

   int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
   int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1);
   for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
   {
-    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
+    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
   }

   NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS;
-  template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>> (d_out, n * batch_size, S::inv_log_size(logn));
+  template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>> (d_out, n * batch_size, S::inv_log_size(logn));
+  cudaStreamSynchronize(stream);
   return 0;
 }
@@ -39,8 +40,8 @@ template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluat
  * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
  * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length).
  */
-template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) {
-  return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1);
+template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1, stream);
 }

 template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
@@ -62,7 +63,7 @@ template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
  * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
  */
 template <typename E, typename S>
-int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) {
+int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(domain_size) / log(2));
   if (domain_size > n) {
     // allocate and initialize an array of stream handles to parallelize data copying across batches

@@ -80,18 +81,19 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
       cudaStreamDestroy(memcpy_streams[i]);
     }
   } else
-    cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice);
+    cudaMemcpyAsync(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice, stream);

   if (coset)
-    batch_vector_mult(coset_powers, d_out, domain_size, batch_size);
+    batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream);

   int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH);
   int chunks = max(int((domain_size / 2) / NUM_THREADS), 1);
   int NUM_BLOCKS = batch_size * chunks;
   for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
   {
-    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
+    ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
   }
+  cudaStreamSynchronize(stream);
   return 0;
 }
@@ -107,76 +109,76 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_
  * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
  */
 template <typename E, typename S>
-int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) {
-  return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers);
+int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) {
+  return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream);
 }

 template <typename S>
-int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n) {
-  return interpolate(d_out, d_evaluations, d_domain, n);
+int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate(d_out, d_evaluations, d_domain, n, stream);
 }

 template <typename S>
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
-  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
+int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
 }

 template <typename E, typename S>
-int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n) {
-  return interpolate(d_out, d_evaluations, d_domain, n);
+int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) {
+  return interpolate(d_out, d_evaluations, d_domain, n, stream);
 }

 template <typename E, typename S>
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
-  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
+int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) {
+  return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
 }

 template <typename S>
-int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
+int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }

 template <typename S>
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) {
+int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }

 template <typename E, typename S>
-int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
+int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
 }

 template <typename E, typename S>
 int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
-                          unsigned domain_size, unsigned n, unsigned batch_size) {
+                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) {
   S* _null = nullptr;
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
 }

 template <typename S>
 int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
-                              unsigned domain_size, unsigned n, S* coset_powers) {
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
+                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }

 template <typename E, typename S>
 int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
-                                    unsigned n, unsigned batch_size, S* coset_powers) {
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
+                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }

 template <typename E, typename S>
 int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
-                             unsigned domain_size, unsigned n, S* coset_powers) {
-  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
+                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) {
+  return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
 }

 template <typename E, typename S>
 int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers) {
-  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
+                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) {
+  return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
 }
 #endif
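Caller-side, the NTT helpers now take the stream last. A hedged sketch (scalar_t is a placeholder for the field type these templates are instantiated with; the d_* pointers are assumed device-resident):

// Hypothetical usage: interpolate then evaluate on a caller-owned stream.
cudaStream_t stream;
cudaStreamCreate(&stream);

interpolate_scalars<scalar_t>(d_coeffs, d_evals, d_domain, n, stream);
evaluate_scalars<scalar_t>(d_evals2, d_coeffs, d_domain, domain_size, n, stream);

// Both helpers end with cudaStreamSynchronize(stream), so each result is
// ready when the call returns.
cudaStreamDestroy(stream);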
@@ -3,44 +3,44 @@
 #pragma once

 template <typename S>
-int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n);
+int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

 template <typename S>
-int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
+int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

 template <typename E, typename S>
-int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n);
+int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream);

 template <typename E, typename S>
-int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
+int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream);

 template <typename S>
-int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
+int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

 template <typename S>
-int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size);
+int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

 template <typename E, typename S>
-int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
+int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream);

 template <typename E, typename S>
 int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
-                          unsigned domain_size, unsigned n, unsigned batch_size);
+                          unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream);

 template <typename S>
 int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
-                              unsigned domain_size, unsigned n, S* coset_powers);
+                              unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

 template <typename S>
 int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
-                                    unsigned n, unsigned batch_size, S* coset_powers);
+                                    unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);

 template <typename E, typename S>
 int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
-                             unsigned domain_size, unsigned n, S* coset_powers);
+                             unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream);

 template <typename E, typename S>
 int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
-                                   unsigned n, unsigned batch_size, S* coset_powers);
+                                   unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream);

 #endif
@@ -28,11 +28,12 @@ const uint32_t MAX_THREADS_BATCH = 256;
  * @param n_twiddles number of twiddle factors.
  * @param omega multiplying factor.
  */
-template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega) {
+template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega, cudaStream_t stream) {
   size_t size_twiddles = n_twiddles * sizeof(S);
   S * d_twiddles;
-  cudaMalloc( & d_twiddles, size_twiddles);
-  twiddle_factors_kernel<S> <<< 1, 1 >>> (d_twiddles, n_twiddles, omega);
+  cudaMallocAsync(& d_twiddles, size_twiddles, stream);
+  twiddle_factors_kernel<S> <<< 1, 1, 0, stream>>> (d_twiddles, n_twiddles, omega);
+  cudaStreamSynchronize(stream);
   return d_twiddles;
 }
@@ -89,14 +90,14 @@ template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reve
  * @param logn log(n).
  * @param batch_size the size of the batch.
  */
-template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size) {
+template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) {
   T* arr_reversed;
-  cudaMalloc(&arr_reversed, n * batch_size * sizeof(T));
+  cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(T), stream);
   int number_of_threads = MAX_THREADS_BATCH;
   int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
-  reverse_order_kernel <<<number_of_blocks, number_of_threads>>> (arr, arr_reversed, n, logn, batch_size);
-  cudaMemcpy(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice);
-  cudaFree(arr_reversed);
+  reverse_order_kernel <<<number_of_blocks, number_of_threads, 0, stream>>> (arr, arr_reversed, n, logn, batch_size);
+  cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice, stream);
+  cudaFreeAsync(arr_reversed, stream);
 }
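For reference, the permutation this helper applies sends index i to the reversal of its low logn bits (the standard radix-2 NTT reordering). A host-side sketch of the index map; this is an assumption about what reverse_order_kernel computes per element, kept here only as illustration:

#include <cstdint>

// Reverse the low `logn` bits of `i`, e.g. logn = 3: 1 (001b) -> 4 (100b).
static inline uint32_t bit_reverse(uint32_t i, uint32_t logn) {
  uint32_t r = 0;
  for (uint32_t b = 0; b < logn; b++)
    r = (r << 1) | ((i >> b) & 1);  // move bit b into its mirrored position
  return r;
}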
/**
@@ -107,8 +108,8 @@ template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t lo
  * @param n length of `arr`.
  * @param logn log(n).
  */
-template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn) {
-  reverse_order_batch(arr, n, logn, 1);
+template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream) {
+  reverse_order_batch(arr, n, logn, 1, stream);
 }
/**
@@ -155,14 +156,15 @@ template < typename E, typename S > __global__ void template_normalize_kernel(E
  * @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
  * @param n_twiddles length of d_twiddles.
  */
-template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles) {
+template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles, cudaStream_t stream) {
   uint32_t m = 2;
   // TODO: optimize with separate streams for each iteration
   for (uint32_t s = 0; s < logn; s++) {
     for (uint32_t i = 0; i < n; i += m) {
       uint32_t shifted_m = m >> 1;
       uint32_t number_of_threads = MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS));
       uint32_t number_of_blocks = shifted_m / MAX_NUM_THREADS + 1;
-      template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1);
+      template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks, 0, stream >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1);
     }
     m <<= 1;
   }
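The XOR expression choosing number_of_threads above is a branchless min. A small self-checking sketch of the identity it relies on:

// r = b ^ ((a ^ b) & -(a < b)): if a < b the mask is all ones and
// r = b ^ (a ^ b) = a; otherwise the mask is zero and r = b.
#include <cassert>
#include <cstdint>

static uint32_t branchless_min(uint32_t a, uint32_t b) {
  return b ^ ((a ^ b) & -(uint32_t)(a < b));
}

int main() {
  assert(branchless_min(3, 7) == 3);
  assert(branchless_min(7, 3) == 3);
  assert(branchless_min(5, 5) == 5);
  return 0;
}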
@@ -177,21 +179,22 @@ template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr
  * @param n_twiddles length of d_twiddles.
  * @param inverse indicate if the result array should be normalized by n^(-1).
  */
-template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_twiddles, uint32_t n_twiddles, bool inverse) {
+template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_twiddles, uint32_t n_twiddles, bool inverse, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(n) / log(2));
   size_t size_E = n * sizeof(E);
   E * arrReversed = template_reverse_order < E > (arr, n, logn);
   E * d_arrReversed;
-  cudaMalloc( & d_arrReversed, size_E);
-  cudaMemcpy(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice);
-  template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles);
+  cudaMallocAsync( & d_arrReversed, size_E, stream);
+  cudaMemcpyAsync(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice, stream);
+  template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles, stream);
   if (inverse) {
     int NUM_THREADS = MAX_NUM_THREADS;
     int NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
-    template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arrReversed, n, S::inv_log_size(logn));
+    template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS, 0, stream >>> (d_arrReversed, n, S::inv_log_size(logn));
   }
-  cudaMemcpy(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost);
-  cudaFree(d_arrReversed);
+  cudaMemcpyAsync(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost, stream);
+  cudaFreeAsync(d_arrReversed, stream);
+  cudaStreamSynchronize(stream);
   return arrReversed;
 }
@@ -201,21 +204,22 @@ template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_
  * @param n length of d_arr.
  * @param inverse indicate if the result array should be normalized by n^(-1).
  */
-template<typename E,typename S> uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse) {
+template<typename E,typename S> uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse, cudaStream_t stream) {
   uint32_t logn = uint32_t(log(n) / log(2));
   uint32_t n_twiddles = n;
   S * twiddles = new S[n_twiddles];
   S * d_twiddles;
   if (inverse){
-    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn));
+    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn), stream);
   } else{
-    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn));
+    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn), stream);
   }
-  E * result = ntt_template < E, S > (arr, n, d_twiddles, n_twiddles, inverse);
+  E * result = ntt_template < E, S > (arr, n, d_twiddles, n_twiddles, inverse, stream);
   for(int i = 0; i < n; i++){
     arr[i] = result[i];
   }
-  cudaFree(d_twiddles);
+  cudaFreeAsync(d_twiddles, stream);
+  cudaStreamSynchronize(stream);
   return 0; // TODO add
 }
@@ -336,42 +340,45 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u
  * @param n size of batch.
  * @param inverse indicate if the result array should be normalized by n^(-1).
  */
-template <typename E, typename S> uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse) {
+template <typename E, typename S> uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse, cudaStream_t stream) {
   int batches = int(arr_size / n);
   uint32_t logn = uint32_t(log(n) / log(2));
   uint32_t n_twiddles = n; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
   size_t size_E = arr_size * sizeof(E);
   S * d_twiddles;
   if (inverse){
-    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn));
+    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn), stream);
   } else{
-    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn));
+    d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn), stream);
   }
   E * d_arr;
-  cudaMalloc( & d_arr, size_E);
-  cudaMemcpy(d_arr, arr, size_E, cudaMemcpyHostToDevice);
+  cudaMallocAsync( & d_arr, size_E, stream);
+  cudaMemcpyAsync(d_arr, arr, size_E, cudaMemcpyHostToDevice, stream);
   int NUM_THREADS = MAX_THREADS_BATCH;
   int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS;
-  ntt_template_kernel_rev_ord<E, S><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, logn, batches);
+  ntt_template_kernel_rev_ord<E, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_arr, n, logn, batches);

   NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
   int chunks = max(int((n / 2) / NUM_THREADS), 1);
   int total_tasks = batches * chunks;
   NUM_BLOCKS = total_tasks;

-  for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
+  //TODO: this loop also can be unrolled
+  for (uint32_t s = 0; s < logn; s++)
   {
-    ntt_template_kernel<E, S><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false);
+    ntt_template_kernel<E, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false);
+    cudaStreamSynchronize(stream);
   }
   if (inverse == true)
   {
     NUM_THREADS = MAX_NUM_THREADS;
     NUM_BLOCKS = (arr_size + NUM_THREADS - 1) / NUM_THREADS;
-    template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, arr_size, S::inv_log_size(logn));
+    template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS, 0, stream>>> (d_arr, arr_size, S::inv_log_size(logn));
   }
-  cudaMemcpy(arr, d_arr, size_E, cudaMemcpyDeviceToHost);
-  cudaFree(d_arr);
-  cudaFree(d_twiddles);
+  cudaMemcpyAsync(arr, d_arr, size_E, cudaMemcpyDeviceToHost, stream);
+  cudaFreeAsync(d_arr, stream);
+  cudaFreeAsync(d_twiddles, stream);
+  cudaStreamSynchronize(stream);
   return 0;
 }
@@ -19,7 +19,7 @@ __global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n
 }

 template <typename E, typename S>
-int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in place so no need for third result vector
+int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need for third result vector
 {
   // Set the grid and block dimensions
   int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
@@ -28,23 +28,24 @@ int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in
// Allocate memory on the device for the input vectors, the output vector, and the modulus
S *d_vec_a;
E *d_vec_b, *d_result;
cudaMalloc(&d_vec_a, n_elments * sizeof(S));
cudaMalloc(&d_vec_b, n_elments * sizeof(E));
cudaMalloc(&d_result, n_elments * sizeof(E));
cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream);
cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream);
cudaMallocAsync(&d_result, n_elments * sizeof(E), stream);

// Copy the input vectors and the modulus from the host to the device
cudaMemcpy(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice);
cudaMemcpy(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream);

// Call the kernel to perform element-wise modular multiplication
vectorModMult<<<num_blocks, threads_per_block>>>(d_vec_a, d_vec_b, d_result, n_elments);
vectorModMult<<<num_blocks, threads_per_block, 0, stream>>>(d_vec_a, d_vec_b, d_result, n_elments);

cudaMemcpy(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream);

cudaFree(d_vec_a);
cudaFree(d_vec_b);
cudaFree(d_result);
cudaFreeAsync(d_vec_a, stream);
cudaFreeAsync(d_vec_b, stream);
cudaFreeAsync(d_result, stream);

cudaStreamSynchronize(stream);
return 0;
}

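Editor's note on portability: cudaMallocAsync and cudaFreeAsync, which this hunk switches to, come from CUDA's stream-ordered memory allocator and require CUDA 11.2 or newer with a driver that supports it. The pattern the function now follows, reduced to a self-contained sketch (the buffer and its size are illustrative only):

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    float* d_buf = nullptr;
    cudaMallocAsync(&d_buf, 1024 * sizeof(float), stream);   // allocation is ordered on `stream`
    cudaMemsetAsync(d_buf, 0, 1024 * sizeof(float), stream); // runs after the allocation completes
    cudaFreeAsync(d_buf, stream);                            // ordered after all prior work on `stream`
    cudaStreamSynchronize(stream);                           // host waits exactly once, at the end
    cudaStreamDestroy(stream);
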
@@ -60,12 +61,12 @@ __global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalar
}

template <typename E, typename S>
int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size)
int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream)
{
// Set the grid and block dimensions
int NUM_THREADS = MAX_THREADS_PER_BLOCK;
int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS;
batchVectorMult<<<NUM_BLOCKS, NUM_THREADS>>>(scalar_vec, element_vec, n_scalars, batch_size);
batchVectorMult<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(scalar_vec, element_vec, n_scalars, batch_size);
return 0;
}

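Editor's note: unlike vector_mod_mult above, batch_vector_mult performs no copies and no synchronization of its own; it assumes scalar_vec and element_vec are already usable by the kernel and returns right after enqueuing the launch. A hedged caller-side sketch of the implied contract (the device buffers, their contents, and the in-place semantics are assumptions drawn from the kernel's signature):

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Assumed semantics: element_vec is scaled in place, batch_size
    // sub-vectors sharing the same n_scalars scalars. The launch is
    // asynchronous with respect to the host.
    batch_vector_mult(d_scalars, d_elements, n_scalars, batch_size, stream);
    // ... more work can be enqueued on `stream` here ...
    cudaStreamSynchronize(stream); // results are valid only after this point
    cudaStreamDestroy(stream);
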
@@ -83,7 +84,7 @@ __global__ void matrixVectorMult(E *matrix_elements, E *vector_elements, E *resu
}

template <typename E>
int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim)
int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim, cudaStream_t stream)
{
// Set the grid and block dimensions
int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK);
@@ -91,23 +92,24 @@ int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t di

// Allocate memory on the device for the input vectors, the output vector, and the modulus
E *d_matrix, *d_vector, *d_result;
cudaMalloc(&d_matrix, (dim * dim) * sizeof(E));
cudaMalloc(&d_vector, dim * sizeof(E));
cudaMalloc(&d_result, dim * sizeof(E));
cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream);
cudaMallocAsync(&d_vector, dim * sizeof(E), stream);
cudaMallocAsync(&d_result, dim * sizeof(E), stream);

// Copy the input vectors and the modulus from the host to the device
cudaMemcpy(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice);
cudaMemcpy(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream);

// Call the kernel to perform element-wise modular multiplication
matrixVectorMult<<<num_blocks, threads_per_block>>>(d_matrix, d_vector, d_result, dim);
matrixVectorMult<<<num_blocks, threads_per_block, 0, stream>>>(d_matrix, d_vector, d_result, dim);

cudaMemcpy(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream);

cudaFree(d_matrix);
cudaFree(d_vector);
cudaFree(d_result);
cudaFreeAsync(d_matrix, stream);
cudaFreeAsync(d_vector, stream);
cudaFreeAsync(d_result, stream);

cudaStreamSynchronize(stream);
return 0;
}
#endif

@@ -6,14 +6,15 @@
#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
#include "curve_config.cuh"

extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0)
extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
cudaStreamCreate(&stream);
if (inverse) {
return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega_inv(logn));
return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega_inv(logn), stream);
} else {
return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega(logn));
return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega(logn), stream);
}
}
catch (const std::runtime_error &ex)
@@ -23,11 +24,12 @@ extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size
}
}

extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BLS12_377::scalar_t,BLS12_377::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BLS12_377::scalar_t,BLS12_377::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inv
}
}

extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BLS12_377::projective_t,BLS12_377::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BLS12_377::projective_t,BLS12_377::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bo
}
}

extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BLS12_377::scalar_t,BLS12_377::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BLS12_377::scalar_t,BLS12_377::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_s
}
}

extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BLS12_377::projective_t,BLS12_377::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BLS12_377::projective_t,BLS12_377::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t
}
}

extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, unsigned device_id = 0)
extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BL
}

extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_evaluations, BLS12_377::scalar_t* d_domain, unsigned n,
unsigned batch_size, size_t device_id = 0)
unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_o
}
}

extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, size_t device_id = 0)
extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out,
}

extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_evaluations, BLS12_377::scalar_t* d_domain,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t*
}

extern "C" int evaluate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_coefficients, BLS12_377::scalar_t *d_domain,
unsigned domain_size, unsigned n, unsigned device_id = 0)
unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_377::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12
}

extern "C" int evaluate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_377::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out,
}

extern "C" int evaluate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_coefficients, BLS12_377::scalar_t *d_domain,
unsigned domain_size, unsigned n, size_t device_id = 0)
unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_377::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BL
}

extern "C" int evaluate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_377::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_o
}

extern "C" int evaluate_scalars_on_coset_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_coefficients, BLS12_377::scalar_t *d_domain, unsigned domain_size,
unsigned n, BLS12_377::scalar_t *coset_powers, unsigned device_id = 0)
unsigned n, BLS12_377::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bls12_377(BLS12_377::scalar_t* d_o
}

extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_377(BLS12_377::scalar_
}

extern "C" int evaluate_points_on_coset_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_coefficients, BLS12_377::scalar_t *d_domain, unsigned domain_size,
unsigned n, BLS12_377::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bls12_377(BLS12_377::projective_t*
}

extern "C" int evaluate_points_on_coset_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bls12_377(BLS12_377::projecti
}
}

extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, size_t device_id = 0)
extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order(arr, n, logn);
cudaStreamCreate(&stream);
reverse_order(arr, n, logn, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, in
}
}

extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, int batch_size, size_t device_id = 0)
extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order_batch(arr, n, logn, batch_size);
cudaStreamCreate(&stream);
reverse_order_batch(arr, n, logn, batch_size, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* a
}
}

extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr, int n, size_t device_id = 0)
extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order(arr, n, logn);
cudaStreamCreate(&stream);
reverse_order(arr, n, logn, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr,
}
}

extern "C" int reverse_order_points_batch_cuda_bls12_377(BLS12_377::projective_t* arr, int n, int batch_size, size_t device_id = 0)
extern "C" int reverse_order_points_batch_cuda_bls12_377(BLS12_377::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order_batch(arr, n, logn, batch_size);
cudaStreamCreate(&stream);
reverse_order_batch(arr, n, logn, batch_size, stream);
return 0;
}
catch (const std::runtime_error &ex)

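Editor's note on the wrapper pattern used throughout this file: the new trailing cudaStream_t stream = 0 default argument keeps existing C++ call sites compiling unchanged, and most wrappers then call cudaStreamCreate(&stream) themselves, replacing whatever handle was passed in before forwarding it. Default arguments are a C++-only convenience; foreign-function callers see the full signature and must pass every parameter explicitly. A hedged sketch of a C++ call relying on the defaults (sizes and contents are illustrative):

    #include <vector>

    // arr holds arr_size field elements; batch_size controls how the array is
    // split into sub-NTTs (per the batch template this file forwards to).
    uint32_t arr_size = 1 << 14, batch_size = 1 << 10;
    std::vector<BLS12_377::scalar_t> arr(arr_size);
    int rc = ntt_batch_cuda_bls12_377(arr.data(), arr_size, batch_size, /*inverse=*/false);
    // device_id and stream take their defaults (0); the wrapper creates and
    // uses its own stream internally.
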
@@ -8,11 +8,11 @@

extern "C"
int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[],
BLS12_377::scalar_t scalars[], size_t count, size_t device_id = 0)
BLS12_377::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false);
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -23,11 +23,14 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[
}

extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377::affine_t points[],
BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0)
BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, batch_size, msm_size, out, false);
cudaStreamCreate(&stream);
batched_large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
cudaStreamSynchronize(stream);

return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -46,11 +49,12 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
* @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
*/
extern "C"
int commit_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t device_id = 0)
int commit_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true);
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -70,11 +74,13 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
* @param batch_size Size of the batch.
*/
extern "C"
int commit_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0)
int commit_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true);
cudaStreamCreate(&stream);
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)

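Editor's note: commit_cuda_bls12_377 works on device pointers (the d_ prefix) and now synchronizes the stream it was given. A sketch of staging host data on the same stream so the MSM is ordered behind the uploads; scalars/points are host arrays of count elements, and pinned memory is assumed for fully asynchronous copies:

    void commit_example(BLS12_377::scalar_t* scalars, BLS12_377::affine_t* points, size_t count) {
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      BLS12_377::scalar_t* d_scalars;
      BLS12_377::affine_t* d_points;
      BLS12_377::projective_t* d_out;
      cudaMallocAsync(&d_scalars, count * sizeof(BLS12_377::scalar_t), stream);
      cudaMallocAsync(&d_points, count * sizeof(BLS12_377::affine_t), stream);
      cudaMallocAsync(&d_out, sizeof(BLS12_377::projective_t), stream);
      cudaMemcpyAsync(d_scalars, scalars, count * sizeof(BLS12_377::scalar_t), cudaMemcpyHostToDevice, stream);
      cudaMemcpyAsync(d_points, points, count * sizeof(BLS12_377::affine_t), cudaMemcpyHostToDevice, stream);
      // The wrapper enqueues the MSM behind the copies and synchronizes
      // `stream` before returning, so d_out is ready afterwards.
      commit_cuda_bls12_377(d_out, d_scalars, d_points, count, /*device_id=*/0, stream);
      cudaFreeAsync(d_scalars, stream);
      cudaFreeAsync(d_points, stream);
      // ... copy the commitment out of d_out before freeing it ...
      cudaFreeAsync(d_out, stream);
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
    }
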
@@ -12,12 +12,13 @@
extern "C" int32_t vec_mod_mult_point_bls12_377(BLS12_377::projective_t *inout,
BLS12_377::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<BLS12_377::projective_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BLS12_377::projective_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -30,12 +31,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_377(BLS12_377::projective_t *inout,
extern "C" int32_t vec_mod_mult_scalar_bls12_377(BLS12_377::scalar_t *inout,
BLS12_377::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<BLS12_377::scalar_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BLS12_377::scalar_t, BLS12_377::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -49,12 +51,13 @@ extern "C" int32_t matrix_vec_mod_mult_bls12_377(BLS12_377::scalar_t *matrix_fla
BLS12_377::scalar_t *input,
BLS12_377::scalar_t *output,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
matrix_mod_mult<BLS12_377::scalar_t>(matrix_flattened, input, output, n_elments);
matrix_mod_mult<BLS12_377::scalar_t>(matrix_flattened, input, output, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)

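Editor's note: these three wrappers keep their blocking behavior. They forward host pointers to vector_mod_mult/matrix_mod_mult, which stage the data, launch, copy back, and synchronize the stream before returning. A minimal sketch of a call relying on the new defaulted stream (array contents are assumed):

    int32_t scale_points(BLS12_377::projective_t* points_inout,
                         BLS12_377::scalar_t* scalars, size_t n) {
      // In-place: point i is multiplied by scalar i; vector_mod_mult stages
      // the data, launches, copies back, and synchronizes before returning.
      return vec_mod_mult_point_bls12_377(points_inout, scalars, n, /*device_id=*/0);
    }
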
@@ -6,14 +6,15 @@
#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
#include "curve_config.cuh"

extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0)
extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
cudaStreamCreate(&stream);
if (inverse) {
return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn));
return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn), stream);
} else {
return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn));
return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn), stream);
}
}
catch (const std::runtime_error &ex)
@@ -23,11 +24,12 @@ extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size
}
}

extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inv
}
}

extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bo
}
}

extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BLS12_381::scalar_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_s
}
}

extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BLS12_381::projective_t,BLS12_381::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t
}
}

extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, unsigned device_id = 0)
extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BL
}

extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_evaluations, BLS12_381::scalar_t* d_domain, unsigned n,
unsigned batch_size, size_t device_id = 0)
unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_o
}
}

extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, size_t device_id = 0)
extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out,
}

extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_evaluations, BLS12_381::scalar_t* d_domain,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t*
}

extern "C" int evaluate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain,
unsigned domain_size, unsigned n, unsigned device_id = 0)
unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_381::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12
}

extern "C" int evaluate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_381::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out,
}

extern "C" int evaluate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain,
unsigned domain_size, unsigned n, size_t device_id = 0)
unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_381::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BL
}

extern "C" int evaluate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BLS12_381::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_o
}

extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size,
unsigned n, BLS12_381::scalar_t *coset_powers, unsigned device_id = 0)
unsigned n, BLS12_381::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381::scalar_t* d_o
}

extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381::scalar_
}

extern "C" int evaluate_points_on_coset_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size,
unsigned n, BLS12_381::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bls12_381(BLS12_381::projective_t*
}

extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381::projecti
}
}

extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0)
extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order(arr, n, logn);
cudaStreamCreate(&stream);
reverse_order(arr, n, logn, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, in
}
}

extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0)
extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order_batch(arr, n, logn, batch_size);
cudaStreamCreate(&stream);
reverse_order_batch(arr, n, logn, batch_size, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* a
}
}

extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0)
extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order(arr, n, logn);
cudaStreamCreate(&stream);
reverse_order(arr, n, logn, stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr,
}
}

extern "C" int reverse_order_points_batch_cuda_bls12_381(BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0)
extern "C" int reverse_order_points_batch_cuda_bls12_381(BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
uint32_t logn = uint32_t(log(n) / log(2));
reverse_order_batch(arr, n, logn, batch_size);
cudaStreamCreate(&stream);
reverse_order_batch(arr, n, logn, batch_size, stream);
return 0;
}
catch (const std::runtime_error &ex)

@@ -8,11 +8,11 @@

extern "C"
int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[],
BLS12_381::scalar_t scalars[], size_t count, size_t device_id = 0)
BLS12_381::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false);
large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -23,12 +23,13 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[
}

extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381::affine_t points[],
BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0)
BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, batch_size, msm_size, out, false);

cudaStreamCreate(&stream);
batched_large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
cudaStreamSynchronize(stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -47,11 +48,12 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381:
* @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
*/
extern "C"
int commit_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t device_id = 0)
int commit_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm(d_scalars, d_points, count, d_out, true);
large_msm(d_scalars, d_points, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -71,11 +73,13 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381:
* @param batch_size Size of the batch.
*/
extern "C"
int commit_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0)
int commit_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true);
cudaStreamCreate(&stream);
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)

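Editor's note: both batch entry points above create and synchronize their own stream, so from the caller's side they are blocking calls; the passed-in stream (default 0) is overwritten by cudaStreamCreate. Only the return code needs checking, as in this sketch (buffers are assumed allocated, with out holding batch_size results; the error path's return value comes from a catch body outside this diff, so anything other than CUDA_SUCCESS is treated as failure):

    int rc = msm_batch_cuda_bls12_381(out, points, scalars, batch_size, msm_size);
    if (rc != CUDA_SUCCESS) {
      // handle the failure reported by the wrapper's catch block
    }
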
@@ -11,12 +11,13 @@
extern "C" int32_t vec_mod_mult_point_bls12_381(BLS12_381::projective_t *inout,
BLS12_381::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<BLS12_381::projective_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BLS12_381::projective_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -29,12 +30,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_381(BLS12_381::projective_t *inout,
extern "C" int32_t vec_mod_mult_scalar_bls12_381(BLS12_381::scalar_t *inout,
BLS12_381::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<BLS12_381::scalar_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BLS12_381::scalar_t, BLS12_381::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -48,12 +50,13 @@ extern "C" int32_t matrix_vec_mod_mult_bls12_381(BLS12_381::scalar_t *matrix_fla
BLS12_381::scalar_t *input,
BLS12_381::scalar_t *output,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
matrix_mod_mult<BLS12_381::scalar_t>(matrix_flattened, input, output, n_elments);
matrix_mod_mult<BLS12_381::scalar_t>(matrix_flattened, input, output, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)

@@ -6,14 +6,15 @@
#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
#include "curve_config.cuh"

extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0)
extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
cudaStreamCreate(&stream);
if (inverse) {
return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega_inv(logn));
return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega_inv(logn), stream);
} else {
return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega(logn));
return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega(logn), stream);
}
}
catch (const std::runtime_error &ex)
@@ -23,11 +24,12 @@ extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32
}
}

extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BN254::scalar_t,BN254::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BN254::scalar_t,BN254::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, si
}
}

extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<BN254::projective_t,BN254::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<BN254::projective_t,BN254::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inver
}
}

extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BN254::scalar_t,BN254::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BN254::scalar_t,BN254::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uin
}
}

extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<BN254::projective_t,BN254::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<BN254::projective_t,BN254::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_siz
}
}

extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, unsigned device_id = 0)
extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::sca
}

extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations, BN254::scalar_t* d_domain, unsigned n,
unsigned batch_size, size_t device_id = 0)
unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN25
}
}

extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, size_t device_id = 0)
extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate(d_out, d_evaluations, d_domain, n);
return interpolate(d_out, d_evaluations, d_domain, n, stream);
}
catch (const std::runtime_error &ex)
{
@@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254::
}

extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_evaluations, BN254::scalar_t* d_domain,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
cudaStreamCreate(&stream);
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
}
catch (const std::runtime_error &ex)
{
@@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, B
}

extern "C" int evaluate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_coefficients, BN254::scalar_t *d_domain,
unsigned domain_size, unsigned n, unsigned device_id = 0)
unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
BN254::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar
}

extern "C" int evaluate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BN254::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::
}

extern "C" int evaluate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_coefficients, BN254::scalar_t *d_domain,
unsigned domain_size, unsigned n, size_t device_id = 0)
unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BN254::scalar_t* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bn254(BN254::projective_t* d_out, BN254::pro
}

extern "C" int evaluate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, size_t device_id = 0)
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
BN254::scalar_t* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
cudaStreamCreate(&stream);
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
}
catch (const std::runtime_error &ex)
{
@@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN25
}

extern "C" int evaluate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_coefficients, BN254::scalar_t *d_domain, unsigned domain_size,
unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0)
unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
cudaStreamCreate(&stream);
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
}
catch (const std::runtime_error &ex)
{
@@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN25
}

extern "C" int evaluate_scalars_on_coset_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0)
unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bn254(BN254::scalar_t* d_out
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_coefficients, BN254::scalar_t *d_domain, unsigned domain_size,
|
||||
unsigned n, BN254::scalar_t *coset_powers, size_t device_id = 0)
|
||||
unsigned n, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bn254(BN254::projective_t* d_out, B
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bn254(BN254::projective_t* d_
|
||||
}
|
||||
}
|
||||
|
||||
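Note the split in how the wrappers above treat the new argument: the single-shot interpolate wrappers forward `stream` straight through, while the batch variants and all evaluate variants first call cudaStreamCreate on it, replacing whatever handle the caller passed in. For the pass-through case, a minimal caller-side sketch (not part of this diff; all pointers are assumed to be device pointers populated earlier, and the wrapper is assumed to only enqueue work without synchronizing internally):

// Sketch only: drive the stream-aware wrapper from a caller-owned stream.
#include <cuda_runtime.h>

void interpolate_async(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations,
                       BN254::scalar_t* d_domain, unsigned n)
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Enqueues the inverse NTT on `stream` and returns immediately.
  interpolate_scalars_cuda_bn254(d_out, d_evaluations, d_domain, n, /*device_id=*/0, stream);
  cudaStreamSynchronize(stream); // d_out is only safe to read after this
  cudaStreamDestroy(stream);
}
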
extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0)
|
||||
extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order(arr, n, logn, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, siz
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order_batch(arr, n, logn, batch_size, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n, size_t device_id = 0)
|
||||
extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order(arr, n, logn, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n,
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_batch_cuda_bn254(BN254::projective_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
extern "C" int reverse_order_points_batch_cuda_bn254(BN254::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order_batch(arr, n, logn, batch_size, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
|
||||
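All four reverse_order wrappers derive logn as uint32_t(log(n) / log(2)), i.e. via double-precision logarithms; for power-of-two n this truncates to the intended exponent, but an integer formulation avoids any dependence on floating-point rounding. A sketch of the equivalent computation (illustrative only, not a change this diff makes):

// Sketch: integer log2 for a power-of-two n, matching what
// uint32_t(log(n) / log(2)) is meant to compute in the wrappers above.
#include <cstdint>

inline uint32_t log2_pow2(uint32_t n)
{
  uint32_t logn = 0;
  while ((1u << logn) < n) // assumes n is a power of two, as bit-reversal requires
    ++logn;
  return logn;
}
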
@@ -8,11 +8,11 @@

extern "C"
int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[],
BN254::scalar_t scalars[], size_t count, size_t device_id = 0)
BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false);
large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -23,11 +23,13 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[],
}

extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t points[],
BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0)
BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, batch_size, msm_size, out, false);
cudaStreamCreate(&stream);
batched_large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
cudaStreamSynchronize(stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -46,11 +48,12 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po
* @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
*/
extern "C"
int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0)
int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm(d_scalars, d_points, count, d_out, true);
large_msm(d_scalars, d_points, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -70,11 +73,13 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po
* @param batch_size Size of the batch.
*/
extern "C"
int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0)
int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true);
cudaStreamCreate(&stream);
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)

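The MSM entry points split on synchronization policy: msm_cuda_bn254 only enqueues the MSM on the given stream, while msm_batch_cuda_bn254, commit_cuda_bn254 and commit_batch_cuda_bn254 call cudaStreamSynchronize before returning. A caller of the unsynchronized variant therefore has to fence on the stream itself before reading the result; a sketch under that reading:

// Sketch only: msm_cuda_bn254 (declared above) returns once the work is
// enqueued, so the host must synchronize before `out` is valid.
#include <cuda_runtime.h>

void msm_then_read(BN254::projective_t* out, BN254::affine_t* points,
                   BN254::scalar_t* scalars, size_t count)
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  msm_cuda_bn254(out, points, scalars, count, /*device_id=*/0, stream);
  // Independent host work can overlap with the MSM here.
  cudaStreamSynchronize(stream); // fence: `out` is populated after this
  cudaStreamDestroy(stream);
}
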
@@ -12,14 +12,15 @@
extern "C" int32_t vec_mod_mult_point_bn254(BN254::projective_t *inout,
BN254::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
// TODO: use device_id when working with multiple devices
(void)device_id;
try
{
// TODO: device_id
vector_mod_mult<BN254::projective_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BN254::projective_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -32,14 +33,15 @@ extern "C" int32_t vec_mod_mult_point_bn254(BN254::projective_t *inout,
extern "C" int32_t vec_mod_mult_scalar_bn254(BN254::scalar_t *inout,
BN254::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
// TODO: use device_id when working with multiple devices
(void)device_id;
try
{
// TODO: device_id
vector_mod_mult<BN254::scalar_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<BN254::scalar_t, BN254::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -53,14 +55,15 @@ extern "C" int32_t matrix_vec_mod_mult_bn254(BN254::scalar_t *matrix_flattened,
BN254::scalar_t *input,
BN254::scalar_t *output,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
// TODO: use device_id when working with multiple devices
(void)device_id;
try
{
// TODO: device_id
matrix_mod_mult<BN254::scalar_t>(matrix_flattened, input, output, n_elments);
matrix_mod_mult<BN254::scalar_t>(matrix_flattened, input, output, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)

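Threading a stream through these element-wise wrappers pays off when independent multiplies can run concurrently. A sketch that overlaps two scalar-vector products on separate streams (assuming the two buffers do not alias; signatures are the ones declared in this diff):

// Sketch: two independent element-wise products issued on distinct streams
// so the GPU may overlap them.
#include <cuda_runtime.h>

void overlap_two_products(BN254::scalar_t* a, BN254::scalar_t* scalars_a,
                          BN254::scalar_t* b, BN254::scalar_t* scalars_b,
                          size_t n)
{
  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);
  vec_mod_mult_scalar_bn254(a, scalars_a, n, /*device_id=*/0, s0);
  vec_mod_mult_scalar_bn254(b, scalars_b, n, /*device_id=*/0, s1);
  cudaStreamSynchronize(s0);
  cudaStreamSynchronize(s1);
  cudaStreamDestroy(s0);
  cudaStreamDestroy(s1);
}
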
@@ -6,14 +6,15 @@
#include "../../appUtils/vector_manipulation/ve_mod_mult.cuh"
#include "curve_config.cuh"

extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0)
extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
cudaStreamCreate(&stream);
if (inverse) {
return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega_inv(logn));
return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega_inv(logn), stream);
} else {
return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega(logn));
return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega(logn), stream);
}
}
catch (const std::runtime_error &ex)
@@ -23,11 +24,12 @@ extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domai
}
}

extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<CURVE_NAME_U::scalar_t,CURVE_NAME_U::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<CURVE_NAME_U::scalar_t,CURVE_NAME_U::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -37,11 +39,12 @@ extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bo
}
}

extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0)
extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_template<CURVE_NAME_U::projective_t,CURVE_NAME_U::scalar_t>(arr, n, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_template<CURVE_NAME_U::projective_t,CURVE_NAME_U::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t
}
}

extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<CURVE_NAME_U::scalar_t,CURVE_NAME_U::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<CURVE_NAME_U::scalar_t,CURVE_NAME_U::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t
}
}

extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0)
extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
return ntt_end2end_batch_template<CURVE_NAME_U::projective_t,CURVE_NAME_U::scalar_t>(arr, arr_size, batch_size, inverse); // TODO: pass device_id
cudaStreamCreate(&stream);
return ntt_end2end_batch_template<CURVE_NAME_U::projective_t,CURVE_NAME_U::scalar_t>(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id
}
catch (const std::runtime_error &ex)
{
@@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, ui
}
}

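Every templated wrapper above creates a fresh stream with cudaStreamCreate(&stream), discarding the handle the caller passed and never destroying the new one; each call therefore runs on its own stream. For contrast, a sketch of the lifetime a pass-through version would give the caller (hypothetical forwarding, not what this diff implements; CURVE_NAME_U / CURVE_NAME_L stand for a concrete curve such as BN254 / bn254):

// Sketch only: what calling ntt_cuda_CURVE_NAME_L would look like if the
// wrapper forwarded `stream` instead of overwriting it with cudaStreamCreate.
#include <cuda_runtime.h>

void ntt_on_owned_stream(CURVE_NAME_U::scalar_t* arr, uint32_t n)
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  ntt_cuda_CURVE_NAME_L(arr, n, /*inverse=*/false, /*device_id=*/0, stream);
  cudaStreamSynchronize(stream); // fence before reading `arr`
  cudaStreamDestroy(stream);     // caller owns and releases the stream
}
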
extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, unsigned device_id = 0)
|
||||
extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate(d_out, d_evaluations, d_domain, n);
|
||||
return interpolate(d_out, d_evaluations, d_domain, n, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_o
|
||||
}
|
||||
|
||||
extern "C" int interpolate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_evaluations, CURVE_NAME_U::scalar_t* d_domain, unsigned n,
|
||||
unsigned batch_size, size_t device_id = 0)
|
||||
unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, size_t device_id = 0)
|
||||
extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate(d_out, d_evaluations, d_domain, n);
|
||||
return interpolate(d_out, d_evaluations, d_domain, n, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t*
|
||||
}
|
||||
|
||||
extern "C" int interpolate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_evaluations, CURVE_NAME_U::scalar_t* d_domain,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projecti
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain,
|
||||
unsigned domain_size, unsigned n, unsigned device_id = 0)
|
||||
unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
CURVE_NAME_U::scalar_t* _null = nullptr;
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out,
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
CURVE_NAME_U::scalar_t* _null = nullptr;
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t*
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain,
|
||||
unsigned domain_size, unsigned n, size_t device_id = 0)
|
||||
unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
CURVE_NAME_U::scalar_t* _null = nullptr;
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_o
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
CURVE_NAME_U::scalar_t* _null = nullptr;
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, unsigned domain_size,
|
||||
unsigned n, CURVE_NAME_U::scalar_t *coset_powers, unsigned device_id = 0)
|
||||
unsigned n, CURVE_NAME_U::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_
|
||||
}
|
||||
|
||||
extern "C" int evaluate_scalars_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::s
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, unsigned domain_size,
|
||||
unsigned n, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0)
|
||||
unsigned n, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::projecti
|
||||
}
|
||||
|
||||
extern "C" int evaluate_points_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size,
|
||||
unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0)
|
||||
unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
|
||||
cudaStreamCreate(&stream);
|
||||
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream);
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
{
|
||||
@@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::pr
|
||||
}
|
||||
}
|
||||
|
||||
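This file duplicates the bn254 bindings with CURVE_NAME_U / CURVE_NAME_L placeholders, which read as substitution targets expanded once per curve at build time (an assumption here; the bn254 file above matches this template with BN254 / bn254 filled in). The same shape can be modeled with preprocessor token pasting:

// Sketch only: models the per-curve expansion of this template. The actual
// generation step is assumed, not shown in the diff.
#define DECLARE_INTERPOLATE_BINDING(U, L)                                     \
  extern "C" int interpolate_scalars_cuda_##L(                                \
      U::scalar_t* d_out, U::scalar_t* d_evaluations, U::scalar_t* d_domain,  \
      unsigned n, unsigned device_id, cudaStream_t stream);

DECLARE_INTERPOLATE_BINDING(BN254, bn254) // -> interpolate_scalars_cuda_bn254
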
extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, size_t device_id = 0)
|
||||
extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order(arr, n, logn, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* a
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order_batch(arr, n, logn, batch_size, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scala
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, size_t device_id = 0)
|
||||
extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order(arr, n, logn);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order(arr, n, logn, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
@@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int reverse_order_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, int batch_size, size_t device_id = 0)
|
||||
extern "C" int reverse_order_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
uint32_t logn = uint32_t(log(n) / log(2));
|
||||
reverse_order_batch(arr, n, logn, batch_size);
|
||||
cudaStreamCreate(&stream);
|
||||
reverse_order_batch(arr, n, logn, batch_size, stream);
|
||||
return 0;
|
||||
}
|
||||
catch (const std::runtime_error &ex)
|
||||
|
||||
@@ -8,11 +8,11 @@

extern "C"
int msm_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *out, CURVE_NAME_U::affine_t points[],
CURVE_NAME_U::scalar_t scalars[], size_t count, size_t device_id = 0)
CURVE_NAME_U::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm<CURVE_NAME_U::scalar_t, CURVE_NAME_U::projective_t, CURVE_NAME_U::affine_t>(scalars, points, count, out, false);
large_msm<CURVE_NAME_U::scalar_t, CURVE_NAME_U::projective_t, CURVE_NAME_U::affine_t>(scalars, points, count, out, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -23,11 +23,13 @@ int msm_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *out, CURVE_NAME_U::affine_
}

extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURVE_NAME_U::affine_t points[],
CURVE_NAME_U::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0)
CURVE_NAME_U::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm<CURVE_NAME_U::scalar_t, CURVE_NAME_U::projective_t, CURVE_NAME_U::affine_t>(scalars, points, batch_size, msm_size, out, false);
cudaStreamCreate(&stream);
batched_large_msm<CURVE_NAME_U::scalar_t, CURVE_NAME_U::projective_t, CURVE_NAME_U::affine_t>(scalars, points, batch_size, msm_size, out, false, stream);
cudaStreamSynchronize(stream);

return CUDA_SUCCESS;
}
@@ -47,11 +49,12 @@ extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURV
* @param count Length of `d_scalars` and `d_points` arrays (they should have equal length).
*/
extern "C"
int commit_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t device_id = 0)
int commit_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
large_msm(d_scalars, d_points, count, d_out, true);
large_msm(d_scalars, d_points, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)
@@ -71,11 +74,13 @@ extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURV
* @param batch_size Size of the batch.
*/
extern "C"
int commit_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0)
int commit_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0)
{
try
{
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true);
cudaStreamCreate(&stream);
batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream);
cudaStreamSynchronize(stream);
return 0;
}
catch (const std::runtime_error &ex)

@@ -12,12 +12,13 @@
extern "C" int32_t vec_mod_mult_point_CURVE_NAME_L(CURVE_NAME_U::projective_t *inout,
CURVE_NAME_U::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<CURVE_NAME_U::projective_t, CURVE_NAME_U::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<CURVE_NAME_U::projective_t, CURVE_NAME_U::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -30,12 +31,13 @@ extern "C" int32_t vec_mod_mult_point_CURVE_NAME_L(CURVE_NAME_U::projective_t *i
extern "C" int32_t vec_mod_mult_scalar_CURVE_NAME_L(CURVE_NAME_U::scalar_t *inout,
CURVE_NAME_U::scalar_t *scalar_vec,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
vector_mod_mult<CURVE_NAME_U::scalar_t, CURVE_NAME_U::scalar_t>(scalar_vec, inout, inout, n_elments);
vector_mod_mult<CURVE_NAME_U::scalar_t, CURVE_NAME_U::scalar_t>(scalar_vec, inout, inout, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -49,12 +51,13 @@ extern "C" int32_t matrix_vec_mod_mult_CURVE_NAME_L(CURVE_NAME_U::scalar_t *matr
CURVE_NAME_U::scalar_t *input,
CURVE_NAME_U::scalar_t *output,
size_t n_elments,
size_t device_id)
size_t device_id,
cudaStream_t stream = 0)
{
try
{
// TODO: device_id
matrix_mod_mult<CURVE_NAME_U::scalar_t>(matrix_flattened, input, output, n_elments);
matrix_mod_mult<CURVE_NAME_U::scalar_t>(matrix_flattened, input, output, n_elments, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
