From 20de60fd431daeca821f9d3588b1954af398a8bc Mon Sep 17 00:00:00 2001 From: Jeremy Felder Date: Wed, 31 May 2023 13:38:46 +0300 Subject: [PATCH] Add streams capability (#89) --- .../pull_request_template.md | 2 +- .github/workflows/build.yml | 1 + build.rs | 4 + icicle/appUtils/msm/msm.cu | 261 +++++++++--------- icicle/appUtils/msm/msm.cuh | 10 +- icicle/appUtils/ntt/lde.cu | 74 ++--- icicle/appUtils/ntt/lde.cuh | 24 +- icicle/appUtils/ntt/ntt.cuh | 79 +++--- .../vector_manipulation/ve_mod_mult.cuh | 50 ++-- icicle/curves/bls12_377/lde.cu | 105 ++++--- icicle/curves/bls12_377/msm.cu | 22 +- icicle/curves/bls12_377/ve_mod_mult.cu | 15 +- icicle/curves/bls12_381/lde.cu | 105 ++++--- icicle/curves/bls12_381/msm.cu | 24 +- icicle/curves/bls12_381/ve_mod_mult.cu | 15 +- icicle/curves/bn254/lde.cu | 105 ++++--- icicle/curves/bn254/msm.cu | 21 +- icicle/curves/bn254/ve_mod_mult.cu | 15 +- icicle/curves/curve_template/lde.cu | 105 ++++--- icicle/curves/curve_template/msm.cu | 21 +- icicle/curves/curve_template/ve_mod_mult.cu | 15 +- 21 files changed, 603 insertions(+), 470 deletions(-) rename .github/{PULL_REQUEST_TEMPLATE => }/pull_request_template.md (83%) diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/pull_request_template.md similarity index 83% rename from .github/PULL_REQUEST_TEMPLATE/pull_request_template.md rename to .github/pull_request_template.md index 57e904bc..560e03ee 100644 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,4 +4,4 @@ This PR... ## Linked Issues -Closes # +Resolves # diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 84954655..6980cd4a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,6 +12,7 @@ on: env: CARGO_TERM_COLOR: always ARCH_TYPE: sm_70 + DEFAULT_STREAM: per-thread jobs: build-linux: diff --git a/build.rs b/build.rs index ce2a64f2..0a40fbd4 100644 --- a/build.rs +++ b/build.rs @@ -8,9 +8,12 @@ fn main() { println!("cargo:rerun-if-changed=./icicle"); let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native")); + let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy")); let mut arch = String::from("-arch="); arch.push_str(&arch_type); + let mut stream = String::from("-default-stream="); + stream.push_str(&stream_type); let mut nvcc = cc::Build::new(); @@ -22,6 +25,7 @@ fn main() { nvcc.cuda(true); nvcc.debug(false); nvcc.flag(&arch); + nvcc.flag(&stream); nvcc.files([ "./icicle/curves/index.cu", ]); diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index b3391e8d..9bb6b459 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -88,7 +88,7 @@ template __global__ void accumulate_buckets_kernel(P *buckets, unsigned *bucket_offsets, unsigned *bucket_sizes, unsigned *single_bucket_indices, unsigned *point_indices, A *points, unsigned nof_buckets, unsigned *nof_buckets_to_compute, unsigned msm_idx_shift){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid>=*nof_buckets_to_compute){ + if (tid >= *nof_buckets_to_compute){ return; } unsigned msm_index = single_bucket_indices[tid]>>msm_idx_shift; @@ -106,7 +106,7 @@ template __global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid>=nof_bms) return; + if (tid >= nof_bms) return; P line_sum = buckets[(tid+1)*(1<0; i--) @@ -152,16 +152,16 @@ __global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsig //this function computes msm using the bucket method template -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device) { +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream) { S *d_scalars; A *d_points; if (!on_device) { //copy scalars and point to gpu - cudaMalloc(&d_scalars, sizeof(S) * size); - cudaMalloc(&d_points, sizeof(A) * size); - cudaMemcpy(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice); - cudaMemcpy(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice); + cudaMallocAsync(&d_scalars, sizeof(S) * size, stream); + cudaMallocAsync(&d_points, sizeof(A) * size, stream); + cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream); } else { d_scalars = scalars; @@ -178,135 +178,140 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi nof_bms++; } unsigned nof_buckets = nof_bms<>>(buckets, nof_buckets); + initialize_buckets_kernel<<>>(buckets, nof_buckets); unsigned *bucket_indices; unsigned *point_indices; - cudaMalloc(&bucket_indices, sizeof(unsigned) * size * (nof_bms+1)); - cudaMalloc(&point_indices, sizeof(unsigned) * size * (nof_bms+1)); + cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms+1), stream); + cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms+1), stream); //split scalars into digits NUM_THREADS = 1 << 10; NUM_BLOCKS = (size * (nof_bms+1) + NUM_THREADS - 1) / NUM_THREADS; - split_scalars_kernel<<>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, + split_scalars_kernel<<>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket unsigned *sort_indices_temp_storage{}; size_t sort_indices_temp_storage_bytes; + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices, - point_indices + size, point_indices, size); - - cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes); + point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream); + cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream); for (unsigned i = 0; i < nof_bms; i++) { unsigned offset_out = i * size; unsigned offset_in = offset_out + size; - cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, - bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, size); + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info + cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, bucket_indices + offset_out, + point_indices + offset_in, point_indices + offset_out, size, 0, sizeof(unsigned) * 8, stream); } - cudaFree(sort_indices_temp_storage); + cudaFreeAsync(sort_indices_temp_storage, stream); //find bucket_sizes unsigned *single_bucket_indices; unsigned *bucket_sizes; unsigned *nof_buckets_to_compute; - cudaMalloc(&single_bucket_indices, sizeof(unsigned)*nof_buckets); - cudaMalloc(&bucket_sizes, sizeof(unsigned)*nof_buckets); - cudaMalloc(&nof_buckets_to_compute, sizeof(unsigned)); + cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*nof_buckets, stream); + cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*nof_buckets, stream); + cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream); unsigned *encode_temp_storage{}; size_t encode_temp_storage_bytes = 0; cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms*size); - cudaMalloc(&encode_temp_storage, encode_temp_storage_bytes); + nof_buckets_to_compute, nof_bms*size, stream); + cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream); cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms*size); - cudaFree(encode_temp_storage); + nof_buckets_to_compute, nof_bms*size, stream); + cudaFreeAsync(encode_temp_storage, stream); //get offsets - where does each new bucket begin unsigned* bucket_offsets; - cudaMalloc(&bucket_offsets, sizeof(unsigned)*nof_buckets); + cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*nof_buckets, stream); unsigned* offsets_temp_storage{}; size_t offsets_temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets); - cudaMalloc(&offsets_temp_storage, offsets_temp_storage_bytes); - cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets); - cudaFree(offsets_temp_storage); + cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream); + cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream); + cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream); + cudaFreeAsync(offsets_temp_storage, stream); //launch the accumulation kernel with maximum threads NUM_THREADS = 1 << 8; NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices, - d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize); + accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices, + d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize); #ifdef SSM_SUM //sum each bucket NUM_THREADS = 1 << 10; NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); + ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); //sum each bucket module P* final_results; - cudaMalloc(&final_results, sizeof(P) * nof_bms); + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); NUM_THREADS = 1<>>(buckets, final_results); + sum_reduction_kernel<<>>(buckets, final_results); #endif #ifdef BIG_TRIANGLE P* final_results; - cudaMalloc(&final_results, sizeof(P) * nof_bms); + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = nof_bms; NUM_BLOCKS = 1; - big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); + big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); #endif P* d_final_result; if (!on_device) - cudaMalloc(&d_final_result, sizeof(P)); + cudaMallocAsync(&d_final_result, sizeof(P), stream); //launch the double and add kernel, a single thread - final_accumulation_kernel<<<1,1>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c); + final_accumulation_kernel<<<1,1,0,stream>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c); //copy final result to host - cudaDeviceSynchronize(); + cudaStreamSynchronize(stream); if (!on_device) - cudaMemcpy(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream); //free memory if (!on_device) { - cudaFree(d_points); - cudaFree(d_scalars); - cudaFree(d_final_result); + cudaFreeAsync(d_points, stream); + cudaFreeAsync(d_scalars, stream); + cudaFreeAsync(d_final_result, stream); } - cudaFree(buckets); - cudaFree(bucket_indices); - cudaFree(point_indices); - cudaFree(single_bucket_indices); - cudaFree(bucket_sizes); - cudaFree(nof_buckets_to_compute); - cudaFree(bucket_offsets); - cudaFree(final_results); + cudaFreeAsync(buckets, stream); + cudaFreeAsync(bucket_indices, stream); + cudaFreeAsync(point_indices, stream); + cudaFreeAsync(single_bucket_indices, stream); + cudaFreeAsync(bucket_sizes, stream); + cudaFreeAsync(nof_buckets_to_compute, stream); + cudaFreeAsync(bucket_offsets, stream); + cudaFreeAsync(final_results, stream); + + cudaStreamSynchronize(stream); } //this function computes msm using the bucket method template -void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device){ +void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream){ unsigned total_size = batch_size * msm_size; S *d_scalars; A *d_points; if (!on_device) { //copy scalars and point to gpu - cudaMalloc(&d_scalars, sizeof(S) * total_size); - cudaMalloc(&d_points, sizeof(A) * total_size); - cudaMemcpy(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice); + cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream); + cudaMallocAsync(&d_points, sizeof(A) * total_size, stream); + cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice, stream); } else { d_scalars = scalars; @@ -323,125 +328,129 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin unsigned bm_bitsize = ceil(log2(nof_bms)); unsigned nof_buckets = (nof_bms<>>(buckets, total_nof_buckets); + initialize_buckets_kernel<<>>(buckets, total_nof_buckets); unsigned *bucket_indices; unsigned *point_indices; - cudaMalloc(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size)); - cudaMalloc(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size)); + cudaMallocAsync(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream); + cudaMallocAsync(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream); //split scalars into digits NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS; - split_scalars_kernel<<>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, + split_scalars_kernel<<>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket unsigned *sorted_bucket_indices; unsigned *sorted_point_indices; - cudaMalloc(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms)); - cudaMalloc(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms)); + cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream); + cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream); unsigned *sort_indices_temp_storage{}; size_t sort_indices_temp_storage_bytes; + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices, - point_indices + msm_size, sorted_point_indices, total_size * nof_bms); - cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes); + point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream); + cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream); + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices, - point_indices + msm_size, sorted_point_indices, total_size * nof_bms); - cudaFree(sort_indices_temp_storage); + point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream); + cudaFreeAsync(sort_indices_temp_storage, stream); //find bucket_sizes unsigned *single_bucket_indices; unsigned *bucket_sizes; unsigned *total_nof_buckets_to_compute; - cudaMalloc(&single_bucket_indices, sizeof(unsigned)*total_nof_buckets); - cudaMalloc(&bucket_sizes, sizeof(unsigned)*total_nof_buckets); - cudaMalloc(&total_nof_buckets_to_compute, sizeof(unsigned)); + cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*total_nof_buckets, stream); + cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*total_nof_buckets, stream); + cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream); unsigned *encode_temp_storage{}; size_t encode_temp_storage_bytes = 0; cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, - total_nof_buckets_to_compute, nof_bms*total_size); - cudaMalloc(&encode_temp_storage, encode_temp_storage_bytes); + total_nof_buckets_to_compute, nof_bms*total_size, stream); + cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream); cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, - total_nof_buckets_to_compute, nof_bms*total_size); - cudaFree(encode_temp_storage); + total_nof_buckets_to_compute, nof_bms*total_size, stream); + cudaFreeAsync(encode_temp_storage, stream); //get offsets - where does each new bucket begin unsigned* bucket_offsets; - cudaMalloc(&bucket_offsets, sizeof(unsigned)*total_nof_buckets); + cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*total_nof_buckets, stream); unsigned* offsets_temp_storage{}; size_t offsets_temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets); - cudaMalloc(&offsets_temp_storage, offsets_temp_storage_bytes); - cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets); - cudaFree(offsets_temp_storage); + cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream); + cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream); + cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream); + cudaFreeAsync(offsets_temp_storage, stream); //launch the accumulation kernel with maximum threads NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, + accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize); #ifdef SSM_SUM //sum each bucket NUM_THREADS = 1 << 10; NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); + ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); //sum each bucket module P* final_results; - cudaMalloc(&final_results, sizeof(P) * nof_bms); + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); NUM_THREADS = 1<>>(buckets, final_results); + sum_reduction_kernel<<>>(buckets, final_results); #endif #ifdef BIG_TRIANGLE P* bm_sums; - cudaMalloc(&bm_sums, sizeof(P) * nof_bms * batch_size); + cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = 1<<8; NUM_BLOCKS = (nof_bms*batch_size + NUM_THREADS - 1) / NUM_THREADS; - big_triangle_sum_kernel<<>>(buckets, bm_sums, nof_bms*batch_size, c); + big_triangle_sum_kernel<<>>(buckets, bm_sums, nof_bms*batch_size, c); #endif P* d_final_results; if (!on_device) - cudaMalloc(&d_final_results, sizeof(P)*batch_size); + cudaMallocAsync(&d_final_results, sizeof(P)*batch_size, stream); //launch the double and add kernel, a single thread for each msm NUM_THREADS = 1<<8; NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; - final_accumulation_kernel<<>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); - + final_accumulation_kernel<<>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + //copy final result to host - cudaDeviceSynchronize(); if (!on_device) - cudaMemcpy(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost); + cudaMemcpyAsync(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost, stream); //free memory if (!on_device) { - cudaFree(d_points); - cudaFree(d_scalars); - cudaFree(d_final_results); + cudaFreeAsync(d_points, stream); + cudaFreeAsync(d_scalars, stream); + cudaFreeAsync(d_final_results, stream); } - cudaFree(buckets); - cudaFree(bucket_indices); - cudaFree(point_indices); - cudaFree(sorted_bucket_indices); - cudaFree(sorted_point_indices); - cudaFree(single_bucket_indices); - cudaFree(bucket_sizes); - cudaFree(total_nof_buckets_to_compute); - cudaFree(bucket_offsets); - cudaFree(bm_sums); + cudaFreeAsync(buckets, stream); + cudaFreeAsync(bucket_indices, stream); + cudaFreeAsync(point_indices, stream); + cudaFreeAsync(sorted_bucket_indices, stream); + cudaFreeAsync(sorted_point_indices, stream); + cudaFreeAsync(single_bucket_indices, stream); + cudaFreeAsync(bucket_sizes, stream); + cudaFreeAsync(total_nof_buckets_to_compute, stream); + cudaFreeAsync(bucket_offsets, stream); + cudaFreeAsync(bm_sums, stream); + cudaStreamSynchronize(stream); } @@ -456,44 +465,44 @@ __global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N){ //the function computes msm using ssm template -void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result){ //works up to 2^8 +void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream){ //works up to 2^8 S *scalars; A *a_points; P *p_points; P *results; - cudaMalloc(&scalars, sizeof(S) * size); - cudaMalloc(&a_points, sizeof(A) * size); - cudaMalloc(&p_points, sizeof(P) * size); - cudaMalloc(&results, sizeof(P) * size); + cudaMallocAsync(&scalars, sizeof(S) * size, stream); + cudaMallocAsync(&a_points, sizeof(A) * size, stream); + cudaMallocAsync(&p_points, sizeof(P) * size, stream); + cudaMallocAsync(&results, sizeof(P) * size, stream); //copy inputs to device - cudaMemcpy(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice); - cudaMemcpy(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice); + cudaMemcpyAsync(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice, stream); //convert to projective representation and multiply each point by its scalar using single scalar multiplication unsigned NUM_THREADS = size; - to_proj_kernel<<<1,NUM_THREADS>>>(a_points, p_points, size); - ssm_kernel<<<1,NUM_THREADS>>>(scalars, p_points, results, size); + to_proj_kernel<<<1,NUM_THREADS, 0, stream>>>(a_points, p_points, size); + ssm_kernel<<<1,NUM_THREADS, 0, stream>>>(scalars, p_points, results, size); P *final_result; - cudaMalloc(&final_result, sizeof(P)); + cudaMallocAsync(&final_result, sizeof(P), stream); //assuming msm size is a power of 2 //sum all the ssm results NUM_THREADS = size; - sum_reduction_kernel<<<1,NUM_THREADS>>>(results, final_result); + sum_reduction_kernel<<<1,NUM_THREADS, 0, stream>>>(results, final_result); //copy result to host - cudaDeviceSynchronize(); - cudaMemcpy(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost); + cudaStreamSynchronize(stream); + cudaMemcpyAsync(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost, stream); //free memory - cudaFree(scalars); - cudaFree(a_points); - cudaFree(p_points); - cudaFree(results); - cudaFree(final_result); + cudaFreeAsync(scalars, stream); + cudaFreeAsync(a_points, stream); + cudaFreeAsync(p_points, stream); + cudaFreeAsync(results, stream); + cudaFreeAsync(final_result, stream); } @@ -529,21 +538,21 @@ unsigned get_optimal_c(const unsigned size) { //this function is used to compute msms of size larger than 256 template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device){ +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream){ unsigned c = get_optimal_c(size); // unsigned c = 6; // unsigned bitsize = 32; unsigned bitsize = 255; - bucket_method_msm(bitsize, c, scalars, points, size, result, on_device); + bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, stream); } // this function is used to compute a batches of msms of size larger than 256 template -void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device){ +void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream){ unsigned c = get_optimal_c(msm_size); // unsigned c = 6; // unsigned bitsize = 32; unsigned bitsize = 255; - batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device); + batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream); } #endif diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index 55413b97..c6e8b056 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -3,19 +3,19 @@ #pragma once template -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device); +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream); template -void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device); +void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream); template -void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device); +void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream); template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device); +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream); template -void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device); +void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream); template void reference_msm(S* scalars, A* a_points, unsigned size); diff --git a/icicle/appUtils/ntt/lde.cu b/icicle/appUtils/ntt/lde.cu index e56ac56d..5fa288eb 100644 --- a/icicle/appUtils/ntt/lde.cu +++ b/icicle/appUtils/ntt/lde.cu @@ -15,19 +15,20 @@ * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial. * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`. */ -template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) { +template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { uint32_t logn = uint32_t(log(n) / log(2)); - cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream); int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH); int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1); for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled { - ntt_template_kernel <<>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false); + ntt_template_kernel <<>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false); } NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS; - template_normalize_kernel <<>> (d_out, n * batch_size, S::inv_log_size(logn)); + template_normalize_kernel <<>> (d_out, n * batch_size, S::inv_log_size(logn)); + cudaStreamSynchronize(stream); return 0; } @@ -39,8 +40,8 @@ template int interpolate_batch(E * d_out, E * d_evaluat * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup. * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length). */ -template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) { - return interpolate_batch (d_out, d_evaluations, d_domain, n, 1); +template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) { + return interpolate_batch (d_out, d_evaluations, d_domain, n, 1, stream); } template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) { @@ -62,7 +63,7 @@ template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) { * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset. */ template -int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) { +int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) { uint32_t logn = uint32_t(log(domain_size) / log(2)); if (domain_size > n) { // allocate and initialize an array of stream handles to parallelize data copying across batches @@ -80,18 +81,19 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_ cudaStreamDestroy(memcpy_streams[i]); } } else - cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice, stream); if (coset) - batch_vector_mult(coset_powers, d_out, domain_size, batch_size); + batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream); int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH); int chunks = max(int((domain_size / 2) / NUM_THREADS), 1); int NUM_BLOCKS = batch_size * chunks; for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled { - ntt_template_kernel <<>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true); + ntt_template_kernel <<>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true); } + cudaStreamSynchronize(stream); return 0; } @@ -107,76 +109,76 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_ * @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset. */ template -int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) { - return evaluate_batch (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers); +int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) { + return evaluate_batch (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers, stream); } template -int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n) { - return interpolate(d_out, d_evaluations, d_domain, n); +int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { + return interpolate(d_out, d_evaluations, d_domain, n, stream); } template -int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); +int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } template -int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n) { - return interpolate(d_out, d_evaluations, d_domain, n); +int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { + return interpolate(d_out, d_evaluations, d_domain, n, stream); } template -int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); +int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } template -int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) { +int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) { S* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } template -int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) { +int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) { S* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } template -int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) { +int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream) { S* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } template int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, unsigned batch_size) { + unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream) { S* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } template int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, S* coset_powers) { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) { + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } template int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, S* coset_powers) { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) { + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } template int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, S* coset_powers) { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) { + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } template int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, S* coset_powers) { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) { + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } #endif \ No newline at end of file diff --git a/icicle/appUtils/ntt/lde.cuh b/icicle/appUtils/ntt/lde.cuh index 0cd914d6..76091771 100644 --- a/icicle/appUtils/ntt/lde.cuh +++ b/icicle/appUtils/ntt/lde.cuh @@ -3,44 +3,44 @@ #pragma once template -int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n); +int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream); template -int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size); +int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream); template -int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n); +int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream); template -int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size); +int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream); template -int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n); +int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream); template -int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size); +int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream); template -int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n); +int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, cudaStream_t stream); template int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, unsigned batch_size); + unsigned domain_size, unsigned n, unsigned batch_size, cudaStream_t stream); template int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, S* coset_powers); + unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream); template int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, S* coset_powers); + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream); template int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain, - unsigned domain_size, unsigned n, S* coset_powers); + unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream); template int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, S* coset_powers); + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream); #endif \ No newline at end of file diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index 286f5af1..1c41af45 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -28,11 +28,12 @@ const uint32_t MAX_THREADS_BATCH = 256; * @param n_twiddles number of twiddle factors. * @param omega multiplying factor. */ - template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega) { + template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega, cudaStream_t stream) { size_t size_twiddles = n_twiddles * sizeof(S); S * d_twiddles; - cudaMalloc( & d_twiddles, size_twiddles); - twiddle_factors_kernel <<< 1, 1 >>> (d_twiddles, n_twiddles, omega); + cudaMallocAsync(& d_twiddles, size_twiddles, stream); + twiddle_factors_kernel <<< 1, 1, 0, stream>>> (d_twiddles, n_twiddles, omega); + cudaStreamSynchronize(stream); return d_twiddles; } @@ -89,14 +90,14 @@ template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reve * @param logn log(n). * @param batch_size the size of the batch. */ -template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size) { +template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream) { T* arr_reversed; - cudaMalloc(&arr_reversed, n * batch_size * sizeof(T)); + cudaMallocAsync(&arr_reversed, n * batch_size * sizeof(T), stream); int number_of_threads = MAX_THREADS_BATCH; int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads; - reverse_order_kernel <<>> (arr, arr_reversed, n, logn, batch_size); - cudaMemcpy(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice); - cudaFree(arr_reversed); + reverse_order_kernel <<>> (arr, arr_reversed, n, logn, batch_size); + cudaMemcpyAsync(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); + cudaFreeAsync(arr_reversed, stream); } /** @@ -107,8 +108,8 @@ template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t lo * @param n length of `arr`. * @param logn log(n). */ -template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn) { - reverse_order_batch(arr, n, logn, 1); +template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn, cudaStream_t stream) { + reverse_order_batch(arr, n, logn, 1, stream); } /** @@ -155,14 +156,15 @@ template < typename E, typename S > __global__ void template_normalize_kernel(E * @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2). * @param n_twiddles length of d_twiddles. */ -template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles) { +template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles, cudaStream_t stream) { uint32_t m = 2; + // TODO: optimize with separate streams for each iteration for (uint32_t s = 0; s < logn; s++) { for (uint32_t i = 0; i < n; i += m) { uint32_t shifted_m = m >> 1; uint32_t number_of_threads = MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS)); uint32_t number_of_blocks = shifted_m / MAX_NUM_THREADS + 1; - template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1); + template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks, 0, stream >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1); } m <<= 1; } @@ -177,21 +179,22 @@ template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr * @param n_twiddles length of d_twiddles. * @param inverse indicate if the result array should be normalized by n^(-1). */ -template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_twiddles, uint32_t n_twiddles, bool inverse) { +template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_twiddles, uint32_t n_twiddles, bool inverse, cudaStream_t stream) { uint32_t logn = uint32_t(log(n) / log(2)); size_t size_E = n * sizeof(E); E * arrReversed = template_reverse_order < E > (arr, n, logn); E * d_arrReversed; - cudaMalloc( & d_arrReversed, size_E); - cudaMemcpy(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice); - template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles); + cudaMallocAsync( & d_arrReversed, size_E, stream); + cudaMemcpyAsync(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice, stream); + template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles, stream); if (inverse) { int NUM_THREADS = MAX_NUM_THREADS; int NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS; - template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arrReversed, n, S::inv_log_size(logn)); + template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS, 0, stream >>> (d_arrReversed, n, S::inv_log_size(logn)); } - cudaMemcpy(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost); - cudaFree(d_arrReversed); + cudaMemcpyAsync(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost, stream); + cudaFreeAsync(d_arrReversed, stream); + cudaStreamSynchronize(stream); return arrReversed; } @@ -201,21 +204,22 @@ template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_ * @param n length of d_arr. * @param inverse indicate if the result array should be normalized by n^(-1). */ - template uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse) { + template uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse, cudaStream_t stream) { uint32_t logn = uint32_t(log(n) / log(2)); uint32_t n_twiddles = n; S * twiddles = new S[n_twiddles]; S * d_twiddles; if (inverse){ - d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn)); + d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn), stream); } else{ - d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn)); + d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn), stream); } - E * result = ntt_template < E, S > (arr, n, d_twiddles, n_twiddles, inverse); + E * result = ntt_template < E, S > (arr, n, d_twiddles, n_twiddles, inverse, stream); for(int i = 0; i < n; i++){ arr[i] = result[i]; } - cudaFree(d_twiddles); + cudaFreeAsync(d_twiddles, stream); + cudaStreamSynchronize(stream); return 0; // TODO add } @@ -336,42 +340,45 @@ __global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, u * @param n size of batch. * @param inverse indicate if the result array should be normalized by n^(-1). */ - template uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse) { + template uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse, cudaStream_t stream) { int batches = int(arr_size / n); uint32_t logn = uint32_t(log(n) / log(2)); uint32_t n_twiddles = n; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order. size_t size_E = arr_size * sizeof(E); S * d_twiddles; if (inverse){ - d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn)); + d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn), stream); } else{ - d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn)); + d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn), stream); } E * d_arr; - cudaMalloc( & d_arr, size_E); - cudaMemcpy(d_arr, arr, size_E, cudaMemcpyHostToDevice); + cudaMallocAsync( & d_arr, size_E, stream); + cudaMemcpyAsync(d_arr, arr, size_E, cudaMemcpyHostToDevice, stream); int NUM_THREADS = MAX_THREADS_BATCH; int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS; - ntt_template_kernel_rev_ord<<>>(d_arr, n, logn, batches); + ntt_template_kernel_rev_ord<<>>(d_arr, n, logn, batches); NUM_THREADS = min(n / 2, MAX_THREADS_BATCH); int chunks = max(int((n / 2) / NUM_THREADS), 1); int total_tasks = batches * chunks; NUM_BLOCKS = total_tasks; - for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled + //TODO: this loop also can be unrolled + for (uint32_t s = 0; s < logn; s++) { - ntt_template_kernel<<>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false); + ntt_template_kernel<<>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false); + cudaStreamSynchronize(stream); } if (inverse == true) { NUM_THREADS = MAX_NUM_THREADS; NUM_BLOCKS = (arr_size + NUM_THREADS - 1) / NUM_THREADS; - template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, arr_size, S::inv_log_size(logn)); + template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS, 0, stream>>> (d_arr, arr_size, S::inv_log_size(logn)); } - cudaMemcpy(arr, d_arr, size_E, cudaMemcpyDeviceToHost); - cudaFree(d_arr); - cudaFree(d_twiddles); + cudaMemcpyAsync(arr, d_arr, size_E, cudaMemcpyDeviceToHost, stream); + cudaFreeAsync(d_arr, stream); + cudaFreeAsync(d_twiddles, stream); + cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh index d566ca58..6bbf9a40 100644 --- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh +++ b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh @@ -19,7 +19,7 @@ __global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n } template -int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in place so no need for third result vector +int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments, cudaStream_t stream) // TODO: in place so no need for third result vector { // Set the grid and block dimensions int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); @@ -28,23 +28,24 @@ int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments) // TODO: in // Allocate memory on the device for the input vectors, the output vector, and the modulus S *d_vec_a; E *d_vec_b, *d_result; - cudaMalloc(&d_vec_a, n_elments * sizeof(S)); - cudaMalloc(&d_vec_b, n_elments * sizeof(E)); - cudaMalloc(&d_result, n_elments * sizeof(E)); + cudaMallocAsync(&d_vec_a, n_elments * sizeof(S), stream); + cudaMallocAsync(&d_vec_b, n_elments * sizeof(E), stream); + cudaMallocAsync(&d_result, n_elments * sizeof(E), stream); // Copy the input vectors and the modulus from the host to the device - cudaMemcpy(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice); - cudaMemcpy(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_vec_a, vec_a, n_elments * sizeof(S), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_vec_b, vec_b, n_elments * sizeof(E), cudaMemcpyHostToDevice, stream); // Call the kernel to perform element-wise modular multiplication - vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); + vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); - cudaMemcpy(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(result, d_result, n_elments * sizeof(E), cudaMemcpyDeviceToHost, stream); - cudaFree(d_vec_a); - cudaFree(d_vec_b); - cudaFree(d_result); + cudaFreeAsync(d_vec_a, stream); + cudaFreeAsync(d_vec_b, stream); + cudaFreeAsync(d_result, stream); + cudaStreamSynchronize(stream); return 0; } @@ -60,12 +61,12 @@ __global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalar } template -int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size) +int batch_vector_mult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size, cudaStream_t stream) { // Set the grid and block dimensions int NUM_THREADS = MAX_THREADS_PER_BLOCK; int NUM_BLOCKS = (n_scalars * batch_size + NUM_THREADS - 1) / NUM_THREADS; - batchVectorMult<<>>(scalar_vec, element_vec, n_scalars, batch_size); + batchVectorMult<<>>(scalar_vec, element_vec, n_scalars, batch_size); return 0; } @@ -83,7 +84,7 @@ __global__ void matrixVectorMult(E *matrix_elements, E *vector_elements, E *resu } template -int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim) +int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t dim, cudaStream_t stream) { // Set the grid and block dimensions int num_blocks = (int)ceil((float)dim / MAX_THREADS_PER_BLOCK); @@ -91,23 +92,24 @@ int matrix_mod_mult(E *matrix_elements, E *vector_elements, E *result, size_t di // Allocate memory on the device for the input vectors, the output vector, and the modulus E *d_matrix, *d_vector, *d_result; - cudaMalloc(&d_matrix, (dim * dim) * sizeof(E)); - cudaMalloc(&d_vector, dim * sizeof(E)); - cudaMalloc(&d_result, dim * sizeof(E)); + cudaMallocAsync(&d_matrix, (dim * dim) * sizeof(E), stream); + cudaMallocAsync(&d_vector, dim * sizeof(E), stream); + cudaMallocAsync(&d_result, dim * sizeof(E), stream); // Copy the input vectors and the modulus from the host to the device - cudaMemcpy(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice); - cudaMemcpy(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice); + cudaMemcpyAsync(d_matrix, matrix_elements, (dim * dim) * sizeof(E), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_vector, vector_elements, dim * sizeof(E), cudaMemcpyHostToDevice, stream); // Call the kernel to perform element-wise modular multiplication - matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); + matrixVectorMult<<>>(d_matrix, d_vector, d_result, dim); - cudaMemcpy(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(result, d_result, dim * sizeof(E), cudaMemcpyDeviceToHost, stream); - cudaFree(d_matrix); - cudaFree(d_vector); - cudaFree(d_result); + cudaFreeAsync(d_matrix, stream); + cudaFreeAsync(d_vector, stream); + cudaFreeAsync(d_result, stream); + cudaStreamSynchronize(stream); return 0; } #endif \ No newline at end of file diff --git a/icicle/curves/bls12_377/lde.cu b/icicle/curves/bls12_377/lde.cu index 10649eb8..439e660f 100644 --- a/icicle/curves/bls12_377/lde.cu +++ b/icicle/curves/bls12_377/lde.cu @@ -6,14 +6,15 @@ #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" -extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0) +extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { + cudaStreamCreate(&stream); if (inverse) { - return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega_inv(logn)); + return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega_inv(logn), stream); } else { - return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega(logn)); + return fill_twiddle_factors_array(domain_size, BLS12_377::scalar_t::omega(logn), stream); } } catch (const std::runtime_error &ex) @@ -23,11 +24,12 @@ extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bo } } -extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t arr_s } } -extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t } } -extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, unsigned device_id = 0) +extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BL } extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_evaluations, BLS12_377::scalar_t* d_domain, unsigned n, - unsigned batch_size, size_t device_id = 0) + unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_o } } -extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, size_t device_id = 0) +extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_evaluations, BLS12_377::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, } extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_evaluations, BLS12_377::scalar_t* d_domain, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t* } extern "C" int evaluate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_coefficients, BLS12_377::scalar_t *d_domain, - unsigned domain_size, unsigned n, unsigned device_id = 0) + unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { BLS12_377::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12 } extern "C" int evaluate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_377::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, } extern "C" int evaluate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_coefficients, BLS12_377::scalar_t *d_domain, - unsigned domain_size, unsigned n, size_t device_id = 0) + unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_377::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, BL } extern "C" int evaluate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_377::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bls12_377(BLS12_377::projective_t* d_o } extern "C" int evaluate_scalars_on_coset_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t *d_coefficients, BLS12_377::scalar_t *d_domain, unsigned domain_size, - unsigned n, BLS12_377::scalar_t *coset_powers, unsigned device_id = 0) + unsigned n, BLS12_377::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bls12_377(BLS12_377::scalar_t* d_o } extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_377(BLS12_377::scalar_t* d_out, BLS12_377::scalar_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_377(BLS12_377::scalar_ } extern "C" int evaluate_points_on_coset_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t *d_coefficients, BLS12_377::scalar_t *d_domain, unsigned domain_size, - unsigned n, BLS12_377::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bls12_377(BLS12_377::projective_t* } extern "C" int evaluate_points_on_coset_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::projective_t* d_coefficients, BLS12_377::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BLS12_377::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bls12_377(BLS12_377::projecti } } -extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bls12_377(BLS12_377::scalar_t* arr, in } } -extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) @@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* a } } -extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bls12_377(BLS12_377::projective_t* arr, } } -extern "C" int reverse_order_points_batch_cuda_bls12_377(BLS12_377::projective_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_points_batch_cuda_bls12_377(BLS12_377::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_377/msm.cu b/icicle/curves/bls12_377/msm.cu index b20b463a..73332ddb 100644 --- a/icicle/curves/bls12_377/msm.cu +++ b/icicle/curves/bls12_377/msm.cu @@ -8,11 +8,11 @@ extern "C" int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[], - BLS12_377::scalar_t scalars[], size_t count, size_t device_id = 0) + BLS12_377::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(scalars, points, count, out, false); + large_msm(scalars, points, count, out, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -23,11 +23,14 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[ } extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377::affine_t points[], - BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0) + BLS12_377::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(scalars, points, batch_size, msm_size, out, false); + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -46,11 +49,12 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377: * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ extern "C" - int commit_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t device_id = 0) + int commit_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(d_scalars, d_points, count, d_out, true); + large_msm(d_scalars, d_points, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) @@ -70,11 +74,13 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377: * @param batch_size Size of the batch. */ extern "C" - int commit_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0) + int commit_batch_cuda_bls12_377(BLS12_377::projective_t* d_out, BLS12_377::scalar_t* d_scalars, BLS12_377::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true); + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_377/ve_mod_mult.cu b/icicle/curves/bls12_377/ve_mod_mult.cu index 05802c4e..2d25d488 100644 --- a/icicle/curves/bls12_377/ve_mod_mult.cu +++ b/icicle/curves/bls12_377/ve_mod_mult.cu @@ -12,12 +12,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_377(BLS12_377::projective_t *inout, BLS12_377::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -30,12 +31,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_377(BLS12_377::projective_t *inout, extern "C" int32_t vec_mod_mult_scalar_bls12_377(BLS12_377::scalar_t *inout, BLS12_377::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -49,12 +51,13 @@ extern "C" int32_t matrix_vec_mod_mult_bls12_377(BLS12_377::scalar_t *matrix_fla BLS12_377::scalar_t *input, BLS12_377::scalar_t *output, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - matrix_mod_mult(matrix_flattened, input, output, n_elments); + matrix_mod_mult(matrix_flattened, input, output, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_381/lde.cu b/icicle/curves/bls12_381/lde.cu index c061b074..f395602f 100644 --- a/icicle/curves/bls12_381/lde.cu +++ b/icicle/curves/bls12_381/lde.cu @@ -6,14 +6,15 @@ #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" -extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0) +extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { + cudaStreamCreate(&stream); if (inverse) { - return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn)); + return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega_inv(logn), stream); } else { - return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn)); + return fill_twiddle_factors_array(domain_size, BLS12_381::scalar_t::omega(logn), stream); } } catch (const std::runtime_error &ex) @@ -23,11 +24,12 @@ extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bo } } -extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t arr_s } } -extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t } } -extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, unsigned device_id = 0) +extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BL } extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_evaluations, BLS12_381::scalar_t* d_domain, unsigned n, - unsigned batch_size, size_t device_id = 0) + unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_o } } -extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, size_t device_id = 0) +extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_evaluations, BLS12_381::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, } extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_evaluations, BLS12_381::scalar_t* d_domain, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* } extern "C" int evaluate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain, - unsigned domain_size, unsigned n, unsigned device_id = 0) + unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { BLS12_381::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12 } extern "C" int evaluate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_381::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, } extern "C" int evaluate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain, - unsigned domain_size, unsigned n, size_t device_id = 0) + unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_381::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, BL } extern "C" int evaluate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BLS12_381::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bls12_381(BLS12_381::projective_t* d_o } extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size, - unsigned n, BLS12_381::scalar_t *coset_powers, unsigned device_id = 0) + unsigned n, BLS12_381::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bls12_381(BLS12_381::scalar_t* d_o } extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381::scalar_t* d_out, BLS12_381::scalar_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bls12_381(BLS12_381::scalar_ } extern "C" int evaluate_points_on_coset_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t *d_coefficients, BLS12_381::scalar_t *d_domain, unsigned domain_size, - unsigned n, BLS12_381::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bls12_381(BLS12_381::projective_t* } extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::projective_t* d_coefficients, BLS12_381::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BLS12_381::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bls12_381(BLS12_381::projecti } } -extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bls12_381(BLS12_381::scalar_t* arr, in } } -extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) @@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* a } } -extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bls12_381(BLS12_381::projective_t* arr, } } -extern "C" int reverse_order_points_batch_cuda_bls12_381(BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_points_batch_cuda_bls12_381(BLS12_381::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_381/msm.cu b/icicle/curves/bls12_381/msm.cu index 91103324..c32efa45 100644 --- a/icicle/curves/bls12_381/msm.cu +++ b/icicle/curves/bls12_381/msm.cu @@ -8,11 +8,11 @@ extern "C" int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[], - BLS12_381::scalar_t scalars[], size_t count, size_t device_id = 0) + BLS12_381::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(scalars, points, count, out, false); + large_msm(scalars, points, count, out, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -23,12 +23,13 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[ } extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381::affine_t points[], - BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0) + BLS12_381::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(scalars, points, batch_size, msm_size, out, false); - + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -47,11 +48,12 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381: * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ extern "C" - int commit_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t device_id = 0) + int commit_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(d_scalars, d_points, count, d_out, true); + large_msm(d_scalars, d_points, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) @@ -71,11 +73,13 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381: * @param batch_size Size of the batch. */ extern "C" - int commit_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0) + int commit_batch_cuda_bls12_381(BLS12_381::projective_t* d_out, BLS12_381::scalar_t* d_scalars, BLS12_381::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try - { - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true); + { + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_381/ve_mod_mult.cu b/icicle/curves/bls12_381/ve_mod_mult.cu index 26929aa7..73075e8e 100644 --- a/icicle/curves/bls12_381/ve_mod_mult.cu +++ b/icicle/curves/bls12_381/ve_mod_mult.cu @@ -11,12 +11,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_381(BLS12_381::projective_t *inout, BLS12_381::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -29,12 +30,13 @@ extern "C" int32_t vec_mod_mult_point_bls12_381(BLS12_381::projective_t *inout, extern "C" int32_t vec_mod_mult_scalar_bls12_381(BLS12_381::scalar_t *inout, BLS12_381::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -48,12 +50,13 @@ extern "C" int32_t matrix_vec_mod_mult_bls12_381(BLS12_381::scalar_t *matrix_fla BLS12_381::scalar_t *input, BLS12_381::scalar_t *output, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - matrix_mod_mult(matrix_flattened, input, output, n_elments); + matrix_mod_mult(matrix_flattened, input, output, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bn254/lde.cu b/icicle/curves/bn254/lde.cu index 559b7e4d..3aec2ad9 100644 --- a/icicle/curves/bn254/lde.cu +++ b/icicle/curves/bn254/lde.cu @@ -6,14 +6,15 @@ #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" -extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0) +extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { + cudaStreamCreate(&stream); if (inverse) { - return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega_inv(logn)); + return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega_inv(logn), stream); } else { - return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega(logn)); + return fill_twiddle_factors_array(domain_size, BN254::scalar_t::omega(logn), stream); } } catch (const std::runtime_error &ex) @@ -23,11 +24,12 @@ extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32 } } -extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -37,11 +39,12 @@ extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, si } } -extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inver } } -extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_bn254(BN254::scalar_t *arr, uint32_t arr_size, uin } } -extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_bn254(BN254::projective_t *arr, uint32_t arr_siz } } -extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, unsigned device_id = 0) +extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::sca } extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations, BN254::scalar_t* d_domain, unsigned n, - unsigned batch_size, size_t device_id = 0) + unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN25 } } -extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, size_t device_id = 0) +extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254:: } extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_evaluations, BN254::scalar_t* d_domain, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, B } extern "C" int evaluate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_coefficients, BN254::scalar_t *d_domain, - unsigned domain_size, unsigned n, unsigned device_id = 0) + unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { BN254::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar } extern "C" int evaluate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BN254::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN254:: } extern "C" int evaluate_points_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_coefficients, BN254::scalar_t *d_domain, - unsigned domain_size, unsigned n, size_t device_id = 0) + unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { BN254::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_bn254(BN254::projective_t* d_out, BN254::pro } extern "C" int evaluate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { BN254::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_bn254(BN254::projective_t* d_out, BN25 } extern "C" int evaluate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_coefficients, BN254::scalar_t *d_domain, unsigned domain_size, - unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0) + unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN25 } extern "C" int evaluate_scalars_on_coset_batch_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_bn254(BN254::scalar_t* d_out } extern "C" int evaluate_points_on_coset_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t *d_coefficients, BN254::scalar_t *d_domain, unsigned domain_size, - unsigned n, BN254::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_bn254(BN254::projective_t* d_out, B } extern "C" int evaluate_points_on_coset_batch_cuda_bn254(BN254::projective_t* d_out, BN254::projective_t* d_coefficients, BN254::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, BN254::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_bn254(BN254::projective_t* d_ } } -extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, siz } } -extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) @@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_bn254(BN254::scalar_t* arr, int } } -extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_bn254(BN254::projective_t* arr, int n, } } -extern "C" int reverse_order_points_batch_cuda_bn254(BN254::projective_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_points_batch_cuda_bn254(BN254::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bn254/msm.cu b/icicle/curves/bn254/msm.cu index 26fa4cd3..8de1c4bb 100644 --- a/icicle/curves/bn254/msm.cu +++ b/icicle/curves/bn254/msm.cu @@ -8,11 +8,11 @@ extern "C" int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], - BN254::scalar_t scalars[], size_t count, size_t device_id = 0) + BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(scalars, points, count, out, false); + large_msm(scalars, points, count, out, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -23,11 +23,13 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], } extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t points[], - BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0) + BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(scalars, points, batch_size, msm_size, out, false); + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -46,11 +48,12 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ extern "C" - int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0) + int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(d_scalars, d_points, count, d_out, true); + large_msm(d_scalars, d_points, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) @@ -70,11 +73,13 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po * @param batch_size Size of the batch. */ extern "C" - int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0) + int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true); + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bn254/ve_mod_mult.cu b/icicle/curves/bn254/ve_mod_mult.cu index b86a08b0..6acef1fa 100644 --- a/icicle/curves/bn254/ve_mod_mult.cu +++ b/icicle/curves/bn254/ve_mod_mult.cu @@ -12,14 +12,15 @@ extern "C" int32_t vec_mod_mult_point_bn254(BN254::projective_t *inout, BN254::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { // TODO: use device_id when working with multiple devices (void)device_id; try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -32,14 +33,15 @@ extern "C" int32_t vec_mod_mult_point_bn254(BN254::projective_t *inout, extern "C" int32_t vec_mod_mult_scalar_bn254(BN254::scalar_t *inout, BN254::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { // TODO: use device_id when working with multiple devices (void)device_id; try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -53,14 +55,15 @@ extern "C" int32_t matrix_vec_mod_mult_bn254(BN254::scalar_t *matrix_flattened, BN254::scalar_t *input, BN254::scalar_t *output, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { // TODO: use device_id when working with multiple devices (void)device_id; try { // TODO: device_id - matrix_mod_mult(matrix_flattened, input, output, n_elments); + matrix_mod_mult(matrix_flattened, input, output, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/curve_template/lde.cu b/icicle/curves/curve_template/lde.cu index d665d1bb..2c998932 100644 --- a/icicle/curves/curve_template/lde.cu +++ b/icicle/curves/curve_template/lde.cu @@ -6,14 +6,15 @@ #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" -extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0) +extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { + cudaStreamCreate(&stream); if (inverse) { - return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega_inv(logn)); + return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega_inv(logn), stream); } else { - return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega(logn)); + return fill_twiddle_factors_array(domain_size, CURVE_NAME_U::scalar_t::omega(logn), stream); } } catch (const std::runtime_error &ex) @@ -23,11 +24,12 @@ extern "C" CURVE_NAME_U::scalar_t* build_domain_cuda_CURVE_NAME_L(uint32_t domai } } -extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -37,11 +39,12 @@ extern "C" int ntt_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t n, bo } } -extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0) +extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_template(arr, n, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_template(arr, n, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -50,11 +53,12 @@ extern "C" int ecntt_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t } } -extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -63,11 +67,12 @@ extern "C" int ntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t *arr, uint32_t } } -extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0) +extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { try { - return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse); // TODO: pass device_id + cudaStreamCreate(&stream); + return ntt_end2end_batch_template(arr, arr_size, batch_size, inverse, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -76,11 +81,11 @@ extern "C" int ecntt_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *arr, ui } } -extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, unsigned device_id = 0) +extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -90,11 +95,12 @@ extern "C" int interpolate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_o } extern "C" int interpolate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_evaluations, CURVE_NAME_U::scalar_t* d_domain, unsigned n, - unsigned batch_size, size_t device_id = 0) + unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -103,11 +109,11 @@ extern "C" int interpolate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_ } } -extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, size_t device_id = 0) +extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_evaluations, CURVE_NAME_U::scalar_t *d_domain, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate(d_out, d_evaluations, d_domain, n); + return interpolate(d_out, d_evaluations, d_domain, n, stream); } catch (const std::runtime_error &ex) { @@ -117,11 +123,12 @@ extern "C" int interpolate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* } extern "C" int interpolate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_evaluations, CURVE_NAME_U::scalar_t* d_domain, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size); + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); } catch (const std::runtime_error &ex) { @@ -131,12 +138,13 @@ extern "C" int interpolate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projecti } extern "C" int evaluate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, - unsigned domain_size, unsigned n, unsigned device_id = 0) + unsigned domain_size, unsigned n, unsigned device_id = 0, cudaStream_t stream = 0) { try { CURVE_NAME_U::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -146,12 +154,13 @@ extern "C" int evaluate_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, } extern "C" int evaluate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { CURVE_NAME_U::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -161,12 +170,13 @@ extern "C" int evaluate_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* } extern "C" int evaluate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, - unsigned domain_size, unsigned n, size_t device_id = 0) + unsigned domain_size, unsigned n, size_t device_id = 0, cudaStream_t stream = 0) { try { CURVE_NAME_U::scalar_t* _null = nullptr; - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -176,12 +186,13 @@ extern "C" int evaluate_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_o } extern "C" int evaluate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, size_t device_id = 0) + unsigned n, unsigned batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { CURVE_NAME_U::scalar_t* _null = nullptr; - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -191,11 +202,12 @@ extern "C" int evaluate_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_ } extern "C" int evaluate_scalars_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, unsigned domain_size, - unsigned n, CURVE_NAME_U::scalar_t *coset_powers, unsigned device_id = 0) + unsigned n, CURVE_NAME_U::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -205,11 +217,12 @@ extern "C" int evaluate_scalars_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_ } extern "C" int evaluate_scalars_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* d_out, CURVE_NAME_U::scalar_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -219,11 +232,12 @@ extern "C" int evaluate_scalars_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::s } extern "C" int evaluate_points_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t *d_coefficients, CURVE_NAME_U::scalar_t *d_domain, unsigned domain_size, - unsigned n, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -233,11 +247,12 @@ extern "C" int evaluate_points_on_coset_cuda_CURVE_NAME_L(CURVE_NAME_U::projecti } extern "C" int evaluate_points_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::projective_t* d_coefficients, CURVE_NAME_U::scalar_t* d_domain, unsigned domain_size, - unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0) + unsigned n, unsigned batch_size, CURVE_NAME_U::scalar_t *coset_powers, size_t device_id = 0, cudaStream_t stream = 0) { try { - return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers); + cudaStreamCreate(&stream); + return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -246,12 +261,13 @@ extern "C" int evaluate_points_on_coset_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::pr } } -extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -261,12 +277,13 @@ extern "C" int reverse_order_scalars_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* a } } -extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scalar_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) @@ -276,12 +293,13 @@ extern "C" int reverse_order_scalars_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::scala } } -extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, size_t device_id = 0) +extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order(arr, n, logn); + cudaStreamCreate(&stream); + reverse_order(arr, n, logn, stream); return 0; } catch (const std::runtime_error &ex) @@ -291,12 +309,13 @@ extern "C" int reverse_order_points_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t } } -extern "C" int reverse_order_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, int batch_size, size_t device_id = 0) +extern "C" int reverse_order_points_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* arr, int n, int batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { uint32_t logn = uint32_t(log(n) / log(2)); - reverse_order_batch(arr, n, logn, batch_size); + cudaStreamCreate(&stream); + reverse_order_batch(arr, n, logn, batch_size, stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/curve_template/msm.cu b/icicle/curves/curve_template/msm.cu index 891a6311..f662599d 100644 --- a/icicle/curves/curve_template/msm.cu +++ b/icicle/curves/curve_template/msm.cu @@ -8,11 +8,11 @@ extern "C" int msm_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *out, CURVE_NAME_U::affine_t points[], - CURVE_NAME_U::scalar_t scalars[], size_t count, size_t device_id = 0) + CURVE_NAME_U::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(scalars, points, count, out, false); + large_msm(scalars, points, count, out, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -23,11 +23,13 @@ int msm_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t *out, CURVE_NAME_U::affine_ } extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURVE_NAME_U::affine_t points[], - CURVE_NAME_U::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0) + CURVE_NAME_U::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(scalars, points, batch_size, msm_size, out, false); + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); return CUDA_SUCCESS; } @@ -47,11 +49,12 @@ extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURV * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ extern "C" - int commit_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t device_id = 0) + int commit_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try { - large_msm(d_scalars, d_points, count, d_out, true); + large_msm(d_scalars, d_points, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) @@ -71,11 +74,13 @@ extern "C" int msm_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* out, CURV * @param batch_size Size of the batch. */ extern "C" - int commit_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0) + int commit_batch_cuda_CURVE_NAME_L(CURVE_NAME_U::projective_t* d_out, CURVE_NAME_U::scalar_t* d_scalars, CURVE_NAME_U::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) { try { - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true); + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/curve_template/ve_mod_mult.cu b/icicle/curves/curve_template/ve_mod_mult.cu index af40b805..b757d206 100644 --- a/icicle/curves/curve_template/ve_mod_mult.cu +++ b/icicle/curves/curve_template/ve_mod_mult.cu @@ -12,12 +12,13 @@ extern "C" int32_t vec_mod_mult_point_CURVE_NAME_L(CURVE_NAME_U::projective_t *inout, CURVE_NAME_U::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -30,12 +31,13 @@ extern "C" int32_t vec_mod_mult_point_CURVE_NAME_L(CURVE_NAME_U::projective_t *i extern "C" int32_t vec_mod_mult_scalar_CURVE_NAME_L(CURVE_NAME_U::scalar_t *inout, CURVE_NAME_U::scalar_t *scalar_vec, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - vector_mod_mult(scalar_vec, inout, inout, n_elments); + vector_mod_mult(scalar_vec, inout, inout, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -49,12 +51,13 @@ extern "C" int32_t matrix_vec_mod_mult_CURVE_NAME_L(CURVE_NAME_U::scalar_t *matr CURVE_NAME_U::scalar_t *input, CURVE_NAME_U::scalar_t *output, size_t n_elments, - size_t device_id) + size_t device_id, + cudaStream_t stream = 0) { try { // TODO: device_id - matrix_mod_mult(matrix_flattened, input, output, n_elments); + matrix_mod_mult(matrix_flattened, input, output, n_elments, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex)