diff --git a/examples/c++/ntt/README.md b/examples/c++/ntt/README.md index 5e27640b..28e8dd45 100644 --- a/examples/c++/ntt/README.md +++ b/examples/c++/ntt/README.md @@ -16,10 +16,11 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s // Include NTT template #include "appUtils/ntt/ntt.cu" using namespace curve_config; +using namespace ntt; // Configure NTT -ntt::NTTConfig config=ntt::DefaultNTTConfig(); +NTTConfig config=DefaultNTTConfig(); // Call NTT -ntt::NTT(input, ntt_size, ntt::NTTDir::kForward, config, output); +NTT(input, ntt_size, NTTDir::kForward, config, output); ``` ## Running the example @@ -28,5 +29,10 @@ ntt::NTT(input, ntt_size, ntt::NTTDir::kForward, config, output); - compile with `./compile.sh` - run with `./run.sh` +## What's in the example - +1. Define the size of the example +2. Initialize input +3. Run Radix2 NTT +4. Run MixedRadix NTT +5. Validate the data output diff --git a/examples/c++/ntt/example.cu b/examples/c++/ntt/example.cu index b72cc555..a01a54f9 100644 --- a/examples/c++/ntt/example.cu +++ b/examples/c++/ntt/example.cu @@ -7,6 +7,7 @@ #include "appUtils/ntt/ntt.cu" #include "appUtils/ntt/kernel_ntt.cu" using namespace curve_config; +using namespace ntt; // Operate on scalars typedef scalar_t S; @@ -58,6 +59,11 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element return nof_errors; } +using FpMilliseconds = std::chrono::duration; +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + + int main(int argc, char* argv[]) { std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl; @@ -78,24 +84,30 @@ int main(int argc, char* argv[]) output = (E*)malloc(sizeof(E) * batch_size); std::cout << "Running NTT with on-host data" << std::endl; - cudaStream_t stream; - cudaStreamCreate(&stream); // Create a device context auto ctx = device_context::get_default_device_context(); - // the next line is valid only for CURVE_ID 1 (will add support for other curves soon) - S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}}; - ntt::InitDomain(rou, ctx); + const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/); + InitDomain(basic_root, ctx); // Create an NTTConfig instance - ntt::NTTConfig config = ntt::DefaultNTTConfig(); + NTTConfig config = DefaultNTTConfig(); + config.ntt_algorithm = NttAlgorithm::MixedRadix; config.batch_size = nof_ntts; - config.ctx.stream = stream; - auto begin0 = std::chrono::high_resolution_clock::now(); - cudaError_t err = ntt::NTT(input, ntt_size, ntt::NTTDir::kForward, config, output); - auto end0 = std::chrono::high_resolution_clock::now(); - auto elapsed0 = std::chrono::duration_cast(end0 - begin0); - printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9); + START_TIMER(MixedRadix); + cudaError_t err = NTT(input, ntt_size, NTTDir::kForward, config, output); + END_TIMER(MixedRadix, "MixedRadix NTT"); + + std::cout << "Validating output" << std::endl; validate_output(ntt_size, nof_ntts, output); - cudaStreamDestroy(stream); + + config.ntt_algorithm = NttAlgorithm::Radix2; + START_TIMER(Radix2); + err = NTT(input, ntt_size, NTTDir::kForward, config, output); + END_TIMER(Radix2, "Radix2 NTT"); + + std::cout << "Validating output" << std::endl; + validate_output(ntt_size, nof_ntts, output); + + std::cout << 
"Cleaning-up memory" << std::endl; free(input); free(output); return 0; diff --git a/icicle/appUtils/msm/Makefile b/icicle/appUtils/msm/Makefile index adc42974..e838fd7f 100644 --- a/icicle/appUtils/msm/Makefile +++ b/icicle/appUtils/msm/Makefile @@ -1,4 +1,4 @@ test_msm: mkdir -p work - nvcc -o work/test_msm -I. -I../.. tests/msm_test.cu + nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu work/test_msm \ No newline at end of file diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 7c2e7956..0b0392c6 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -30,6 +30,53 @@ namespace msm { unsigned get_optimal_c(int bitsize) { return max((unsigned)ceil(log2(bitsize)) - 4, 1U); } + template + __global__ void normalize_kernel(E* inout, E factor, int n) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) inout[tid] = (inout[tid] + factor - 1) / factor; + } + + // a kernel that writes to bucket_indices which enables large bucket accumulation to happen afterwards. + // specifically we map thread indices to buckets which said threads will handle in accumulation. + template + __global__ void initialize_large_bucket_indices( + unsigned* sorted_bucket_sizes_sum, + unsigned nof_pts_per_thread, + unsigned nof_large_buckets, + // log_nof_buckets_to_compute should be equal to ceil(log(nof_buckets_to_compute)) + unsigned log_nof_large_buckets, + unsigned* bucket_indices) + { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= nof_large_buckets) { return; } + unsigned start = (sorted_bucket_sizes_sum[tid] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid; + unsigned end = (sorted_bucket_sizes_sum[tid + 1] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid + 1; + for (unsigned i = start; i < end; i++) { + // this just concatenates two pieces of data - large bucket index and (i - start) + bucket_indices[i] = tid | ((i - start) << log_nof_large_buckets); + } + } + + // this function provides a single step of reduction across buckets sizes of + // which are given by large_bucket_sizes pointer + template + __global__ void sum_reduction_variable_size_kernel( + P* v, + unsigned* bucket_sizes_sum, + unsigned* bucket_sizes, + unsigned* large_bucket_thread_indices, + unsigned num_of_threads) + { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_of_threads) { return; } + + unsigned large_bucket_tid = large_bucket_thread_indices[tid]; + unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid; + unsigned large_bucket_size = bucket_sizes[large_bucket_tid]; + if (segment_ind < (large_bucket_size >> 1)) { v[tid] = v[tid] + v[tid + ((large_bucket_size + 1) >> 1)]; } + } + template __global__ void single_stage_multi_reduction_kernel( const P* v, @@ -37,23 +84,27 @@ namespace msm { unsigned block_size, unsigned write_stride, unsigned write_phase, - unsigned padding, + unsigned step, unsigned num_of_threads) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= num_of_threads) { return; } + // we need shifted tid because we don't want to be reducing into zero buckets, this allows to skip them. + // for write_phase==1, the read pattern is different so we don't skip over anything. + const int shifted_tid = write_phase ? tid : tid + (tid + step) / step; const int jump = block_size / 2; - const int tid_p = padding ? 
(tid / (2 * padding)) * padding + tid % padding : tid; - const int block_id = tid_p / jump; - const int block_tid = tid_p % jump; - const unsigned read_ind = block_size * block_id + block_tid; - const unsigned write_ind = tid; + const int block_id = shifted_tid / jump; + // here the reason for shifting is the same as for shifted_tid but we skip over entire blocks which happens + // only for write_phase=1 because of its read pattern. + const int shifted_block_id = write_phase ? block_id + (block_id + step) / step : block_id; + const int block_tid = shifted_tid % jump; + const unsigned read_ind = block_size * shifted_block_id + block_tid; + const unsigned write_ind = jump * shifted_block_id + block_tid; const unsigned v_r_key = write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind; - v_r[v_r_key] = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero() - : v[read_ind] + v[read_ind + jump]; + v_r[v_r_key] = v[read_ind] + v[read_ind + jump]; } // this kernel performs single scalar multiplication @@ -135,8 +186,8 @@ namespace msm { buckets_indices[current_index] = (msm_index << (c + bm_bitsize)) | (bm << c) | bucket_index; // the bucket module number and the msm number are appended at the msbs - if (scalar == S::zero() || bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped - point_indices[current_index] = tid % points_size; // the point index is saved for later + if (bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped + point_indices[current_index] = tid % points_size; // the point index is saved for later #endif } } @@ -158,19 +209,6 @@ namespace msm { if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; } } - template - __global__ void - find_max_size(unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, unsigned* largest_bucket_size) - { - for (int i = 0;; i++) { - if (single_bucket_indices[i] & ((1 << c) - 1)) { - largest_bucket_size[0] = bucket_sizes[i]; - largest_bucket_size[1] = i; - break; - } - } - } - // this kernel adds up the points in each bucket template __global__ void accumulate_buckets_kernel( @@ -188,9 +226,6 @@ namespace msm { // constexpr unsigned sign_mask = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nof_buckets_to_compute) return; - if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) { - return; // skip zero buckets - } #ifdef SIGNED_DIG // todo - fix const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift; const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c; @@ -198,7 +233,8 @@ namespace msm { msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1)); #else unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift; - unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)); + const unsigned single_bucket_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)); + unsigned bucket_index = msm_index * nof_buckets + single_bucket_index; #endif const unsigned bucket_offset = bucket_offsets[tid]; const unsigned bucket_size = bucket_sizes[tid]; @@ -215,7 +251,8 @@ namespace msm { #else A point = points[point_ind]; #endif - bucket = i ? bucket + point : P::from_affine(point); + bucket = + i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? 
P::zero() : P::from_affine(point)); } buckets[bucket_index] = bucket; } @@ -225,43 +262,40 @@ namespace msm { P* __restrict__ buckets, unsigned* __restrict__ bucket_offsets, unsigned* __restrict__ bucket_sizes, - unsigned* __restrict__ single_bucket_indices, - const unsigned* __restrict__ point_indices, + unsigned* __restrict__ large_bucket_thread_indices, + unsigned* __restrict__ point_indices, A* __restrict__ points, - const unsigned nof_buckets, const unsigned nof_buckets_to_compute, - const unsigned msm_idx_shift, const unsigned c, - const unsigned threads_per_bucket, - const unsigned max_run_length) + const int points_per_thread, + // log_nof_buckets_to_compute should be equal to ceil(log(nof_buckets_to_compute)) + const unsigned log_nof_buckets_to_compute, + const unsigned nof_threads) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - unsigned large_bucket_index = tid / threads_per_bucket; - unsigned bucket_segment_index = tid % threads_per_bucket; - if (tid >= nof_buckets_to_compute * threads_per_bucket) { return; } - if ((single_bucket_indices[large_bucket_index] & ((1 << c) - 1)) == 0) { // don't need - return; // skip zero buckets - } - unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index; - const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index * max_run_length; - const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index * max_run_length - ? bucket_sizes[large_bucket_index] - bucket_segment_index * max_run_length - : 0; + if (tid >= nof_threads) return; + int bucket_segment_index = large_bucket_thread_indices[tid] >> log_nof_buckets_to_compute; + large_bucket_thread_indices[tid] &= ((1 << log_nof_buckets_to_compute) - 1); + int bucket_ind = large_bucket_thread_indices[tid]; + const unsigned bucket_offset = bucket_offsets[bucket_ind] + bucket_segment_index * points_per_thread; + const unsigned bucket_size = max(0, (int)bucket_sizes[bucket_ind] - bucket_segment_index * points_per_thread); P bucket; - unsigned run_length = min(bucket_size, max_run_length); + unsigned run_length = min(bucket_size, points_per_thread); for (unsigned i = 0; i < run_length; i++) { // add the relevant points starting from the relevant offset up to the bucket size unsigned point_ind = point_indices[bucket_offset + i]; A point = points[point_ind]; - bucket = i ? bucket + point : P::from_affine(point); // init empty buckets + bucket = + i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point)); } - buckets[write_bucket_index] = run_length ? bucket : P::zero(); + buckets[tid] = run_length ? 
bucket : P::zero(); } template __global__ void distribute_large_buckets_kernel( const P* large_buckets, P* buckets, + const unsigned* sorted_bucket_sizes_sum, const unsigned* single_bucket_indices, const unsigned size, const unsigned nof_buckets, @@ -272,7 +306,8 @@ namespace msm { unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift; unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)); - buckets[bucket_index] = large_buckets[tid]; + unsigned large_bucket_index = sorted_bucket_sizes_sum[tid] + tid; + buckets[bucket_index] = large_buckets[large_bucket_index]; } // this kernel sums the entire bucket module @@ -370,7 +405,6 @@ namespace msm { } S* d_scalars; - A* d_points; if (!are_scalars_on_device) { // copy scalars to gpu CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * nof_scalars, stream)); @@ -378,15 +412,7 @@ namespace msm { } else { d_scalars = scalars; } - cudaStream_t stream_points; - if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points)); - if (!are_points_on_device) { - // copy points to gpu - CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points)); - CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points)); - } else { - d_points = points; - } + if (are_scalars_montgomery_form) { if (are_scalars_on_device) { S* d_mont_scalars; @@ -396,6 +422,99 @@ namespace msm { } else CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_scalars)); } + + unsigned nof_bms_per_msm = (bitsize + c - 1) / c; + unsigned* bucket_indices; + unsigned* point_indices; + unsigned* sorted_bucket_indices; + unsigned* sorted_point_indices; + CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream)); + CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream)); + CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream)); + CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream)); + + unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm)); + // split scalars into digits + unsigned NUM_THREADS = 1 << 10; + unsigned NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS; + split_scalars_kernel<<>>( + bucket_indices, point_indices, d_scalars, nof_scalars, nof_points, single_msm_size, nof_bms_per_msm, bm_bitsize, + c); + + // sort indices - the indices are sorted from smallest to largest in order to group together the points that + // belong to each bucket + unsigned* sort_indices_temp_storage{}; + size_t sort_indices_temp_storage_bytes; + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for + // more info + CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( + sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices, + point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream)); + CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream)); + // The second to last parameter is the default value supplied explicitly to allow passing the stream + // See 
https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for + // more info + CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( + sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices, + point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream)); + CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream)); + CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream)); + CHK_IF_RETURN(cudaFreeAsync(point_indices, stream)); + + // compute number of bucket modules and number of buckets in each module + unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size; +#ifdef SIGNED_DIG + const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits +#else + // minus nof_bms_per_msm because zero bucket is not included in each bucket module + const unsigned nof_buckets = (nof_bms_per_msm << c) - nof_bms_per_msm; +#endif + const unsigned total_nof_buckets = nof_buckets * batch_size; + + // find bucket_sizes + unsigned* single_bucket_indices; + unsigned* bucket_sizes; + unsigned* nof_buckets_to_compute; + // +1 here and in other places because there still is zero index corresponding to zero bucket at this point + CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * (total_nof_buckets + 1), stream)); + CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * (total_nof_buckets + 1), stream)); + CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream)); + unsigned* encode_temp_storage{}; + size_t encode_temp_storage_bytes = 0; + CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( + encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, + nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream)); + CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream)); + CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( + encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes, + nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream)); + CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream)); + CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_indices, stream)); + + // get offsets - where does each new bucket begin + unsigned* bucket_offsets; + CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * (total_nof_buckets + 1), stream)); + unsigned* offsets_temp_storage{}; + size_t offsets_temp_storage_bytes = 0; + CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( + offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream)); + CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream)); + CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( + offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream)); + CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream)); + + A* d_points; + cudaStream_t stream_points; + if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points)); + if (!are_points_on_device) { + // copy points to gpu + CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points)); + CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points)); + } else { + d_points = points; + } + if (are_points_montgomery_form) { if 
(are_points_on_device) { A* d_mont_points; @@ -412,111 +531,26 @@ namespace msm { } P* buckets; - // compute number of bucket modules and number of buckets in each module - unsigned nof_bms_per_msm = (bitsize + c - 1) / c; - unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm)); - unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size; -#ifdef SIGNED_DIG - const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits -#else - const unsigned nof_buckets = nof_bms_per_msm << c; -#endif - const unsigned total_nof_buckets = nof_buckets * batch_size; - CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream)); + CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * (total_nof_buckets + nof_bms_in_batch), stream)); // launch the bucket initialization kernel with maximum threads - unsigned NUM_THREADS = 1 << 10; - unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - initialize_buckets_kernel<<>>(buckets, total_nof_buckets); - - unsigned* bucket_indices; - unsigned* point_indices; - CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream)); - CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream)); - - // split scalars into digits NUM_THREADS = 1 << 10; - NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS; - split_scalars_kernel<<>>( - bucket_indices + nof_scalars, point_indices + nof_scalars, d_scalars, nof_scalars, nof_points, single_msm_size, - nof_bms_per_msm, bm_bitsize, c); //+nof_scalars - leaving the first bm free for the out of place sort later + NUM_BLOCKS = (total_nof_buckets + nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS; + initialize_buckets_kernel<<>>(buckets, total_nof_buckets + nof_bms_in_batch); - // sort indices - the indices are sorted from smallest to largest in order to group together the points that - // belong to each bucket - unsigned* sort_indices_temp_storage{}; - size_t sort_indices_temp_storage_bytes; - // The second to last parameter is the default value supplied explicitly to allow passing the stream - // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for - // more info - CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( - sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + nof_scalars, bucket_indices, - point_indices + nof_scalars, point_indices, nof_scalars, 0, sizeof(unsigned) * 8, stream)); - CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream)); - for (unsigned i = 0; i < nof_bms_per_msm; i++) { - unsigned offset_out = i * nof_scalars; - unsigned offset_in = offset_out + nof_scalars; - // The second to last parameter is the default value supplied explicitly to allow passing the stream - // See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for - // more info - CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs( - sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, - bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, nof_scalars, 0, - sizeof(unsigned) * 8, stream)); - } - CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream)); - - // find bucket_sizes - unsigned* single_bucket_indices; - unsigned* bucket_sizes; - unsigned* nof_buckets_to_compute; - CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * 
total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream)); - unsigned* encode_temp_storage{}; - size_t encode_temp_storage_bytes = 0; - CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( - encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream)); - CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream)); - CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode( - encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes, - nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream)); - CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream)); - - // get offsets - where does each new bucket begin - unsigned* bucket_offsets; - CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream)); - unsigned* offsets_temp_storage{}; - size_t offsets_temp_storage_bytes = 0; - CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); - CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream)); - CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum( - offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream)); - CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream)); + // removing zero bucket, if it exists + unsigned smallest_bucket_index; + CHK_IF_RETURN(cudaMemcpyAsync( + &smallest_bucket_index, single_bucket_indices, sizeof(unsigned), cudaMemcpyDeviceToHost, stream)); + // maybe zero bucket is empty after all? in this case zero_bucket_offset is set to 0 + unsigned zero_bucket_offset = (smallest_bucket_index == 0) ? 
1 : 0; // sort by bucket sizes unsigned h_nof_buckets_to_compute; CHK_IF_RETURN(cudaMemcpyAsync( &h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream)); - - // if all points are 0 just return point 0 - if (h_nof_buckets_to_compute == 0) { - if (!are_results_on_device) { - for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) { - final_result[batch_element] = P::zero(); - } - } else { - P* h_final_result = (P*)malloc(sizeof(P) * batch_size); - for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) { - h_final_result[batch_element] = P::zero(); - } - CHK_IF_RETURN( - cudaMemcpyAsync(final_result, h_final_result, sizeof(P) * batch_size, cudaMemcpyHostToDevice, stream)); - } - - return CHK_LAST(); - } + CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream)); + h_nof_buckets_to_compute -= zero_bucket_offset; unsigned* sorted_bucket_sizes; CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned) * h_nof_buckets_to_compute, stream)); @@ -525,13 +559,16 @@ namespace msm { unsigned* sort_offsets_temp_storage{}; size_t sort_offsets_temp_storage_bytes = 0; CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending( - sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets, - sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream)); + sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset, + sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0, + sizeof(unsigned) * 8, stream)); CHK_IF_RETURN(cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream)); CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending( - sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets, - sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream)); + sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset, + sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0, + sizeof(unsigned) * 8, stream)); CHK_IF_RETURN(cudaFreeAsync(sort_offsets_temp_storage, stream)); + CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream)); unsigned* sorted_single_bucket_indices; CHK_IF_RETURN( @@ -539,92 +576,128 @@ namespace msm { unsigned* sort_single_temp_storage{}; size_t sort_single_temp_storage_bytes = 0; CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending( - sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, - single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, - stream)); + sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + zero_bucket_offset, + sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices, + h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream)); CHK_IF_RETURN(cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream)); CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending( - sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, - single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, - stream)); + sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + 
zero_bucket_offset, + sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices, + h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream)); CHK_IF_RETURN(cudaFreeAsync(sort_single_temp_storage, stream)); + CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream)); + CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream)); // find large buckets - unsigned avarage_size = single_msm_size / (1 << c); - unsigned bucket_th = large_bucket_factor * avarage_size; + unsigned average_bucket_size = single_msm_size / (1 << c); + // how large a bucket must be to qualify as a "large bucket" + unsigned bucket_th = large_bucket_factor * average_bucket_size; unsigned* nof_large_buckets; CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream)); CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned))); - unsigned TOTAL_THREADS = 129000; // todo - device dependent + unsigned TOTAL_THREADS = 129000; // TODO: device dependent unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS); unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length; - NUM_THREADS = min(1 << 5, cutoff_nof_runs); + NUM_THREADS = 1 << 5; NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS; - find_cutoff_kernel<<>>( - sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets); - + if (h_nof_buckets_to_compute > 0 && bucket_th > 0) + find_cutoff_kernel<<>>( + sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets); unsigned h_nof_large_buckets; CHK_IF_RETURN( cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream)); - - unsigned* max_res; - CHK_IF_RETURN(cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream)); - find_max_size<<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, max_res); - - unsigned h_max_res[2]; - CHK_IF_RETURN(cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream)); - unsigned h_largest_bucket_size = h_max_res[0]; - unsigned h_nof_zero_large_buckets = h_max_res[1]; - unsigned large_buckets_to_compute = - h_nof_large_buckets > h_nof_zero_large_buckets ? 
h_nof_large_buckets - h_nof_zero_large_buckets : 0; + CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream)); if (!are_points_on_device || are_points_montgomery_form) { // by this point, points need to be already uploaded and un-Montgomeried CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_points_uploaded)); + CHK_IF_RETURN(cudaEventDestroy(event_points_uploaded)); CHK_IF_RETURN(cudaStreamDestroy(stream_points)); } cudaStream_t stream_large_buckets; cudaEvent_t event_large_buckets_accumulated; - P* large_buckets; - if (large_buckets_to_compute > 0 && bucket_th > 0) { + // this is where handling of large buckets happens (if there are any) + if (h_nof_large_buckets > 0 && bucket_th > 0) { CHK_IF_RETURN(cudaStreamCreate(&stream_large_buckets)); CHK_IF_RETURN(cudaEventCreateWithFlags(&event_large_buckets_accumulated, cudaEventDisableTiming)); - unsigned threads_per_bucket = - 1 << (unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); // global param - unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket; - unsigned total_large_buckets_size = large_buckets_to_compute * threads_per_bucket; - CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * total_large_buckets_size, stream)); + unsigned* sorted_bucket_sizes_sum; + CHK_IF_RETURN(cudaMallocAsync( + &sorted_bucket_sizes_sum, sizeof(unsigned) * (h_nof_large_buckets + 1), stream_large_buckets)); + CHK_IF_RETURN(cudaMemsetAsync(sorted_bucket_sizes_sum, 0, sizeof(unsigned), stream_large_buckets)); + unsigned* large_bucket_temp_storage{}; + size_t large_bucket_temp_storage_bytes = 0; + CHK_IF_RETURN(cub::DeviceScan::InclusiveSum( + large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1, + h_nof_large_buckets, stream_large_buckets)); + CHK_IF_RETURN( + cudaMallocAsync(&large_bucket_temp_storage, large_bucket_temp_storage_bytes, stream_large_buckets)); + CHK_IF_RETURN(cub::DeviceScan::InclusiveSum( + large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1, + h_nof_large_buckets, stream_large_buckets)); + CHK_IF_RETURN(cudaFreeAsync(large_bucket_temp_storage, stream_large_buckets)); + unsigned h_nof_pts_in_large_buckets; + CHK_IF_RETURN(cudaMemcpyAsync( + &h_nof_pts_in_large_buckets, sorted_bucket_sizes_sum + h_nof_large_buckets, sizeof(unsigned), + cudaMemcpyDeviceToHost, stream_large_buckets)); + unsigned h_largest_bucket; + CHK_IF_RETURN(cudaMemcpyAsync( + &h_largest_bucket, sorted_bucket_sizes, sizeof(unsigned), cudaMemcpyDeviceToHost, stream_large_buckets)); - NUM_THREADS = min(1 << 8, total_large_buckets_size); - NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS; + // the number of threads for large buckets has an extra h_nof_large_buckets term to account for bucket sizes + // unevenly divisible by average_bucket_size. there are similar corrections elsewhere when accessing large + // buckets + unsigned large_buckets_nof_threads = + (h_nof_pts_in_large_buckets + average_bucket_size - 1) / average_bucket_size + h_nof_large_buckets; + unsigned log_nof_large_buckets = (unsigned)ceil(log2(h_nof_large_buckets)); + unsigned* large_bucket_indices; + CHK_IF_RETURN(cudaMallocAsync(&large_bucket_indices, sizeof(unsigned) * large_buckets_nof_threads, stream)); + NUM_THREADS = min(1 << 8, h_nof_large_buckets); + NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS; + initialize_large_bucket_indices
<<>>( + sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets, log_nof_large_buckets, + large_bucket_indices); + + P* large_buckets; + CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * large_buckets_nof_threads, stream_large_buckets)); + + NUM_THREADS = min(1 << 8, large_buckets_nof_threads); + NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS; accumulate_large_buckets_kernel<<>>( - large_buckets, sorted_bucket_offsets + h_nof_zero_large_buckets, - sorted_bucket_sizes + h_nof_zero_large_buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, - point_indices, d_points, nof_buckets, large_buckets_to_compute, c + bm_bitsize, c, threads_per_bucket, - max_bucket_size_run_length); + large_buckets, sorted_bucket_offsets, sorted_bucket_sizes, large_bucket_indices, sorted_point_indices, + d_points, h_nof_large_buckets, c, average_bucket_size, log_nof_large_buckets, large_buckets_nof_threads); + NUM_THREADS = min(MAX_TH, h_nof_large_buckets); + NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS; + // normalization is needed to update buckets sizes and offsets due to reduction that already took place + normalize_kernel<<>>( + sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets); // reduce - for (int s = total_large_buckets_size >> 1; s > large_buckets_to_compute - 1; s >>= 1) { - NUM_THREADS = min(MAX_TH, s); - NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; - single_stage_multi_reduction_kernel<<>>( - large_buckets, large_buckets, s * 2, 0, 0, 0, s); + for (int s = h_largest_bucket; s > 1; s = ((s + 1) >> 1)) { + NUM_THREADS = min(MAX_TH, h_nof_large_buckets); + NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS; + normalize_kernel<<>>( + sorted_bucket_sizes, s == h_largest_bucket ? 
average_bucket_size : 2, h_nof_large_buckets); + NUM_THREADS = min(MAX_TH, large_buckets_nof_threads); + NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS; + sum_reduction_variable_size_kernel<<>>( + large_buckets, sorted_bucket_sizes_sum, sorted_bucket_sizes, large_bucket_indices, + large_buckets_nof_threads); } + CHK_IF_RETURN(cudaFreeAsync(large_bucket_indices, stream_large_buckets)); // distribute - NUM_THREADS = min(MAX_TH, large_buckets_to_compute); - NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS; + NUM_THREADS = min(MAX_TH, h_nof_large_buckets); + NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS; distribute_large_buckets_kernel<<>>( - large_buckets, buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, large_buckets_to_compute, - nof_buckets, c + bm_bitsize); + large_buckets, buckets, sorted_bucket_sizes_sum, sorted_single_bucket_indices, h_nof_large_buckets, + nof_buckets + nof_bms_per_msm, c + bm_bitsize); + CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream_large_buckets)); + CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes_sum, stream_large_buckets)); CHK_IF_RETURN(cudaEventRecord(event_large_buckets_accumulated, stream_large_buckets)); - CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets)); - } else { - h_nof_large_buckets = 0; } // launch the accumulation kernel with maximum threads @@ -633,13 +706,18 @@ namespace msm { NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS; accumulate_buckets_kernel<<>>( buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets, - sorted_single_bucket_indices + h_nof_large_buckets, point_indices, d_points, nof_buckets, - h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c); + sorted_single_bucket_indices + h_nof_large_buckets, sorted_point_indices, d_points, + nof_buckets + nof_bms_per_msm, h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c); } - - if (large_buckets_to_compute > 0 && bucket_th > 0) + CHK_IF_RETURN(cudaFreeAsync(sorted_point_indices, stream)); + CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream)); + CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream)); + CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream)); + if (h_nof_large_buckets > 0 && bucket_th > 0) { // all the large buckets need to be accumulated before the final summation CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_large_buckets_accumulated)); + CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets)); + } #ifdef SSM_SUM // sum each bucket @@ -676,7 +754,7 @@ namespace msm { unsigned source_bits_count = c; // bool odd_source_c = source_bits_count % 2; unsigned source_windows_count = nof_bms_per_msm; - unsigned source_buckets_count = nof_buckets; + unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; unsigned target_windows_count = 0; P* source_buckets = buckets; buckets = nullptr; @@ -698,20 +776,19 @@ namespace msm { for (unsigned j = 0; j < target_bits_count; j++) { const bool is_first_iter = (j == 0); const bool is_last_iter = (j == target_bits_count - 1); - unsigned nof_threads = (source_buckets_count >> (1 + j)) * batch_size; + unsigned nof_threads = + (((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size; NUM_THREADS = min(MAX_TH, nof_threads); NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( is_first_iter ? 
source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1, 1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/, - 0 /*=padding*/, nof_threads); + (1 << target_bits_count) - 1, nof_threads); - NUM_THREADS = min(MAX_TH, nof_threads); - NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; single_stage_multi_reduction_kernel<<>>( is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2, 1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/, - 0 /*=padding*/, nof_threads); + (1 << target_bits_count) - 1, nof_threads); } } if (target_bits_count == 1) { @@ -761,24 +838,10 @@ namespace msm { cudaMemcpyAsync(final_result, d_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream)); // free memory - if (!are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream)); - if (!are_points_on_device) CHK_IF_RETURN(cudaFreeAsync(d_points, stream)); + if (!are_scalars_on_device || are_scalars_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream)); + if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_points, stream)); if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream)); CHK_IF_RETURN(cudaFreeAsync(buckets, stream)); -#ifndef PHASE1_TEST - CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(point_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream)); - CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream)); - CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream)); -#endif - CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream)); - CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream)); - CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream)); - CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream)); - CHK_IF_RETURN(cudaFreeAsync(max_res, stream)); - if (large_buckets_to_compute > 0 && bucket_th > 0) CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream)); if (!is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream)); diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index e6948f84..3157d262 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -196,7 +196,7 @@ int main() printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); config.is_big_triangle = true; config.are_results_on_device = false; - // std::cout<(scalars_d, points_d, msm_size, config, large_res); // test_reduce_triangle(scalars); diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh index b51dfc36..18fcf77e 100644 --- a/icicle/appUtils/poseidon/poseidon.cuh +++ b/icicle/appUtils/poseidon/poseidon.cuh @@ -121,7 +121,8 @@ namespace poseidon { */ // Stas: I have an issue with the number of argumnets template - cudaError_t init_optimized_poseidon_constants(device_context::DeviceContext& ctx, PoseidonConstants* constants); + cudaError_t + init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants* constants); /** * Compute the poseidon hash over a sequence of preimages. 
diff --git a/icicle/primitives/affine.cuh b/icicle/primitives/affine.cuh index df369036..f5cb4063 100644 --- a/icicle/primitives/affine.cuh +++ b/icicle/primitives/affine.cuh @@ -11,6 +11,8 @@ public: static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; } + static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; } + static HOST_DEVICE_INLINE Affine ToMontgomery(const Affine& point) { return {FF::ToMontgomery(point.x), FF::ToMontgomery(point.y)}; diff --git a/wrappers/rust/icicle-core/Cargo.toml b/wrappers/rust/icicle-core/Cargo.toml index 2c421385..9965faab 100644 --- a/wrappers/rust/icicle-core/Cargo.toml +++ b/wrappers/rust/icicle-core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-core" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = ["Ingonyama"] description = "A library for GPU ZK acceleration by Ingonyama" diff --git a/wrappers/rust/icicle-core/src/curve.rs b/wrappers/rust/icicle-core/src/curve.rs index 916270e8..d8b0ef6b 100644 --- a/wrappers/rust/icicle-core/src/curve.rs +++ b/wrappers/rust/icicle-core/src/curve.rs @@ -5,6 +5,8 @@ use crate::traits::{FieldImpl, MontgomeryConvertible}; use ark_ec::models::CurveConfig as ArkCurveConfig; #[cfg(feature = "arkworks")] use ark_ec::short_weierstrass::{Affine as ArkAffine, Projective as ArkProjective, SWCurveConfig}; +#[cfg(feature = "arkworks")] +use ark_ec::AffineRepr; use icicle_cuda_runtime::error::CudaError; use icicle_cuda_runtime::memory::HostOrDeviceSlice; use std::fmt::Debug; @@ -147,13 +149,17 @@ where type ArkEquivalent = ArkAffine; fn to_ark(&self) -> Self::ArkEquivalent { - let proj_x = self - .x - .to_ark(); - let proj_y = self - .y - .to_ark(); - Self::ArkEquivalent::new_unchecked(proj_x, proj_y) + if *self == Self::zero() { + Self::ArkEquivalent::zero() + } else { + let ark_x = self + .x + .to_ark(); + let ark_y = self + .y + .to_ark(); + Self::ArkEquivalent::new_unchecked(ark_x, ark_y) + } } fn from_ark(ark: Self::ArkEquivalent) -> Self { diff --git a/wrappers/rust/icicle-core/src/msm/tests.rs b/wrappers/rust/icicle-core/src/msm/tests.rs index 30c2719f..aa6afb3e 100644 --- a/wrappers/rust/icicle-core/src/msm/tests.rs +++ b/wrappers/rust/icicle-core/src/msm/tests.rs @@ -16,6 +16,15 @@ use ark_ec::VariableBaseMSM; #[cfg(feature = "arkworks")] use ark_std::{rand::Rng, test_rng, UniformRand}; +fn generate_random_affine_points_with_zeroes(size: usize, num_zeroes: usize) -> Vec> { + let rng = &mut test_rng(); + let mut points = C::generate_random_affine_points(size); + for _ in 0..num_zeroes { + points[rng.gen_range(0..size)] = Affine::::zero(); + } + points +} + pub fn check_msm>() where ::Config: GenerateRandom, @@ -36,7 +45,7 @@ where let test_sizes = [4, 8, 16, 32, 64, 128, 256, 1000, 1 << 18]; let mut msm_results = HostOrDeviceSlice::cuda_malloc(1).unwrap(); for test_size in test_sizes { - let points = C::generate_random_affine_points(test_size); + let points = generate_random_affine_points_with_zeroes(test_size, 2); let scalars = ::Config::generate_random(test_size); let points_ark: Vec<_> = points .iter() @@ -101,7 +110,7 @@ where let batch_sizes = [1, 3, 1 << 4]; for test_size in test_sizes { for batch_size in batch_sizes { - let points = C::generate_random_affine_points(test_size); + let points = generate_random_affine_points_with_zeroes(test_size, 10); let scalars = ::Config::generate_random(test_size * batch_size); // a version of batched msm without using `cfg.points_size`, requires copying bases let points_cloned: Vec> = 
std::iter::repeat(points.clone()) @@ -123,6 +132,7 @@ where cfg.ctx .stream = &stream; cfg.is_async = true; + cfg.large_bucket_factor = 2; msm(&scalars_h, &points_h, &cfg, &mut msm_results_1).unwrap(); msm(&scalars_h, &points_d, &cfg, &mut msm_results_2).unwrap(); @@ -170,15 +180,16 @@ where C::ScalarField: ArkConvertible::ScalarField>, C::BaseField: ArkConvertible::BaseField>, { - let test_sizes = [1 << 6, 1000]; - let test_threshold = 1 << 8; - let batch_sizes = [1, 3, 1 << 8]; + let test_sizes = [1 << 10, 10000]; + let test_threshold = 1 << 11; + let batch_sizes = [1, 3, 1 << 4]; let rng = &mut test_rng(); for test_size in test_sizes { for batch_size in batch_sizes { - let points = C::generate_random_affine_points(test_size * batch_size); + let points = generate_random_affine_points_with_zeroes(test_size * batch_size, 100); let mut scalars = vec![C::ScalarField::zero(); test_size * batch_size]; - for _ in 0..(test_size * batch_size / 2) { + + for _ in 0..(test_size * batch_size) { scalars[rng.gen_range(0..test_size * batch_size)] = C::ScalarField::one(); } for _ in test_threshold..test_size { diff --git a/wrappers/rust/icicle-cuda-runtime/Cargo.toml b/wrappers/rust/icicle-cuda-runtime/Cargo.toml index 245081bc..3bb3e40e 100644 --- a/wrappers/rust/icicle-cuda-runtime/Cargo.toml +++ b/wrappers/rust/icicle-cuda-runtime/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-cuda-runtime" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = [ "Ingonyama" ] description = "Ingonyama's Rust wrapper of CUDA runtime" diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml index 4d8cac00..c73ff5b7 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-bls12-377" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = [ "Ingonyama" ] description = "Rust wrapper for the CUDA implementation of BLS12-377 pairing friendly elliptic curve by Ingonyama" diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml index 03bf0550..f995e7ab 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-bls12-381" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = [ "Ingonyama" ] description = "Rust wrapper for the CUDA implementation of BLS12-381 pairing friendly elliptic curve by Ingonyama" diff --git a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml index f8b5703a..37bf1489 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-bn254" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = [ "Ingonyama" ] description = "Rust wrapper for the CUDA implementation of BN254 pairing friendly elliptic curve by Ingonyama" diff --git a/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml index 5d6d74ae..82819e19 100644 --- a/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "icicle-bw6-761" -version = "1.3.0" +version = "1.4.0" edition = "2021" authors = [ "Ingonyama" ] description = "Rust wrapper for 
the CUDA implementation of BW6-761 pairing friendly elliptic curve by Ingonyama"
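Taken together, the `Affine::zero()` addition, the zero-aware branches in the accumulation kernels, and the Rust tests that inject points at infinity all revolve around one rule: an affine input equal to `A::zero()` must be skipped during bucket accumulation rather than passed through `P::from_affine`. The stand-alone C++ sketch below restates that fold with toy stand-in types, only to make the control flow explicit; it is not library code and the stub arithmetic does not model curve operations.

```cpp
// Illustration only: stand-in types mimic the zero-aware accumulation rule
//   bucket = i ? (point == A::zero() ? bucket : bucket + point)
//              : (point == A::zero() ? P::zero() : P::from_affine(point));
// so that points at infinity never initialize or corrupt a bucket.
#include <cstddef>
#include <vector>

struct AffineStub {
  int x = 0, y = 0;
  bool is_zero() const { return x == 0 && y == 0; } // stand-in for point == A::zero()
};

struct ProjStub {
  int acc = 0; // toy accumulator standing in for a projective point
};

ProjStub from_affine(const AffineStub& p) { return {p.x + p.y}; }
ProjStub add(const ProjStub& b, const AffineStub& p) { return {b.acc + p.x + p.y}; }

ProjStub accumulate_bucket(const std::vector<AffineStub>& points)
{
  ProjStub bucket{};
  for (std::size_t i = 0; i < points.size(); i++) {
    const AffineStub& point = points[i];
    bucket = i ? (point.is_zero() ? bucket : add(bucket, point))
               : (point.is_zero() ? ProjStub{} : from_affine(point));
  }
  return bucket;
}
```

In the kernels the same branch runs per thread over a bucket's slice of `point_indices`, so a zero point simply contributes nothing to the running sum.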