From 8936d9c80028bce7d30e01a4213efbfa3c69bc02 Mon Sep 17 00:00:00 2001 From: HadarIngonyama <102164010+HadarIngonyama@users.noreply.github.com> Date: Mon, 17 Jun 2024 15:57:24 +0300 Subject: [PATCH] MSM - supporting all window sizes (#534) This PR enables using MSM with any value of c. Note: default c isn't necessarily optimal, the user is expected to choose c and the precomputation factor that give the best results for the relevant case. --------- Co-authored-by: Jeremy Felder --- icicle/include/api/bls12_377.h | 14 +- icicle/include/api/bls12_381.h | 14 +- icicle/include/api/bn254.h | 14 +- icicle/include/api/bw6_761.h | 14 +- icicle/include/api/grumpkin.h | 7 +- icicle/include/api/templates/curves/msm.h | 7 +- icicle/include/api/templates/curves/msm_g2.h | 7 +- icicle/include/msm/msm.cuh | 26 ++- icicle/src/msm/Makefile | 8 +- icicle/src/msm/extern.cu | 11 ++ icicle/src/msm/extern_g2.cu | 12 +- icicle/src/msm/msm.cu | 175 ++++++++++++------ icicle/src/msm/tests/msm_test.cu | 175 ++++++++++++------ wrappers/golang/core/msm.go | 9 +- .../golang/curves/bls12377/g2/include/msm.h | 1 + wrappers/golang/curves/bls12377/g2/msm.go | 20 +- .../golang/curves/bls12377/msm/include/msm.h | 1 + wrappers/golang/curves/bls12377/msm/msm.go | 20 +- .../curves/bls12377/tests/g2_msm_test.go | 16 +- .../golang/curves/bls12377/tests/msm_test.go | 16 +- .../golang/curves/bls12381/g2/include/msm.h | 1 + wrappers/golang/curves/bls12381/g2/msm.go | 20 +- .../golang/curves/bls12381/msm/include/msm.h | 1 + wrappers/golang/curves/bls12381/msm/msm.go | 20 +- .../curves/bls12381/tests/g2_msm_test.go | 16 +- .../golang/curves/bls12381/tests/msm_test.go | 16 +- wrappers/golang/curves/bn254/g2/include/msm.h | 1 + wrappers/golang/curves/bn254/g2/msm.go | 20 +- .../golang/curves/bn254/msm/include/msm.h | 1 + wrappers/golang/curves/bn254/msm/msm.go | 20 +- .../golang/curves/bn254/tests/g2_msm_test.go | 16 +- .../golang/curves/bn254/tests/msm_test.go | 16 +- .../golang/curves/bw6761/g2/include/msm.h | 1 + wrappers/golang/curves/bw6761/g2/msm.go | 20 +- .../golang/curves/bw6761/msm/include/msm.h | 1 + wrappers/golang/curves/bw6761/msm/msm.go | 20 +- .../golang/curves/bw6761/tests/g2_msm_test.go | 16 +- .../golang/curves/bw6761/tests/msm_test.go | 16 +- .../golang/curves/grumpkin/msm/include/msm.h | 1 + wrappers/golang/curves/grumpkin/msm/msm.go | 20 +- .../golang/curves/grumpkin/tests/msm_test.go | 16 +- .../generator/msm/templates/msm.go.tmpl | 20 +- .../generator/msm/templates/msm.h.tmpl | 1 + .../generator/msm/templates/msm_test.go.tmpl | 16 +- wrappers/rust/icicle-core/src/msm/mod.rs | 95 +++++++++- wrappers/rust/icicle-core/src/msm/tests.rs | 12 +- 46 files changed, 688 insertions(+), 282 deletions(-) diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index 7044dedb..bf6f8d1a 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -18,11 +18,8 @@ extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda( bls12_377::g2_affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bls12_377::g2_affine_t* output_bases); extern "C" cudaError_t bls12_377_g2_msm_cuda( @@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda( extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda( bls12_377::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + 
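// [editor's aside] This signature change repeats for every curve header in the
// patch: the five loose precompute parameters (bases_size, precompute_factor,
// _c, are_bases_on_device, ctx) collapse into the msm::MSMConfig that the MSM
// call already takes. A minimal call-site sketch (editor-written; aggregate-
// initializing the config with just the context and zeroed fields is an
// assumption -- any properly defaulted config works):
//
//   msm::MSMConfig cfg{ctx};             // remaining members zero/default
//   cfg.points_size = size;              // total unique points
//   cfg.precompute_factor = 4;
//   cfg.are_points_on_device = false;
//   bls12_377_precompute_msm_bases_cuda(bases, msm_size, cfg, output_bases);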
msm::MSMConfig& config, bls12_377::affine_t* output_bases); extern "C" cudaError_t bls12_377_msm_cuda( diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index 71d3efa2..8972349b 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -18,11 +18,8 @@ extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda( bls12_381::g2_affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bls12_381::g2_affine_t* output_bases); extern "C" cudaError_t bls12_381_g2_msm_cuda( @@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda( extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda( bls12_381::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bls12_381::affine_t* output_bases); extern "C" cudaError_t bls12_381_msm_cuda( diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 14f4479a..20d8e6d1 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -19,11 +19,8 @@ extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda( bn254::g2_affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bn254::g2_affine_t* output_bases); extern "C" cudaError_t bn254_g2_msm_cuda( @@ -31,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda( extern "C" cudaError_t bn254_precompute_msm_bases_cuda( bn254::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bn254::affine_t* output_bases); extern "C" cudaError_t bn254_msm_cuda( diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index 69d33f48..8d072218 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -18,11 +18,8 @@ extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda( bw6_761::g2_affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bw6_761::g2_affine_t* output_bases); extern "C" cudaError_t bw6_761_g2_msm_cuda( @@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda( extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda( bw6_761::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, bw6_761::affine_t* output_bases); extern "C" cudaError_t bw6_761_msm_cuda( diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index 09209dba..9caaddb1 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -17,11 +17,8 @@ extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda( grumpkin::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, grumpkin::affine_t* output_bases); extern "C" cudaError_t grumpkin_msm_cuda( diff --git a/icicle/include/api/templates/curves/msm.h b/icicle/include/api/templates/curves/msm.h index 7cf67eb1..8c89a3aa 100644 --- a/icicle/include/api/templates/curves/msm.h +++ 
b/icicle/include/api/templates/curves/msm.h @@ -1,10 +1,7 @@ extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda( ${CURVE}::affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, ${CURVE}::affine_t* output_bases); extern "C" cudaError_t ${CURVE}_msm_cuda( diff --git a/icicle/include/api/templates/curves/msm_g2.h b/icicle/include/api/templates/curves/msm_g2.h index 31aceb24..dd76a6c3 100644 --- a/icicle/include/api/templates/curves/msm_g2.h +++ b/icicle/include/api/templates/curves/msm_g2.h @@ -1,10 +1,7 @@ extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda( ${CURVE}::g2_affine_t* bases, - int bases_size, - int precompute_factor, - int _c, - bool are_bases_on_device, - device_context::DeviceContext& ctx, + int msm_size, + msm::MSMConfig& config, ${CURVE}::g2_affine_t* output_bases); extern "C" cudaError_t ${CURVE}_g2_msm_cuda( diff --git a/icicle/include/msm/msm.cuh b/icicle/include/msm/msm.cuh index a18800d2..1bd27a48 100644 --- a/icicle/include/msm/msm.cuh +++ b/icicle/include/msm/msm.cuh @@ -43,7 +43,7 @@ namespace msm { * points, it should be set to the product of MSM size and [batch_size](@ref * batch_size). Default value: 0 (meaning it's equal to the MSM size). */ int precompute_factor; /**< The number of extra points to pre-compute for each point. See the - * [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed + * [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed * there needs to be equal to the one used here. Larger values decrease the * number of computations to make, on-line memory footprint, but increase the static * memory footprint. Default value: 1 (i.e. don't pre-compute). */ @@ -52,7 +52,7 @@ namespace msm { * means more on-line memory footprint but also more parallelism and less computational * complexity (up to a certain point). Currently pre-computation is independent of * \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the - * [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical. + * [precompute_msm_points](@ref precompute_msm_points) function will need to be identical. * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */ int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field, * but if a different (better) upper bound is known, it should be reflected in this @@ -127,6 +127,26 @@ namespace msm { template cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results); + /** + * A function that precomputes MSM bases by extending them with their shifted copies. + * e.g.: + * Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$ + * Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size}, + * 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2cl}P_{size}, ... \f$ + * @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated. + * @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is + * the size of 1 MSM. + * @param config [MSMConfig](@ref MSMConfig) used in this MSM. + * @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended + * points. 
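[Editor's aside] To make the c / precompute_factor trade-off documented above concrete, here is a throwaway table generator (editor-written, not part of the patch). It uses the window count nof_bms = ceil(NBITS / c) from precompute_msm_points in msm.cu below; reading the shift formula there, the precomputed copies fold the windows down to roughly ceil(nof_bms / factor), at the cost of a factor-times larger point table:

#include <cstdio>
int main() {
  const unsigned nbits = 254;             // e.g. a BN254-sized scalar field
  const unsigned points = 1u << 20;       // 2^20-point MSM
  for (unsigned c : {10u, 13u, 16u, 19u}) {
    unsigned nof_bms = (nbits - 1) / c + 1;       // bucket modules (windows)
    for (unsigned factor : {1u, 2u, 4u}) {
      unsigned folded = (nof_bms - 1) / factor + 1;
      printf("c=%2u factor=%u: %2u windows -> %2u folded, table = %u points\n",
             c, factor, nof_bms, folded, points * factor);
    }
  }
  return 0;
}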
+ * @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine + * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point. + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + * + */ + template + cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points); + /** * A function that precomputes MSM bases by extending them with their shifted copies. * e.g.: @@ -148,7 +168,7 @@ namespace msm { * */ template - cudaError_t precompute_msm_bases( + [[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases( A* bases, int bases_size, int precompute_factor, diff --git a/icicle/src/msm/Makefile b/icicle/src/msm/Makefile index aed6c88c..b3cda9ca 100644 --- a/icicle/src/msm/Makefile +++ b/icicle/src/msm/Makefile @@ -1,4 +1,8 @@ +build_msm: + mkdir -p work + nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu + test_msm: mkdir -p work - nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu - work/test_msm + nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu + work/test_msm \ No newline at end of file diff --git a/icicle/src/msm/extern.cu b/icicle/src/msm/extern.cu index cab9186f..42bc31db 100644 --- a/icicle/src/msm/extern.cu +++ b/icicle/src/msm/extern.cu @@ -8,6 +8,17 @@ using namespace field_config; #include "utils/utils.h" namespace msm { + /** + * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of + * template parameters (where the curve is given by `-DCURVE` env variable during build): + * - `A` is the [affine representation](@ref affine_t) of curve points; + * @return `cudaSuccess` if the execution was successful and an error code otherwise. + */ + extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)( + affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points) + { + return precompute_msm_points(points, msm_size, config, output_points); + } /** * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of * template parameters (where the curve is given by `-DCURVE` env variable during build): diff --git a/icicle/src/msm/extern_g2.cu b/icicle/src/msm/extern_g2.cu index 077fb65d..1a84c66e 100644 --- a/icicle/src/msm/extern_g2.cu +++ b/icicle/src/msm/extern_g2.cu @@ -8,6 +8,17 @@ using namespace field_config; #include "utils/utils.h" namespace msm { + /** + * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of + * template parameters (where the curve is given by `-DCURVE` env variable during build): + * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points; + * @return `cudaSuccess` if the execution was successful and an error code otherwise. 
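[Editor's aside] A worked instance of the shift length l used in the extension above, matching the formula in precompute_msm_points in msm.cu below (shift = c * ceil(nof_bms / precompute_factor)); standalone and checkable:

#include <cassert>
#include <cstdio>
int main() {
  const unsigned nbits = 254, c = 16, factor = 4;   // BN254-sized example
  unsigned total_nof_bms = (nbits - 1) / c + 1;     // 16 windows
  unsigned shift = c * ((total_nof_bms - 1) / factor + 1);
  assert(shift == 64);  // extended table: P_i, 2^64 P_i, 2^128 P_i, 2^192 P_i
  for (unsigned i = 0; i < factor; i++)
    printf("copy %u is shifted by %u bits\n", i, i * shift);
  return 0;
}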
+ */ + extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_points_cuda)( + g2_affine_t* points, int msm_size, MSMConfig& config, g2_affine_t* output_points) + { + return precompute_msm_points(points, msm_size, config, output_points); + } /** * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of * template parameters (where the curve is given by `-DCURVE` env variable during build): @@ -26,7 +37,6 @@ namespace msm { return precompute_msm_bases( bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases); } - /** * Extern "C" version of [msm](@ref msm) function with the following values of template parameters * (where the curve is given by `-DCURVE` env variable during build): diff --git a/icicle/src/msm/msm.cu b/icicle/src/msm/msm.cu index 004b2bec..18a99f21 100644 --- a/icicle/src/msm/msm.cu +++ b/icicle/src/msm/msm.cu @@ -22,7 +22,6 @@ namespace msm { #define MAX_TH 256 - // #define SIGNED_DIG //WIP // #define SSM_SUM //WIP template @@ -74,10 +73,10 @@ namespace msm { unsigned* bucket_sizes_sum, unsigned* bucket_sizes, unsigned* large_bucket_thread_indices, - unsigned num_of_threads) + unsigned nof_threads) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_of_threads) { return; } + if (tid >= nof_threads) { return; } unsigned large_bucket_tid = large_bucket_thread_indices[tid]; unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid; @@ -91,12 +90,13 @@ namespace msm { P* v_r, unsigned block_size, unsigned write_stride, + unsigned buckets_per_bm, unsigned write_phase, unsigned step, - unsigned num_of_threads) + unsigned nof_threads) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_of_threads) { return; } + if (tid >= nof_threads) return; // we need shifted tid because we don't want to be reducing into zero buckets, this allows to skip them. // for write_phase==1, the read pattern is different so we don't skip over anything. @@ -110,7 +110,7 @@ namespace msm { const unsigned read_ind = block_size * shifted_block_id + block_tid; const unsigned write_ind = jump * shifted_block_id + block_tid; const unsigned v_r_key = - write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride + write_stride ? 
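// [editor's aside] Standalone reproduction of the write index computed just
// above (editor-written). The reading that the `* 2 + write_phase` term
// interleaves the two reduction phases -- phase 0 into the even
// write_stride-sized sub-blocks, phase 1 into the odd ones -- is inferred,
// not stated in the patch:
#include <cstdio>
static unsigned v_r_key(unsigned write_ind, unsigned write_stride,
                        unsigned buckets_per_bm, unsigned write_phase) {
  return write_stride
           ? ((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride +
               write_ind % buckets_per_bm
           : write_ind;  // no striding: write in place
}
int main() {
  for (unsigned phase = 0; phase < 2; phase++)
    for (unsigned i = 0; i < 8; i++)
      printf("phase=%u write_ind=%u -> %2u\n", phase, i, v_r_key(i, 4, 4, phase));
  return 0;
}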
((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride + write_ind % buckets_per_bm : write_ind; v_r[v_r_key] = v[read_ind] + v[read_ind + jump]; } @@ -325,32 +325,50 @@ namespace msm { } template - __global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums) + __global__ void last_pass_kernel( + const P* final_buckets, + P* final_sums, + unsigned nof_sums_per_batch, + unsigned batch_size, + unsigned nof_bms_per_batch, + unsigned orig_c) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= num_sums) return; - final_sums[tid] = final_buckets[2 * tid + 1]; + if (tid >= nof_sums_per_batch * batch_size) return; + unsigned batch_index = tid / nof_sums_per_batch; + unsigned batch_tid = tid % nof_sums_per_batch; + unsigned bm_index = batch_tid / orig_c; + unsigned bm_tid = batch_tid % orig_c; + for (unsigned c = orig_c; c > 1;) { + c = (c + 1) >> 1; + bm_index <<= 1; + if (bm_tid >= c) { + bm_index++; + bm_tid -= c; + } + } + final_sums[tid] = final_buckets[2 * (batch_index * nof_bms_per_batch + bm_index) + 1]; } // this kernel computes the final result using the double and add algorithm // it is done by a single thread template __global__ void final_accumulation_kernel( - const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned nof_empty_bms, unsigned c) + const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_results, unsigned c) { unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nof_msms) return; P final_result = P::zero(); // Note: in some cases accumulation of bm is implemented such that some bms are known to be empty. Therefore // skipping them. - for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) { - final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add - for (unsigned j = 0; j < c; j++) // double + for (unsigned i = nof_results; i > 1; i--) { + final_result = final_result + final_sums[i - 1 + tid * nof_results]; // add + for (unsigned j = 0; j < c; j++) // double { final_result = final_result + final_result; } } - final_results[tid] = final_result + final_sums[tid * nof_bms]; + final_results[tid] = final_result + final_sums[tid * nof_results]; } // this function computes msm using the bucket method @@ -717,7 +735,8 @@ namespace msm { CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream)); // --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window --- - unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumluation this may be >0 + unsigned nof_final_results_per_msm = + nof_bms_per_msm; // for big-triangle accumluation this is the number of bucket modules P* final_results; if (is_big_triangle || c == 1) { CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream)); @@ -726,62 +745,90 @@ namespace msm { NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS; big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms_in_batch, c); } else { + // the recursive reduction algorithm works with 2 types of reduction that can run on parallel streams + cudaStream_t stream_reduction; + cudaEvent_t event_finished_reduction; + CHK_IF_RETURN(cudaStreamCreate(&stream_reduction)); + CHK_IF_RETURN(cudaEventCreateWithFlags(&event_finished_reduction, cudaEventDisableTiming)); + unsigned source_bits_count = c; unsigned source_windows_count = nof_bms_per_msm; - unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; - unsigned 
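// [editor's aside] last_pass_kernel above undoes the window reordering left by
// the recursive reduction: every c-bit window is split into a low ceil(c/2)-bit
// child and a high floor(c/2)-bit child, recursively, so the final 1-bit
// windows are not in plain bit order. A standalone copy of its remapping loop
// (editor-written) shows where bit `bit` of an original window ends up, and
// where an odd c leaves empty padding windows:
#include <cstdio>
int main() {
  const unsigned orig_c = 5;  // an odd window size, now allowed
  for (unsigned bit = 0; bit < orig_c; bit++) {
    unsigned bm_index = 0, bm_tid = bit;
    for (unsigned c = orig_c; c > 1;) {
      c = (c + 1) >> 1;    // width of the low child
      bm_index <<= 1;
      if (bm_tid >= c) {   // bit lives in the high (odd-indexed) child
        bm_index++;
        bm_tid -= c;
      }
    }
    printf("bit %u -> final window %u\n", bit, bm_index);
  }
  return 0;  // prints 0,1,2,4,5: windows 3, 6, 7 stay empty (odd-c padding)
}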
target_windows_count = 0; + unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; // nof buckets per msm including zero buckets + unsigned target_windows_count; P* source_buckets = buckets; buckets = nullptr; P* target_buckets; P* temp_buckets1; P* temp_buckets2; for (unsigned i = 0;; i++) { - const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8 - target_windows_count = source_windows_count << 1; // nof bms*2 = 32 - const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 + const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up + target_windows_count = source_windows_count << 1; // twice the number of bms + const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c + CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); CHK_IF_RETURN(cudaMallocAsync( - &target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets + &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, + stream)); // for type1 reduction (strided, bottom window - evens) CHK_IF_RETURN(cudaMallocAsync( - &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets - CHK_IF_RETURN(cudaMallocAsync( - &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets + &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, + stream)); // for type2 reduction (serial, top window - odds) + initialize_buckets_kernel<<<(target_buckets_count * batch_size + 255) / 256, 256>>>( + target_buckets, target_buckets_count * batch_size); // initialization is needed for the odd c case - if (source_bits_count > 0) { - for (unsigned j = 0; j < target_bits_count; j++) { - const bool is_first_iter = (j == 0); - const bool is_last_iter = (j == target_bits_count - 1); - unsigned nof_threads = - (((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size; - NUM_THREADS = max(1, min(MAX_TH, nof_threads)); - NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; + for (unsigned j = 0; j < target_bits_count; j++) { + const bool is_first_iter = (j == 0); + const bool is_second_iter = (j == 1); + const bool is_last_iter = (j == target_bits_count - 1); + const bool is_odd_c = source_bits_count & 1; + unsigned nof_threads = + (((source_windows_count << target_bits_count) - source_windows_count) << (target_bits_count - 1 - j)) * + batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by nof threads + // per section + NUM_THREADS = max(1, min(MAX_TH, nof_threads)); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; + if (!is_odd_c || !is_first_iter) { // skip if c is odd and it's the first iteration single_stage_multi_reduction_kernel<<>>( - is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1, - 1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/, - (1 << target_bits_count) - 1, nof_threads); - - single_stage_multi_reduction_kernel<<>>( - is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2, - 1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/, + is_first_iter || (is_second_iter && is_odd_c) ? source_buckets : temp_buckets1, + is_last_iter ? 
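// [editor's aside] Self-contained skeleton of the cross-stream ordering used
// above (editor-written; trivial kernels stand in for the two reduction
// types). The disable-timing event makes one stream's later work wait on the
// other stream's writes without blocking the host:
#include <cuda_runtime.h>
__global__ void type1(int* buf) { buf[threadIdx.x] = threadIdx.x; }
__global__ void type2(int* buf) { buf[threadIdx.x] += 1; }
int main() {
  int* buf;
  cudaMalloc(&buf, 32 * sizeof(int));
  cudaStream_t stream, stream_reduction;
  cudaStreamCreate(&stream);
  cudaStreamCreate(&stream_reduction);
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);  // sync-only event
  type1<<<1, 32, 0, stream_reduction>>>(buf);  // work on the side stream
  cudaEventRecord(done, stream_reduction);
  cudaStreamWaitEvent(stream, done);           // order without a host sync
  type2<<<1, 32, 0, stream>>>(buf);            // safe: runs after type1
  cudaStreamSynchronize(stream);
  cudaFree(buf);
  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  cudaStreamDestroy(stream_reduction);
  return 0;
}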
target_buckets : temp_buckets1, 1 << (source_bits_count - j + (is_odd_c ? 1 : 0)), + is_last_iter ? 1 << target_bits_count : 0, 1 << target_bits_count, 0 /*=write_phase*/, (1 << target_bits_count) - 1, nof_threads); } + + nof_threads = (((source_windows_count << (source_bits_count - target_bits_count)) - source_windows_count) + << (target_bits_count - 1 - j)) * + batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by + // nof threads per section + NUM_THREADS = max(1, min(MAX_TH, nof_threads)); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; + single_stage_multi_reduction_kernel<<>>( + is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2, + 1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, + 1 << (target_bits_count - (is_odd_c ? 1 : 0)), 1 /*=write_phase*/, + (1 << (target_bits_count - (is_odd_c ? 1 : 0))) - 1, nof_threads); } + CHK_IF_RETURN(cudaEventRecord(event_finished_reduction, stream_reduction)); + CHK_IF_RETURN( + cudaStreamWaitEvent(stream, event_finished_reduction)); // sync streams after every write to target_buckets if (target_bits_count == 1) { // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed // to be empty when target_windows_count>bitsize. for example consider bitsize=253 and c=2. The reduction // ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253b. + // precomputation and odd c can cause additional empty windows. + nof_final_results_per_msm = min(c * nof_bms_per_msm, bitsize); nof_bms_per_msm = target_windows_count; - nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0; - nof_bms_in_batch = nof_bms_per_msm * batch_size; + unsigned total_nof_final_results = nof_final_results_per_msm * batch_size; + + CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * total_nof_final_results, stream)); - CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream)); NUM_THREADS = 32; - NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS; - last_pass_kernel<<>>(target_buckets, final_results, nof_bms_in_batch); + NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS; + last_pass_kernel<<>>( + target_buckets, final_results, nof_final_results_per_msm, batch_size, nof_bms_per_msm, c); c = 1; CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream)); CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream)); CHK_IF_RETURN(cudaFreeAsync(temp_buckets1, stream)); CHK_IF_RETURN(cudaFreeAsync(temp_buckets2, stream)); + CHK_IF_RETURN(cudaStreamDestroy(stream_reduction)); break; } CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream)); @@ -803,8 +850,8 @@ namespace msm { NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; // launch the double and add kernel, a single thread per batch element final_accumulation_kernel<<>>( - final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm, - nof_empty_bms_per_batch, c); + final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, + nof_final_results_per_msm, c); CHK_IF_RETURN(cudaFreeAsync(final_results, stream)); if (!are_results_on_device) @@ -829,13 +876,7 @@ namespace msm { const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize; cudaStream_t& stream = config.ctx.stream; - unsigned c = config.batch_size > 1 ? ((config.c == 0) ? 
get_optimal_c(msm_size) : config.c) : 16; - // reduce c to closest power of two (from below) if not using big_triangle reduction logic - // TODO: support arbitrary values of c - if (!config.is_big_triangle) { - while ((c & (c - 1)) != 0) - c &= (c - 1); - } + unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c; return CHK_STICKY(bucket_method_msm( bitsize, c, scalars, points, config.batch_size, msm_size, @@ -846,7 +887,33 @@ namespace msm { } template - cudaError_t precompute_msm_bases( + cudaError_t precompute_msm_points(A* points, int msm_size, MSMConfig& config, A* output_points) + { + CHK_INIT_IF_RETURN(); + + cudaStream_t& stream = config.ctx.stream; + unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c; + + CHK_IF_RETURN(cudaMemcpyAsync( + output_points, points, sizeof(A) * config.points_size, + config.are_points_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream)); + + unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1; + unsigned shift = c * ((total_nof_bms - 1) / config.precompute_factor + 1); + + unsigned NUM_THREADS = 1 << 8; + unsigned NUM_BLOCKS = (config.points_size + NUM_THREADS - 1) / NUM_THREADS; + for (int i = 1; i < config.precompute_factor; i++) { + left_shift_kernel<<>>( + &output_points[(i - 1) * config.points_size], shift, config.points_size, + &output_points[i * config.points_size]); + } + + return CHK_LAST(); + } + + template + [[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases( A* bases, int bases_size, int precompute_factor, diff --git a/icicle/src/msm/tests/msm_test.cu b/icicle/src/msm/tests/msm_test.cu index ad8a33d7..005a00c5 100644 --- a/icicle/src/msm/tests/msm_test.cu +++ b/icicle/src/msm/tests/msm_test.cu @@ -1,3 +1,9 @@ +#include "fields/id.h" +// #define FIELD_ID 2 +#define CURVE_ID 3 +#include "curves/curve_config.cuh" +// #include "fields/field_config.cuh" + #include "msm.cu" #include @@ -9,7 +15,7 @@ #include "curves/projective.cuh" #include "gpu-utils/device_context.cuh" -using namespace bn254; +// using namespace bn254; class Dummy_Scalar { @@ -111,20 +117,34 @@ public: // switch between dummy and real: -typedef scalar_t test_scalar; -typedef projective_t test_projective; -typedef affine_t test_affine; +// typedef scalar_t test_scalar; +// typedef projective_t test_projective; +// typedef affine_t test_affine; + +typedef curve_config::scalar_t test_scalar; +typedef curve_config::projective_t test_projective; +typedef curve_config::affine_t test_affine; // typedef Dummy_Scalar test_scalar; // typedef Dummy_Projective test_projective; // typedef Dummy_Projective test_affine; -int main() +int main(int argc, char** argv) { - int batch_size = 1; + cudaEvent_t start, stop; + float msm_time; + + int msm_log_size = (argc > 1) ? atoi(argv[1]) : 17; + int msm_size = 1 << msm_log_size; + int batch_size = (argc > 2) ? atoi(argv[2]) : 4; // unsigned msm_size = 1<<21; - int msm_size = 12180757; int N = batch_size * msm_size; + int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1; + int user_c = (argc > 4) ? 
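// [editor's aside] Note the non-power-of-two default c (15, just below):
// before this patch the non-big-triangle path silently rounded c down to a
// power of two via the removed loop above, so a requested c=15 actually ran
// with c=8. The old clamp, standalone (editor-written):
#include <cstdio>
static unsigned old_clamp(unsigned c) {
  while ((c & (c - 1)) != 0)
    c &= (c - 1);  // clear the lowest set bit until a power of two remains
  return c;
}
int main() {
  for (unsigned c : {15u, 16u, 17u, 31u})
    printf("requested c=%2u ran as c=%2u\n", c, old_clamp(c));
  return 0;  // 15->8, 16->16, 17->16, 31->16; the patch now uses c as given
}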
atoi(argv[4]) : 15; + + printf( + "running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n", CURVE_ID, msm_log_size, batch_size, + precomp_factor, user_c); test_scalar* scalars = new test_scalar[N]; test_affine* points = new test_affine[N]; @@ -136,7 +156,8 @@ int main() // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t)); // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective)); - test_projective large_res[2]; + test_projective res[batch_size]; + test_projective ref[batch_size]; // test_projective batched_large_res[batch_size]; // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point)); // fake_point batched_large_res[256]; @@ -149,13 +170,17 @@ int main() test_scalar* scalars_d; test_affine* points_d; - test_projective* large_res_d; + test_affine* precomp_points_d; + test_projective* res_d; + test_projective* ref_d; - cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size); - cudaMalloc(&points_d, sizeof(test_affine) * msm_size); - cudaMalloc(&large_res_d, sizeof(test_projective)); - cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice); - cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice); + cudaMalloc(&scalars_d, sizeof(test_scalar) * N); + cudaMalloc(&points_d, sizeof(test_affine) * N); + cudaMalloc(&precomp_points_d, sizeof(test_affine) * N * precomp_factor); + cudaMalloc(&res_d, sizeof(test_projective) * batch_size); + cudaMalloc(&ref_d, sizeof(test_projective) * batch_size); + cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice); + cudaMemcpy(points_d, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice); std::cout << "finished copying" << std::endl; @@ -170,65 +195,93 @@ int main() 0, // mempool }; msm::MSMConfig config = { - ctx, // DeviceContext - 0, // points_size - 1, // precompute_factor - 0, // c - 0, // bitsize - 10, // large_bucket_factor - 1, // batch_size - false, // are_scalars_on_device - false, // are_scalars_montgomery_form - false, // are_points_on_device - false, // are_points_montgomery_form - true, // are_results_on_device - false, // is_big_triangle - true, // is_async + ctx, // DeviceContext + N, // points_size + precomp_factor, // precompute_factor + user_c, // c + 0, // bitsize + 10, // large_bucket_factor + batch_size, // batch_size + false, // are_scalars_on_device + false, // are_scalars_montgomery_form + true, // are_points_on_device + false, // are_points_montgomery_form + true, // are_results_on_device + false, // is_big_triangle + true, // is_async + // false, // segments_reduction }; - auto begin1 = std::chrono::high_resolution_clock::now(); - msm::msm(scalars, points, msm_size, config, large_res_d); - cudaEvent_t msm_end_event; - cudaEventCreate(&msm_end_event); - auto end1 = std::chrono::high_resolution_clock::now(); - auto elapsed1 = std::chrono::duration_cast(end1 - begin1); - printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); + cudaEventCreate(&start); + cudaEventCreate(&stop); + + if (precomp_factor > 1) + msm::precompute_msm_points(points_d, msm_size, config, precomp_points_d); + + // warm up + msm::msm( + scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d); + cudaDeviceSynchronize(); + + // auto begin1 = std::chrono::high_resolution_clock::now(); + cudaEventRecord(start, stream); + msm::msm( + scalars, precomp_factor > 1 ? 
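// [editor's aside] The timing scheme used here -- a warm-up launch, then an
// event pair recorded on the MSM's own stream -- as a standalone skeleton
// (editor-written; a dummy kernel stands in for the MSM):
#include <cstdio>
#include <cuda_runtime.h>
__global__ void work() {}
int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  work<<<1, 1, 0, stream>>>();             // warm up (module load, clocks)
  cudaStreamSynchronize(stream);
  cudaEventRecord(start, stream);
  work<<<1, 1, 0, stream>>>();             // the timed launch
  cudaEventRecord(stop, stream);
  cudaStreamSynchronize(stream);           // ensure `stop` has completed
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);  // GPU-side elapsed milliseconds
  printf("kernel time: %.3f ms\n", ms);
  return 0;
}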
precomp_points_d : points_d, msm_size, config, res_d); + cudaEventRecord(stop, stream); + cudaStreamSynchronize(stream); + cudaEventElapsedTime(&msm_time, start, stop); + // cudaEvent_t msm_end_event; + // cudaEventCreate(&msm_end_event); + // auto end1 = std::chrono::high_resolution_clock::now(); + // auto elapsed1 = std::chrono::duration_cast(end1 - begin1); + printf("msm time : %.3f ms.\n", msm_time); + + // reference + config.c = 16; + config.precompute_factor = 1; config.is_big_triangle = true; - config.are_results_on_device = false; - cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost); - std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1]) - << std::endl; - auto begin = std::chrono::high_resolution_clock::now(); - msm::msm(scalars_d, points_d, msm_size, config, large_res); + config.batch_size = 1; + config.points_size = msm_size; + // config.segments_reduction = false; + for (int i = 0; i < batch_size; i++) { + msm::msm( + scalars + i * msm_size, points_d + i * msm_size, msm_size, config, ref_d + i); + } + + // config.are_results_on_device = false; + // std::cout << test_projective::to_affine(large_res[0]) << std::endl; + // auto begin = std::chrono::high_resolution_clock::now(); + // msm::MSM(scalars_d, points_d, msm_size, config, large_res); // test_reduce_triangle(scalars); // test_reduce_rectangle(scalars); // test_reduce_single(scalars); // test_reduce_var(scalars); - auto end = std::chrono::high_resolution_clock::now(); - auto elapsed = std::chrono::duration_cast(end - begin); - printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9); + // auto end = std::chrono::high_resolution_clock::now(); + // auto elapsed = std::chrono::duration_cast(end - begin); + // printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9); cudaStreamSynchronize(stream); cudaStreamDestroy(stream); + // std::cout << test_projective::to_affine(large_res[0]) << std::endl; + + cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost); + cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost); + // reference_msm(scalars, points, msm_size); // std::cout<<"final results batched large"< { ctx: &DeviceContext, output_bases: &mut DeviceSlice>, ) -> IcicleResult<()>; + + fn precompute_points_unchecked( + points: &(impl HostOrDeviceSlice> + ?Sized), + msm_size: i32, + cfg: &MSMConfig, + output_bases: &mut DeviceSlice>, + ) -> IcicleResult<()>; } /// Computes the multi-scalar multiplication, or MSM: `s1*P1 + s2*P2 + ... + sn*Pn`, or a batch of several MSMs. @@ -188,7 +195,7 @@ pub fn msm>( /// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size}, /// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2cl}P_{size}, ... \f$ /// -/// * `bases` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated. +/// * `points` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated. /// /// * `precompute_factor` - The number of total precomputed points for each base (including the base itself). /// @@ -201,6 +208,7 @@ pub fn msm>( /// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases. /// /// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise. 
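[Editor's aside] The `precompute_points` wrapper added below binds the `{curve}_precompute_msm_points_cuda` symbol defined in extern.cu earlier in this patch, so C++ callers can hit the same entry point directly. A minimal sketch for bn254 (editor-written; assumes device buffers and a populated MSMConfig `config` as in the CUDA test above; the declaration mirrors the extern "C" definition in extern.cu):

extern "C" cudaError_t bn254_precompute_msm_points_cuda(
  bn254::affine_t* points, int msm_size, msm::MSMConfig& config,
  bn254::affine_t* output_points);

cudaError_t err =
  bn254_precompute_msm_points_cuda(points_d, msm_size, config, precomp_points_d);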
+#[deprecated(since = "2.5.0", note = "Please use `precompute_points` instead")] pub fn precompute_bases>( points: &(impl HostOrDeviceSlice> + ?Sized), precompute_factor: i32, @@ -220,6 +228,55 @@ pub fn precompute_bases>( C::precompute_bases_unchecked(points, precompute_factor, _c, ctx, output_bases) } +/// A function that precomputes MSM bases by extending them with their shifted copies. +/// e.g.: +/// Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$ +/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size}, +/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2cl}P_{size}, ... \f$ +/// +/// * `points` - Base points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated. +/// +/// * `msm_size` - The size of a single MSM to compute. +/// +/// * `cfg` - config used to specify extra arguments of the MSM. NOTE: You should update the precompute_factor +/// and potentially the c value inside this config. This config should be the same config used in msm. +/// +/// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases. +/// +/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise. +pub fn precompute_points>( + points: &(impl HostOrDeviceSlice> + ?Sized), + msm_size: i32, + cfg: &MSMConfig, + output_bases: &mut DeviceSlice>, +) -> IcicleResult<()> { + let precompute_factor = cfg.precompute_factor; + assert_eq!( + output_bases.len(), + points.len() * (precompute_factor as usize), + "Precompute factor is probably incorrect: expected {} but got {}", + output_bases.len() / points.len(), + precompute_factor + ); + assert!(output_bases.is_on_device()); + + let ctx_device_id = cfg + .ctx + .device_id; + if let Some(device_id) = points.device_id() { + assert_eq!( + device_id, ctx_device_id, + "Device ids in points and context are different" + ); + } + check_device(ctx_device_id); + let mut local_cfg = cfg.clone(); + local_cfg.points_size = points.len() as i32; + local_cfg.are_points_on_device = points.is_on_device(); + + C::precompute_points_unchecked(points, msm_size, &local_cfg, output_bases) +} + #[macro_export] macro_rules! impl_msm { ( @@ -250,6 +307,14 @@ macro_rules! impl_msm { ctx: &DeviceContext, output_bases: *mut Affine<$curve>, ) -> CudaError; + + #[link_name = concat!($curve_prefix, "_precompute_msm_points_cuda")] + pub(crate) fn precompute_points_cuda( + points: *const Affine<$curve>, + msm_size: i32, + cfg: &MSMConfig, + output_bases: *mut Affine<$curve>, + ) -> CudaError; } } @@ -292,6 +357,23 @@ macro_rules! impl_msm { .wrap() } } + + fn precompute_points_unchecked( + points: &(impl HostOrDeviceSlice> + ?Sized), + msm_size: i32, + cfg: &MSMConfig, + output_bases: &mut DeviceSlice>, + ) -> IcicleResult<()> { + unsafe { + $curve_prefix_indent::precompute_points_cuda( + points.as_ptr(), + msm_size, + cfg, + output_bases.as_mut_ptr(), + ) + .wrap() + } + } } }; } @@ -363,7 +445,7 @@ macro_rules! impl_msm_bench { group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - use icicle_core::msm::precompute_bases; + use icicle_core::msm::precompute_points; use icicle_core::msm::tests::generate_random_affine_points_with_zeroes; use icicle_cuda_runtime::stream::CudaStream; @@ -396,11 +478,11 @@ macro_rules! 
impl_msm_bench { let points = generate_random_affine_points_with_zeroes(test_size, 10); for precompute_factor in [1, 4, 8] { let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap(); - precompute_bases( + cfg.precompute_factor = precompute_factor as i32; + precompute_points( HostSlice::from_slice(&points), - precompute_factor as i32, - 0, - &cfg.ctx, + test_size as i32, + &cfg, &mut precomputed_points_d, ) .unwrap(); @@ -419,7 +501,6 @@ macro_rules! impl_msm_bench { let scalars_h = HostSlice::from_slice(&scalars); let mut msm_results = DeviceVec::>::cuda_malloc(batch_size).unwrap(); - cfg.precompute_factor = precompute_factor as i32; let bench_descr = format!( " {} x {} with precomp = {:?}", diff --git a/wrappers/rust/icicle-core/src/msm/tests.rs b/wrappers/rust/icicle-core/src/msm/tests.rs index 27b23165..8cf8c39d 100644 --- a/wrappers/rust/icicle-core/src/msm/tests.rs +++ b/wrappers/rust/icicle-core/src/msm/tests.rs @@ -1,5 +1,5 @@ use crate::curve::{Affine, Curve, Projective}; -use crate::msm::{msm, precompute_bases, MSMConfig, MSM}; +use crate::msm::{msm, precompute_points, MSMConfig, MSM}; use crate::traits::{FieldImpl, GenerateRandom}; use icicle_cuda_runtime::device::{get_device_count, set_device, warmup}; use icicle_cuda_runtime::memory::{DeviceVec, HostSlice}; @@ -120,16 +120,16 @@ where cfg.is_async = true; cfg.large_bucket_factor = 5; cfg.c = 4; + let precompute_factor = 8; warmup(&stream).unwrap(); for test_size in test_sizes { - let precompute_factor = 8; let points = generate_random_affine_points_with_zeroes(test_size, 10); let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap(); - precompute_bases( + cfg.precompute_factor = precompute_factor as i32; + precompute_points( HostSlice::from_slice(&points), - precompute_factor as i32, - 0, - &cfg.ctx, + test_size as i32, + &cfg, &mut precomputed_points_d, ) .unwrap();
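[Editor's aside] Taken together, the intended flow after this patch is: pick c and the precompute factor, precompute the extended table once, then reuse it for the MSM and its batches. Condensed from tests/msm_test.cu above (same names; error handling omitted; template arguments for precompute_msm_points spelled out since they are not deducible from the call):

msm::MSMConfig config = {
  ctx,             // DeviceContext
  N,               // points_size (batch_size * msm_size)
  precomp_factor,  // precompute_factor: same value for precompute and msm
  user_c,          // c: any window size is accepted now
  0,               // bitsize (0 = scalar field bits)
  10,              // large_bucket_factor
  batch_size,      // batch_size
  false, false,    // scalars: on host, not in Montgomery form
  true,  false,    // points: on device, not in Montgomery form
  true,            // are_results_on_device
  false,           // is_big_triangle
  true,            // is_async
};
if (precomp_factor > 1)
  msm::precompute_msm_points<test_affine, test_projective>(
    points_d, msm_size, config, precomp_points_d);
msm::msm(
  scalars, precomp_factor > 1 ? precomp_points_d : points_d,
  msm_size, config, res_d);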