MSM - supporting all window sizes (#534)

This PR enables using MSM with any value of c.

Note: the default c isn't necessarily optimal; the user is expected to
choose the c and precomputation factor that give the best results for
the relevant case.
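For illustration, a minimal sketch (mirroring the updated tests/msm_test.cu in this PR; the specific values of `c` and `precompute_factor` are placeholders, not recommendations) of setting both knobs and running a precomputed MSM:

```cpp
// Sketch: pick a window size and precomputation factor explicitly, precompute
// the shifted points, then run MSM with the same config. `config`, the device
// buffers and the test_* type aliases are set up as in tests/msm_test.cu.
config.c = 15;                 // any window size is supported now
config.precompute_factor = 4;  // fewer windows on-line, more static memory
config.points_size = msm_size;

test_affine* precomp_points_d; // device buffer for msm_size * precompute_factor points
cudaMalloc(&precomp_points_d, sizeof(test_affine) * msm_size * config.precompute_factor);
msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);
msm::msm<test_scalar, test_affine, test_projective>(scalars, precomp_points_d, msm_size, config, res_d);
```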

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
HadarIngonyama
2024-06-17 15:57:24 +03:00
committed by GitHub
parent af9ec76506
commit 8936d9c800
46 changed files with 688 additions and 282 deletions

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(
extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
bls12_377::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::affine_t* output_bases);
extern "C" cudaError_t bls12_377_msm_cuda(

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(
extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
bls12_381::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::affine_t* output_bases);
extern "C" cudaError_t bls12_381_msm_cuda(

View File

@@ -19,11 +19,8 @@
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
bn254::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::g2_affine_t* output_bases);
extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -31,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(
extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
bn254::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::affine_t* output_bases);
extern "C" cudaError_t bn254_msm_cuda(

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::g2_affine_t* output_bases);
extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(
extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
bw6_761::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::affine_t* output_bases);
extern "C" cudaError_t bw6_761_msm_cuda(

View File

@@ -17,11 +17,8 @@
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
grumpkin::affine_t* output_bases);
extern "C" cudaError_t grumpkin_msm_cuda(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
${CURVE}::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_msm_cuda(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
${CURVE}::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::g2_affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_g2_msm_cuda(

View File

@@ -43,7 +43,7 @@ namespace msm {
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
* [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
* there needs to be equal to the one used here. Larger values decrease the
* number of computations to make, on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
@@ -52,7 +52,7 @@ namespace msm {
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
* [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical.
* [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
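As a back-of-the-envelope guide to the tradeoff described above (standard bucket-method accounting, not a measured claim): for b-bit scalars and window size c,

```math
\#\text{windows} = \left\lceil \tfrac{b}{c} \right\rceil, \qquad
\#\text{buckets} \approx \left\lceil \tfrac{b}{c} \right\rceil \cdot 2^{c},
```

so a larger c means fewer windows to accumulate but exponentially more bucket memory, which is why the best c depends on the MSM size.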
@@ -127,6 +127,26 @@ namespace msm {
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
/**
* A function that precomputes MSM points by extending them with their shifted copies.
* e.g.:
* Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
* Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
* 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
* @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
* the size of 1 MSM.
* @param config [MSMConfig](@ref MSMConfig) used in this MSM.
* @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended
* points.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points);
/**
* A function that precomputes MSM bases by extending them with their shifted copies.
* e.g.:
@@ -148,7 +168,7 @@ namespace msm {
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_bases(
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,

View File

@@ -1,4 +1,8 @@
build_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
test_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu
work/test_msm
nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
work/test_msm

View File

@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)(
affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points)
{
return precompute_msm_points<affine_t, projective_t>(points, msm_size, config, output_points);
}
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):

View File

@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_points_cuda)(
g2_affine_t* points, int msm_size, MSMConfig& config, g2_affine_t* output_points)
{
return precompute_msm_points<g2_affine_t, g2_projective_t>(points, msm_size, config, output_points);
}
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -26,7 +37,6 @@ namespace msm {
return precompute_msm_bases<g2_affine_t, g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):

View File

@@ -22,7 +22,6 @@ namespace msm {
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define SSM_SUM //WIP
template <typename A, typename P>
@@ -74,10 +73,10 @@ namespace msm {
unsigned* bucket_sizes_sum,
unsigned* bucket_sizes,
unsigned* large_bucket_thread_indices,
unsigned num_of_threads)
unsigned nof_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
if (tid >= nof_threads) { return; }
unsigned large_bucket_tid = large_bucket_thread_indices[tid];
unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
@@ -91,12 +90,13 @@ namespace msm {
P* v_r,
unsigned block_size,
unsigned write_stride,
unsigned buckets_per_bm,
unsigned write_phase,
unsigned step,
unsigned num_of_threads)
unsigned nof_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
if (tid >= nof_threads) return;
// we need a shifted tid because we don't want to reduce into zero buckets; this allows skipping them.
// for write_phase==1, the read pattern is different so we don't skip over anything.
@@ -110,7 +110,7 @@ namespace msm {
const unsigned read_ind = block_size * shifted_block_id + block_tid;
const unsigned write_ind = jump * shifted_block_id + block_tid;
const unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
write_stride ? ((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride + write_ind % buckets_per_bm
: write_ind;
v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
}
@@ -325,32 +325,50 @@ namespace msm {
}
template <typename P>
__global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums)
__global__ void last_pass_kernel(
const P* final_buckets,
P* final_sums,
unsigned nof_sums_per_batch,
unsigned batch_size,
unsigned nof_bms_per_batch,
unsigned orig_c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= num_sums) return;
final_sums[tid] = final_buckets[2 * tid + 1];
if (tid >= nof_sums_per_batch * batch_size) return;
unsigned batch_index = tid / nof_sums_per_batch;
unsigned batch_tid = tid % nof_sums_per_batch;
unsigned bm_index = batch_tid / orig_c;
unsigned bm_tid = batch_tid % orig_c;
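// walk down the window-splitting tree built by the reduction passes: each level
// splits a c-bit window into a low ceil(c/2)-bit part (even child) and the
// remaining high bits (odd child), recovering which 1-bit leaf window holds
// bit bm_tid of the original window bm_index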
for (unsigned c = orig_c; c > 1;) {
c = (c + 1) >> 1;
bm_index <<= 1;
if (bm_tid >= c) {
bm_index++;
bm_tid -= c;
}
}
final_sums[tid] = final_buckets[2 * (batch_index * nof_bms_per_batch + bm_index) + 1];
}
// this kernel computes the final result using the double and add algorithm
// it is done by a single thread
template <typename P, typename S>
__global__ void final_accumulation_kernel(
const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned nof_empty_bms, unsigned c)
const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_msms) return;
P final_result = P::zero();
// Note: in some cases bm accumulation is implemented such that some bms are known to be empty, so they are
// skipped.
for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
for (unsigned j = 0; j < c; j++) // double
for (unsigned i = nof_results; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_results]; // add
for (unsigned j = 0; j < c; j++) // double
{
final_result = final_result + final_result;
}
}
final_results[tid] = final_result + final_sums[tid * nof_bms];
final_results[tid] = final_result + final_sums[tid * nof_results];
}
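Reading the loop directly: with per-window sums S_0, ..., S_{n-1} (least significant first) and window size c, the kernel evaluates

```math
\text{result} = \sum_{w=0}^{n-1} 2^{cw} S_w
= 2^{c}\Bigl(\cdots 2^{c}\bigl(2^{c} S_{n-1} + S_{n-2}\bigr)\cdots + S_1\Bigr) + S_0
```

in Horner form, costing c(n-1) doublings and n additions per MSM result.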
// this function computes msm using the bucket method
@@ -717,7 +735,8 @@ namespace msm {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream));
// --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window ---
unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumulation this may be >0
unsigned nof_final_results_per_msm =
nof_bms_per_msm; // for big-triangle accumulation this is the number of bucket modules
P* final_results;
if (is_big_triangle || c == 1) {
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
@@ -726,62 +745,90 @@ namespace msm {
NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms_in_batch, c);
} else {
// the recursive reduction algorithm works with 2 types of reduction that can run on parallel streams
cudaStream_t stream_reduction;
cudaEvent_t event_finished_reduction;
CHK_IF_RETURN(cudaStreamCreate(&stream_reduction));
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_finished_reduction, cudaEventDisableTiming));
unsigned source_bits_count = c;
unsigned source_windows_count = nof_bms_per_msm;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
unsigned target_windows_count = 0;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; // nof buckets per msm including zero buckets
unsigned target_windows_count;
P* source_buckets = buckets;
buckets = nullptr;
P* target_buckets;
P* temp_buckets1;
P* temp_buckets2;
for (unsigned i = 0;; i++) {
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
target_windows_count = source_windows_count << 1; // nof bms*2 = 32
const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
target_windows_count = source_windows_count << 1; // twice the number of bms
const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream));
CHK_IF_RETURN(cudaMallocAsync(
&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets
&temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size,
stream)); // for type1 reduction (strided, bottom window - evens)
CHK_IF_RETURN(cudaMallocAsync(
&temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
CHK_IF_RETURN(cudaMallocAsync(
&temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
&temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size,
stream)); // for type2 reduction (serial, top window - odds)
initialize_buckets_kernel<<<(target_buckets_count * batch_size + 255) / 256, 256>>>(
target_buckets, target_buckets_count * batch_size); // initialization is needed for the odd c case
if (source_bits_count > 0) {
for (unsigned j = 0; j < target_bits_count; j++) {
const bool is_first_iter = (j == 0);
const bool is_last_iter = (j == target_bits_count - 1);
unsigned nof_threads =
(((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
for (unsigned j = 0; j < target_bits_count; j++) {
const bool is_first_iter = (j == 0);
const bool is_second_iter = (j == 1);
const bool is_last_iter = (j == target_bits_count - 1);
const bool is_odd_c = source_bits_count & 1;
unsigned nof_threads =
(((source_windows_count << target_bits_count) - source_windows_count) << (target_bits_count - 1 - j)) *
batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by nof threads
// per section
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
if (!is_odd_c || !is_first_iter) { // skip if c is odd and it's the first iteration
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
(1 << target_bits_count) - 1, nof_threads);
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
is_first_iter || (is_second_iter && is_odd_c) ? source_buckets : temp_buckets1,
is_last_iter ? target_buckets : temp_buckets1, 1 << (source_bits_count - j + (is_odd_c ? 1 : 0)),
is_last_iter ? 1 << target_bits_count : 0, 1 << target_bits_count, 0 /*=write_phase*/,
(1 << target_bits_count) - 1, nof_threads);
}
nof_threads = (((source_windows_count << (source_bits_count - target_bits_count)) - source_windows_count)
<< (target_bits_count - 1 - j)) *
batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by
// nof threads per section
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_reduction>>>(
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0,
1 << (target_bits_count - (is_odd_c ? 1 : 0)), 1 /*=write_phase*/,
(1 << (target_bits_count - (is_odd_c ? 1 : 0))) - 1, nof_threads);
}
CHK_IF_RETURN(cudaEventRecord(event_finished_reduction, stream_reduction));
CHK_IF_RETURN(
cudaStreamWaitEvent(stream, event_finished_reduction)); // sync streams after every write to target_buckets
if (target_bits_count == 1) {
// Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed
// to be empty when target_windows_count > bitsize. For example, consider bitsize=253 and c=2: the reduction
// ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253 bits.
// Precomputation and odd c can cause additional empty windows.
nof_final_results_per_msm = min(c * nof_bms_per_msm, bitsize);
nof_bms_per_msm = target_windows_count;
nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0;
nof_bms_in_batch = nof_bms_per_msm * batch_size;
unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * total_nof_final_results, stream));
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms_in_batch);
NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
target_buckets, final_results, nof_final_results_per_msm, batch_size, nof_bms_per_msm, c);
c = 1;
CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(temp_buckets1, stream));
CHK_IF_RETURN(cudaFreeAsync(temp_buckets2, stream));
CHK_IF_RETURN(cudaStreamDestroy(stream_reduction));
break;
}
CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
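To make the loop's shape concrete, a standalone host-side trace (a sketch under assumed inputs, not library code) of how each pass halves the window bit-count and doubles the bucket-module count:

```cpp
#include <cstdio>

// Trace of the recursive reduction schedule driven by the loop above:
// every pass halves the source window bit-count (rounded up) and doubles
// the number of bucket modules, stopping once 1-bit windows remain.
int main() {
  unsigned bits = 15, windows = 17; // assumption: 254-bit scalars, c = 15
  while (bits > 1) {
    unsigned target_bits = (bits + 1) >> 1; // half the bits, rounded up
    windows <<= 1;                          // twice the number of bms
    std::printf("%2u-bit -> %2u-bit windows, %3u bms\n", bits, target_bits, windows);
    bits = target_bits;
  }
  return 0; // prints 15->8 (34 bms), 8->4 (68), 4->2 (136), 2->1 (272)
}
```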
@@ -803,8 +850,8 @@ namespace msm {
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
// launch the double and add kernel, a single thread per batch element
final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm,
nof_empty_bms_per_batch, c);
final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size,
nof_final_results_per_msm, c);
CHK_IF_RETURN(cudaFreeAsync(final_results, stream));
if (!are_results_on_device)
@@ -829,13 +876,7 @@ namespace msm {
const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize;
cudaStream_t& stream = config.ctx.stream;
unsigned c = config.batch_size > 1 ? ((config.c == 0) ? get_optimal_c(msm_size) : config.c) : 16;
// reduce c to closest power of two (from below) if not using big_triangle reduction logic
// TODO: support arbitrary values of c
if (!config.is_big_triangle) {
while ((c & (c - 1)) != 0)
c &= (c - 1);
}
unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;
return CHK_STICKY(bucket_method_msm(
bitsize, c, scalars, points, config.batch_size, msm_size,
@@ -846,7 +887,33 @@ namespace msm {
}
template <typename A, typename P>
cudaError_t precompute_msm_bases(
cudaError_t precompute_msm_points(A* points, int msm_size, MSMConfig& config, A* output_points)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;
CHK_IF_RETURN(cudaMemcpyAsync(
output_points, points, sizeof(A) * config.points_size,
config.are_points_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream));
unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1;
unsigned shift = c * ((total_nof_bms - 1) / config.precompute_factor + 1);
unsigned NUM_THREADS = 1 << 8;
unsigned NUM_BLOCKS = (config.points_size + NUM_THREADS - 1) / NUM_THREADS;
for (int i = 1; i < config.precompute_factor; i++) {
left_shift_kernel<A, P><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
&output_points[(i - 1) * config.points_size], shift, config.points_size,
&output_points[i * config.points_size]);
}
return CHK_LAST();
}
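A quick numeric check of the shift formula above (the inputs are assumptions for illustration; 254 is SCALAR_FF_NBITS for bn254):

```cpp
#include <cstdio>

// Worked example of the per-copy shift computed by precompute_msm_points.
int main() {
  const unsigned scalar_nbits = 254;    // P::SCALAR_FF_NBITS (bn254)
  const unsigned c = 16;                // window size
  const unsigned precompute_factor = 2; // config.precompute_factor
  unsigned total_nof_bms = (scalar_nbits - 1) / c + 1;                // = 16 windows
  unsigned shift = c * ((total_nof_bms - 1) / precompute_factor + 1); // = 16 * 8 = 128
  std::printf("windows=%u shift=%u\n", total_nof_bms, shift);
  return 0; // each precomputed copy holds 2^128 * P_i relative to the previous one
}
```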
template <typename A, typename P>
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,

View File

@@ -1,3 +1,9 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"
#include "msm.cu"
#include <chrono>
@@ -9,7 +15,7 @@
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"
using namespace bn254;
// using namespace bn254;
class Dummy_Scalar
{
@@ -111,20 +117,34 @@ public:
// switch between dummy and real:
typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;
// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;
int main()
int main(int argc, char** argv)
{
int batch_size = 1;
cudaEvent_t start, stop;
float msm_time;
int msm_log_size = (argc > 1) ? atoi(argv[1]) : 17;
int msm_size = 1 << msm_log_size;
int batch_size = (argc > 2) ? atoi(argv[2]) : 4;
// unsigned msm_size = 1<<21;
int msm_size = 12180757;
int N = batch_size * msm_size;
int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1;
int user_c = (argc > 4) ? atoi(argv[4]) : 15;
printf(
"running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n", CURVE_ID, msm_log_size, batch_size,
precomp_factor, user_c);
test_scalar* scalars = new test_scalar[N];
test_affine* points = new test_affine[N];
@@ -136,7 +156,8 @@ int main()
// projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
// test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
test_projective large_res[2];
test_projective res[batch_size];
test_projective ref[batch_size];
// test_projective batched_large_res[batch_size];
// fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
// fake_point batched_large_res[256];
@@ -149,13 +170,17 @@ int main()
test_scalar* scalars_d;
test_affine* points_d;
test_projective* large_res_d;
test_affine* precomp_points_d;
test_projective* res_d;
test_projective* ref_d;
cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
cudaMalloc(&large_res_d, sizeof(test_projective));
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
cudaMalloc(&scalars_d, sizeof(test_scalar) * N);
cudaMalloc(&points_d, sizeof(test_affine) * N);
cudaMalloc(&precomp_points_d, sizeof(test_affine) * N * precomp_factor);
cudaMalloc(&res_d, sizeof(test_projective) * batch_size);
cudaMalloc(&ref_d, sizeof(test_projective) * batch_size);
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice);
std::cout << "finished copying" << std::endl;
@@ -170,65 +195,93 @@ int main()
0, // mempool
};
msm::MSMConfig config = {
ctx, // DeviceContext
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
ctx, // DeviceContext
N, // points_size
precomp_factor, // precompute_factor
user_c, // c
0, // bitsize
10, // large_bucket_factor
batch_size, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
true, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
// false, // segments_reduction
};
auto begin1 = std::chrono::high_resolution_clock::now();
msm::msm<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
cudaEvent_t msm_end_event;
cudaEventCreate(&msm_end_event);
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
cudaEventCreate(&start);
cudaEventCreate(&stop);
if (precomp_factor > 1)
msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);
// warm up
msm::msm<test_scalar, test_affine, test_projective>(
scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
cudaDeviceSynchronize();
// auto begin1 = std::chrono::high_resolution_clock::now();
cudaEventRecord(start, stream);
msm::msm<test_scalar, test_affine, test_projective>(
scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
cudaEventRecord(stop, stream);
cudaStreamSynchronize(stream);
cudaEventElapsedTime(&msm_time, start, stop);
// cudaEvent_t msm_end_event;
// cudaEventCreate(&msm_end_event);
// auto end1 = std::chrono::high_resolution_clock::now();
// auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("msm time : %.3f ms.\n", msm_time);
// reference
config.c = 16;
config.precompute_factor = 1;
config.is_big_triangle = true;
config.are_results_on_device = false;
cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1])
<< std::endl;
auto begin = std::chrono::high_resolution_clock::now();
msm::msm<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
config.batch_size = 1;
config.points_size = msm_size;
// config.segments_reduction = false;
for (int i = 0; i < batch_size; i++) {
msm::msm<test_scalar, test_affine, test_projective>(
scalars + i * msm_size, points_d + i * msm_size, msm_size, config, ref_d + i);
}
// config.are_results_on_device = false;
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
// auto begin = std::chrono::high_resolution_clock::now();
// msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
// test_reduce_triangle(scalars);
// test_reduce_rectangle(scalars);
// test_reduce_single(scalars);
// test_reduce_var(scalars);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
// printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
// std::cout<<"final results batched large"<<std::endl;
// bool success = true;
// for (unsigned i = 0; i < batch_size; i++)
// {
// std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
// if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
// std::cout<<"good"<<std::endl;
// }
// else{
// std::cout<<"miss"<<std::endl;
// std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
// success = false;
// }
// }
// if (success){
// std::cout<<"success!"<<std::endl;
// }
bool success = true;
for (unsigned i = 0; i < batch_size; i++) {
std::cout << test_projective::to_affine(res[i]) << std::endl;
if (test_projective::to_affine(res[i]) == test_projective::to_affine(ref[i])) {
std::cout << "good" << std::endl;
} else {
std::cout << "miss" << std::endl;
std::cout << test_projective::to_affine(ref[i]) << std::endl;
success = false;
}
}
if (success) { std::cout << "success!" << std::endl; }
// std::cout<<batched_large_res[0]<<std::endl;
// std::cout<<batched_large_res[1]<<std::endl;
@@ -239,4 +292,4 @@ int main()
// std::cout<<pr<<std::endl;
return 0;
}
}
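For reference, the reworked test binary takes its parameters from argv: e.g. `work/test_msm 17 4 2 15` runs a batch of 4 MSMs of size 2^17 with precompute factor 2 and c = 15 (defaults: 17, 4, 1, 15).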

View File

@@ -116,13 +116,13 @@ func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfi
return scalars.AsUnsafePointer(), points.AsUnsafePointer(), results.AsUnsafePointer(), size, unsafe.Pointer(cfg)
}
func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
func PrecomputePointsCheck(points HostOrDeviceSlice, cfg *MSMConfig, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
outputBasesLength, pointsLength := outputBases.Len(), points.Len()
if outputBasesLength != pointsLength*int(precomputeFactor) {
if outputBasesLength != pointsLength*int(cfg.PrecomputeFactor) {
errorString := fmt.Sprintf(
"Precompute factor is probably incorrect: expected %d but got %d",
outputBasesLength/pointsLength,
precomputeFactor,
cfg.PrecomputeFactor,
)
panic(errorString)
}
@@ -131,5 +131,8 @@ func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outp
points.(DeviceSlice).CheckDevice()
}
cfg.pointsSize = int32(pointsLength)
cfg.arePointsOnDevice = points.IsOnDevice()
return points.AsUnsafePointer(), outputBases.AsUnsafePointer()
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_377_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_377_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_377_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bls12_377_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_377_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_377_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_377_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bls12_377_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBls12_377.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBls12_377.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_381_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_381_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_381_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bls12_381_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_381_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_381_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_381_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bls12_381_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBls12_381.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBls12_381.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bn254_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bn254_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bn254_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bn254_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bn254_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bn254_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bn254_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bn254_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBn254.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBn254.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bw6_761_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bw6_761_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bw6_761_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bw6_761_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}
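For illustration, a minimal sketch of the new precompute-then-MSM flow against these G2 bindings (point/scalar setup and error handling are elided assumptions, not part of this PR):

cfg := g2.G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = 8 // set once; the same cfg must also be passed to G2Msm

// Room for PrecomputeFactor extended copies of every base point.
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(cfg.PrecomputeFactor), points[0].Size())

// The second argument is the size of a single MSM; for a non-batched call it equals points.Len().
e = g2.G2PrecomputePoints(points, points.Len(), &cfg, precomputeOut)

var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(p.Size(), p.Size()) // one projective result
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
_ = e // each call returns cr.CudaError; check against cr.CudaSuccess as in the tests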

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bw6_761_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bw6_761_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bw6_761_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bw6_761_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -170,9 +170,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBw6_761.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBw6_761.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t grumpkin_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t grumpkin_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t grumpkin_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.grumpkin_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}
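To make the migration from the deprecated API concrete, a hedged before/after sketch (msmSize, points, and precomputeOut are assumed to be set up as in the tests below; `0` in the old call requested the default `c`):

cfg := msm.GetDefaultMSMConfig()

// Before (deprecated): the precompute factor and c were explicit arguments.
e := msm.PrecomputeBases(points, 8, 0, &cfg.Ctx, precomputeOut)

// After: both travel inside the config, which must be the same config later passed to Msm.
cfg.PrecomputeFactor = 8
e = msm.PrecomputePoints(points, msmSize, &cfg, precomputeOut)
_ = e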

View File

@@ -64,9 +64,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -76,20 +78,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleGrumpkin.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleGrumpkin.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -28,8 +28,13 @@ func {{.CurvePrefix}}Msm(scalars core.HostOrDeviceSlice, points core.HostOrDevic
return err
}
// Deprecated: {{.CurvePrefix}}PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// {{.CurvePrefix}}PrecomputePoints should be used instead.
func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := {{.CurvePrefix}}GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFa
err := (cr.CudaError)(__ret)
return err
}
func {{.CurvePrefix}}PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.{{toCName .CurvePrefix}}affine_t)(outputBasesPointer)
__ret := C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_msm_cuda(const scalar_t* scalars,const {{toCName .CurvePrefix}}affine_t* points, int count, MSMConfig* config, {{toCName .CurvePrefix}}projective_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_bases_cuda({{toCName .CurvePrefix}}affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, {{toCName .CurvePrefix}}affine_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda({{toCName .CurvePrefix}}affine_t* points, int msm_size, MSMConfig* config, {{toCName .CurvePrefix}}affine_t* out);
#ifdef __cplusplus
}

View File

@@ -251,9 +251,11 @@ func TestMSM{{.CurvePrefix}}Batch(t *testing.T) {
}
}
func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {
func TestPrecomputePoints{{.CurvePrefix}}(t *testing.T) {
cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -263,20 +265,18 @@ func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -105,6 +105,13 @@ pub trait MSM<C: Curve> {
ctx: &DeviceContext,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;
fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;
}
/// Computes the multi-scalar multiplication, or MSM: `s1*P1 + s2*P2 + ... + sn*Pn`, or a batch of several MSMs.
@@ -188,7 +195,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `bases` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
/// * `points` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `precompute_factor` - The number of total precomputed points for each base (including the base itself).
///
@@ -201,6 +208,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
#[deprecated(since = "2.5.0", note = "Please use `precompute_points` instead")]
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
precompute_factor: i32,
@@ -220,6 +228,55 @@ pub fn precompute_bases<C: Curve + MSM<C>>(
C::precompute_bases_unchecked(points, precompute_factor, _c, ctx, output_bases)
}
/// A function that precomputes MSM bases by extending them with their shifted copies.
/// e.g.:
/// Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `points` - Base points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `msm_size` - The size of a single MSM to compute.
///
/// * `cfg` - config used to specify extra arguments of the MSM. NOTE: set the `precompute_factor`
/// (and, if a non-default window size is wanted, the `c` value) in this config before calling; the same config must then be passed to `msm`.
///
/// * `output_bases` - Device-allocated buffer of size `points.len()` * `precompute_factor` for the extended points.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
pub fn precompute_points<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()> {
let precompute_factor = cfg.precompute_factor;
assert_eq!(
output_bases.len(),
points.len() * (precompute_factor as usize),
"Precompute factor is probably incorrect: expected {} but got {}",
output_bases.len() / points.len(),
precompute_factor
);
assert!(output_bases.is_on_device());
let ctx_device_id = cfg
.ctx
.device_id;
if let Some(device_id) = points.device_id() {
assert_eq!(
device_id, ctx_device_id,
"Device ids in points and context are different"
);
}
check_device(ctx_device_id);
let mut local_cfg = cfg.clone();
local_cfg.points_size = points.len() as i32;
local_cfg.are_points_on_device = points.is_on_device();
C::precompute_points_unchecked(points, msm_size, &local_cfg, output_bases)
}
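As a worked toy instance of the extension scheme described in the doc comment above (illustrative numbers only; the choice of the shift $l$ is internal to the implementation):

$$\text{precompute\_factor} = 3,\quad (P_0, P_1) \;\longmapsto\; (P_0,\, P_1,\; 2^{l}P_0,\, 2^{l}P_1,\; 2^{2l}P_0,\, 2^{2l}P_1)$$

Splitting a scalar as $s = a + 2^{l}b + 2^{2l}d$ then gives $sP_i = aP_i + b\,(2^{l}P_i) + d\,(2^{2l}P_i)$, so the high windows reuse the same low-window bucket accumulation against the precomputed copies.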
#[macro_export]
macro_rules! impl_msm {
(
@@ -250,6 +307,14 @@ macro_rules! impl_msm {
ctx: &DeviceContext,
output_bases: *mut Affine<$curve>,
) -> CudaError;
#[link_name = concat!($curve_prefix, "_precompute_msm_points_cuda")]
pub(crate) fn precompute_points_cuda(
points: *const Affine<$curve>,
msm_size: i32,
cfg: &MSMConfig,
output_bases: *mut Affine<$curve>,
) -> CudaError;
}
}
@@ -292,6 +357,23 @@ macro_rules! impl_msm {
.wrap()
}
}
fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<$curve>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<$curve>>,
) -> IcicleResult<()> {
unsafe {
$curve_prefix_indent::precompute_points_cuda(
points.as_ptr(),
msm_size,
cfg,
output_bases.as_mut_ptr(),
)
.wrap()
}
}
}
};
}
@@ -363,7 +445,7 @@ macro_rules! impl_msm_bench {
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
use icicle_core::msm::precompute_bases;
use icicle_core::msm::precompute_points;
use icicle_core::msm::tests::generate_random_affine_points_with_zeroes;
use icicle_cuda_runtime::stream::CudaStream;
@@ -396,11 +478,11 @@ macro_rules! impl_msm_bench {
let points = generate_random_affine_points_with_zeroes(test_size, 10);
for precompute_factor in [1, 4, 8] {
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();
@@ -419,7 +501,6 @@ macro_rules! impl_msm_bench {
let scalars_h = HostSlice::from_slice(&scalars);
let mut msm_results = DeviceVec::<Projective<C>>::cuda_malloc(batch_size).unwrap();
cfg.precompute_factor = precompute_factor as i32;
let bench_descr = format!(
" {} x {} with precomp = {:?}",

View File

@@ -1,5 +1,5 @@
use crate::curve::{Affine, Curve, Projective};
use crate::msm::{msm, precompute_bases, MSMConfig, MSM};
use crate::msm::{msm, precompute_points, MSMConfig, MSM};
use crate::traits::{FieldImpl, GenerateRandom};
use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
@@ -120,16 +120,16 @@ where
cfg.is_async = true;
cfg.large_bucket_factor = 5;
cfg.c = 4;
let precompute_factor = 8;
warmup(&stream).unwrap();
for test_size in test_sizes {
let precompute_factor = 8;
let points = generate_random_affine_points_with_zeroes(test_size, 10);
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();