MSM - supporting all window sizes (#534)
This PR enables running MSM with any value of c. Note: the default c isn't necessarily optimal; the user is expected to choose the c and precomputation factor that give the best results for the relevant case.

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
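A minimal host-side sketch of the new knobs, adapted from the test updated later in this diff (field order follows msm::MSMConfig as changed in this PR; the values and variable names are illustrative):

  // any window size c is now accepted, not only powers of two
  msm::MSMConfig config = {
    ctx,            // DeviceContext
    N,              // points_size: batch_size * msm_size unique points
    precomp_factor, // precompute_factor: must match the precompute_msm_points call
    user_c,         // c: user-chosen window size, e.g. 15
    0,              // bitsize: 0 means "use the scalar field's bit count"
    10,             // large_bucket_factor
    batch_size,     // batch_size
    false,          // are_scalars_on_device
    false,          // are_scalars_montgomery_form
    true,           // are_points_on_device
    false,          // are_points_montgomery_form
    true,           // are_results_on_device
    false,          // is_big_triangle
    true,           // is_async
  };
  if (precomp_factor > 1)
    msm::precompute_msm_points<affine_t, projective_t>(points_d, msm_size, config, precomp_points_d);
  msm::msm<scalar_t, affine_t, projective_t>(
    scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);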
@@ -18,11 +18,8 @@

extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
  bls12_377::g2_affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bls12_377::g2_affine_t* output_bases);

extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(

extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
  bls12_377::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bls12_377::affine_t* output_bases);

extern "C" cudaError_t bls12_377_msm_cuda(
@@ -18,11 +18,8 @@

extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
  bls12_381::g2_affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bls12_381::g2_affine_t* output_bases);

extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(

extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
  bls12_381::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bls12_381::affine_t* output_bases);

extern "C" cudaError_t bls12_381_msm_cuda(
@@ -19,11 +19,8 @@

extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
  bn254::g2_affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bn254::g2_affine_t* output_bases);

extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -31,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(

extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
  bn254::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bn254::affine_t* output_bases);

extern "C" cudaError_t bn254_msm_cuda(
@@ -18,11 +18,8 @@

extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
  bw6_761::g2_affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bw6_761::g2_affine_t* output_bases);

extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(

extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
  bw6_761::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  bw6_761::affine_t* output_bases);

extern "C" cudaError_t bw6_761_msm_cuda(
@@ -17,11 +17,8 @@

extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
  grumpkin::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  grumpkin::affine_t* output_bases);

extern "C" cudaError_t grumpkin_msm_cuda(
@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
  ${CURVE}::affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  ${CURVE}::affine_t* output_bases);

extern "C" cudaError_t ${CURVE}_msm_cuda(
@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
  ${CURVE}::g2_affine_t* bases,
  int bases_size,
  int precompute_factor,
  int _c,
  bool are_bases_on_device,
  device_context::DeviceContext& ctx,
  int msm_size,
  msm::MSMConfig& config,
  ${CURVE}::g2_affine_t* output_bases);

extern "C" cudaError_t ${CURVE}_g2_msm_cuda(
@@ -43,7 +43,7 @@ namespace msm {
   * points, it should be set to the product of MSM size and [batch_size](@ref
   * batch_size). Default value: 0 (meaning it's equal to the MSM size). */
  int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
   * [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
   * [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
   * there needs to be equal to the one used here. Larger values decrease the
   * number of computations and the on-line memory footprint, but increase the static
   * memory footprint. Default value: 1 (i.e. don't pre-compute). */
@@ -52,7 +52,7 @@ namespace msm {
   * means more on-line memory footprint but also more parallelism and less computational
   * complexity (up to a certain point). Currently pre-computation is independent of
   * \f$ c \f$, however in the future the value of \f$ c \f$ here and the one passed into the
   * [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical.
   * [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
   * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
  int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of the scalar field,
   * but if a different (better) upper bound is known, it should be reflected in this
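For intuition, a sketch of the trade-off described above: with b-bit scalars and window size c, the bucket method uses

\[ n_{\mathrm{windows}} = \left\lceil \frac{b}{c} \right\rceil, \qquad n_{\mathrm{buckets}} \approx \left\lceil \frac{b}{c} \right\rceil \cdot 2^{c}, \]

so increasing c shrinks the number of windows (less final accumulation work) while growing bucket memory exponentially.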
@@ -127,6 +127,26 @@ namespace msm {
  template <typename S, typename A, typename P>
  cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);

  /**
   * A function that precomputes MSM bases by extending them with their shifted copies.
   * e.g.:
   * Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
   * Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
   * 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
   * @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
   * @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
   * the size of 1 MSM.
   * @param config [MSMConfig](@ref MSMConfig) used in this MSM.
   * @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended
   * points.
   * @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
   * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename A, typename P>
  cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points);
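Concretely, output_points is laid out as precompute_factor consecutive blocks of config.points_size points each; for precompute_factor = 3 (an illustrative value) the layout is

\[ P_0, \dots, P_{n-1}, \;\; 2^{l}P_0, \dots, 2^{l}P_{n-1}, \;\; 2^{2l}P_0, \dots, 2^{2l}P_{n-1}, \]

which lets msm() apply windows that are l bits apart to precomputed multiples instead of handling them as separate bucket modules; this is why the precompute_factor used here must equal the one in the config, as the config comment above notes.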
  /**
   * A function that precomputes MSM bases by extending them with their shifted copies.
   * e.g.:
@@ -148,7 +168,7 @@ namespace msm {
   *
   */
  template <typename A, typename P>
  cudaError_t precompute_msm_bases(
  [[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
    A* bases,
    int bases_size,
    int precompute_factor,
@@ -1,4 +1,8 @@
build_msm:
    mkdir -p work
    nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu

test_msm:
    mkdir -p work
    nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu
    work/test_msm
    nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
    work/test_msm
@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"

namespace msm {
  /**
   * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
   * - `A` is the [affine representation](@ref affine_t) of curve points;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)(
    affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points)
  {
    return precompute_msm_points<affine_t, projective_t>(points, msm_size, config, output_points);
  }
  /**
   * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"

namespace msm {
  /**
   * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
   * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_points_cuda)(
    g2_affine_t* points, int msm_size, MSMConfig& config, g2_affine_t* output_points)
  {
    return precompute_msm_points<g2_affine_t, g2_projective_t>(points, msm_size, config, output_points);
  }
  /**
   * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -26,7 +37,6 @@ namespace msm {
    return precompute_msm_bases<g2_affine_t, g2_projective_t>(
      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
  }

  /**
   * Extern "C" version of [msm](@ref msm) function with the following values of template parameters
   * (where the curve is given by `-DCURVE` env variable during build):
@@ -22,7 +22,6 @@ namespace msm {

#define MAX_TH 256

// #define SIGNED_DIG //WIP
// #define SSM_SUM //WIP

  template <typename A, typename P>
@@ -74,10 +73,10 @@ namespace msm {
    unsigned* bucket_sizes_sum,
    unsigned* bucket_sizes,
    unsigned* large_bucket_thread_indices,
    unsigned num_of_threads)
    unsigned nof_threads)
  {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_of_threads) { return; }
    if (tid >= nof_threads) { return; }

    unsigned large_bucket_tid = large_bucket_thread_indices[tid];
    unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
@@ -91,12 +90,13 @@ namespace msm {
    P* v_r,
    unsigned block_size,
    unsigned write_stride,
    unsigned buckets_per_bm,
    unsigned write_phase,
    unsigned step,
    unsigned num_of_threads)
    unsigned nof_threads)
  {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_of_threads) { return; }
    if (tid >= nof_threads) return;

    // we need shifted tid because we don't want to be reducing into zero buckets, this allows to skip them.
    // for write_phase==1, the read pattern is different so we don't skip over anything.
@@ -110,7 +110,7 @@ namespace msm {
    const unsigned read_ind = block_size * shifted_block_id + block_tid;
    const unsigned write_ind = jump * shifted_block_id + block_tid;
    const unsigned v_r_key =
      write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
      write_stride ? ((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride + write_ind % buckets_per_bm
                   : write_ind;
    v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
  }
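The pairwise reduction above performs one level of window splitting. For even c, the identity behind it (this editor's notation, not taken from the source) is

\[ \sum_{j=0}^{2^{c}-1} j\,B_j \;=\; \sum_{l=0}^{2^{c/2}-1} l \Big( \sum_{h} B_{h \cdot 2^{c/2} + l} \Big) \;+\; 2^{c/2} \sum_{h=0}^{2^{c/2}-1} h \Big( \sum_{l} B_{h \cdot 2^{c/2} + l} \Big), \]

i.e. one window of c bits with buckets \( B_j \) becomes two windows of c/2 bits: one sums buckets sharing the low half of the index (the "bottom window - evens" in the allocation comments further below) and one sums buckets sharing the high half (the "top window - odds").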
@@ -325,32 +325,50 @@ namespace msm {
  }

  template <typename P>
  __global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums)
  __global__ void last_pass_kernel(
    const P* final_buckets,
    P* final_sums,
    unsigned nof_sums_per_batch,
    unsigned batch_size,
    unsigned nof_bms_per_batch,
    unsigned orig_c)
  {
    unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid >= num_sums) return;
    final_sums[tid] = final_buckets[2 * tid + 1];
    if (tid >= nof_sums_per_batch * batch_size) return;
    unsigned batch_index = tid / nof_sums_per_batch;
    unsigned batch_tid = tid % nof_sums_per_batch;
    unsigned bm_index = batch_tid / orig_c;
    unsigned bm_tid = batch_tid % orig_c;
    // walk down the window-splitting tree: each halving turns a window of c bits into two children of
    // ceil(c/2) bits, so the leaf window holding bit position bm_tid of the original window is found by
    // descending left (low half) or right (high half) until 1-bit windows remain
    for (unsigned c = orig_c; c > 1;) {
      c = (c + 1) >> 1;
      bm_index <<= 1;
      if (bm_tid >= c) {
        bm_index++;
        bm_tid -= c;
      }
    }
    final_sums[tid] = final_buckets[2 * (batch_index * nof_bms_per_batch + bm_index) + 1];
  }

  // this kernel computes the final result using the double and add algorithm
  // it is done by a single thread
  template <typename P, typename S>
  __global__ void final_accumulation_kernel(
    const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned nof_empty_bms, unsigned c)
    const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
  {
    unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid >= nof_msms) return;
    P final_result = P::zero();
    // Note: in some cases accumulation of bms is implemented such that some bms are known to be empty, so they
    // are skipped.
    for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) {
      final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
      for (unsigned j = 0; j < c; j++) // double
    for (unsigned i = nof_results; i > 1; i--) {
      final_result = final_result + final_sums[i - 1 + tid * nof_results]; // add
      for (unsigned j = 0; j < c; j++) // double
      {
        final_result = final_result + final_result;
      }
    }
    final_results[tid] = final_result + final_sums[tid * nof_bms];
    final_results[tid] = final_result + final_sums[tid * nof_results];
  }
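The double-and-add loop above is Horner evaluation of the per-window sums \( S_i \) in radix \( 2^{c} \):

\[ R \;=\; \sum_{i=0}^{n-1} 2^{c\,i} S_i \;=\; \big( \cdots (S_{n-1} \cdot 2^{c} + S_{n-2}) \cdot 2^{c} + \cdots \big) \cdot 2^{c} + S_0, \]

with n = nof_results and each multiplication by \( 2^{c} \) realized as c point doublings.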
  // this function computes msm using the bucket method
@@ -717,7 +735,8 @@ namespace msm {
    CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream));

    // --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window ---
    unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumulation this may be >0
    unsigned nof_final_results_per_msm =
      nof_bms_per_msm; // for big-triangle accumulation this is the number of bucket modules
    P* final_results;
    if (is_big_triangle || c == 1) {
      CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
@@ -726,62 +745,90 @@ namespace msm {
      NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
      big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms_in_batch, c);
    } else {
      // the recursive reduction algorithm works with 2 types of reduction that can run on parallel streams
      cudaStream_t stream_reduction;
      cudaEvent_t event_finished_reduction;
      CHK_IF_RETURN(cudaStreamCreate(&stream_reduction));
      CHK_IF_RETURN(cudaEventCreateWithFlags(&event_finished_reduction, cudaEventDisableTiming));

      unsigned source_bits_count = c;
      unsigned source_windows_count = nof_bms_per_msm;
      unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
      unsigned target_windows_count = 0;
      unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; // nof buckets per msm including zero buckets
      unsigned target_windows_count;
      P* source_buckets = buckets;
      buckets = nullptr;
      P* target_buckets;
      P* temp_buckets1;
      P* temp_buckets2;
      for (unsigned i = 0;; i++) {
        const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
        target_windows_count = source_windows_count << 1; // nof bms*2 = 32
        const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
        const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
        target_windows_count = source_windows_count << 1; // twice the number of bms
        const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
        CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream));
        CHK_IF_RETURN(cudaMallocAsync(
          &target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets
          &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size,
          stream)); // for type1 reduction (strided, bottom window - evens)
        CHK_IF_RETURN(cudaMallocAsync(
          &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
        CHK_IF_RETURN(cudaMallocAsync(
          &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
          &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size,
          stream)); // for type2 reduction (serial, top window - odds)
        initialize_buckets_kernel<<<(target_buckets_count * batch_size + 255) / 256, 256>>>(
          target_buckets, target_buckets_count * batch_size); // initialization is needed for the odd c case

        if (source_bits_count > 0) {
          for (unsigned j = 0; j < target_bits_count; j++) {
            const bool is_first_iter = (j == 0);
            const bool is_last_iter = (j == target_bits_count - 1);
            unsigned nof_threads =
              (((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
          for (unsigned j = 0; j < target_bits_count; j++) {
            const bool is_first_iter = (j == 0);
            const bool is_second_iter = (j == 1);
            const bool is_last_iter = (j == target_bits_count - 1);
            const bool is_odd_c = source_bits_count & 1;
            unsigned nof_threads =
              (((source_windows_count << target_bits_count) - source_windows_count) << (target_bits_count - 1 - j)) *
              batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by nof threads
                          // per section
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
            if (!is_odd_c || !is_first_iter) { // skip if c is odd and it's the first iteration
              single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
                is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
                1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
                (1 << target_bits_count) - 1, nof_threads);

              single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
                is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
                1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
                is_first_iter || (is_second_iter && is_odd_c) ? source_buckets : temp_buckets1,
                is_last_iter ? target_buckets : temp_buckets1, 1 << (source_bits_count - j + (is_odd_c ? 1 : 0)),
                is_last_iter ? 1 << target_bits_count : 0, 1 << target_bits_count, 0 /*=write_phase*/,
                (1 << target_bits_count) - 1, nof_threads);
            }

            nof_threads = (((source_windows_count << (source_bits_count - target_bits_count)) - source_windows_count)
                           << (target_bits_count - 1 - j)) *
                          batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by
                                      // nof threads per section
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
            single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_reduction>>>(
              is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
              1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0,
              1 << (target_bits_count - (is_odd_c ? 1 : 0)), 1 /*=write_phase*/,
              (1 << (target_bits_count - (is_odd_c ? 1 : 0))) - 1, nof_threads);
          }
          CHK_IF_RETURN(cudaEventRecord(event_finished_reduction, stream_reduction));
          CHK_IF_RETURN(
            cudaStreamWaitEvent(stream, event_finished_reduction)); // sync streams after every write to target_buckets
        if (target_bits_count == 1) {
          // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed
          // to be empty when target_windows_count>bitsize. for example consider bitsize=253 and c=2. The reduction
          // ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253b.
          // precomputation and odd c can cause additional empty windows.
          nof_final_results_per_msm = min(c * nof_bms_per_msm, bitsize);
          nof_bms_per_msm = target_windows_count;
          nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0;
          nof_bms_in_batch = nof_bms_per_msm * batch_size;
          unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;

          CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * total_nof_final_results, stream));

          CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
          NUM_THREADS = 32;
          NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
          last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms_in_batch);
          NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
          last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
            target_buckets, final_results, nof_final_results_per_msm, batch_size, nof_bms_per_msm, c);
          c = 1;
          CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
          CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream));
          CHK_IF_RETURN(cudaFreeAsync(temp_buckets1, stream));
          CHK_IF_RETURN(cudaFreeAsync(temp_buckets2, stream));
          CHK_IF_RETURN(cudaStreamDestroy(stream_reduction));
          break;
        }
        CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
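Each pass of the loop above halves the window bit-width and doubles the window count, so bucket memory shrinks geometrically. A sketch for c = 16 with w windows (the "32*2^8"-style comments correspond to w = 16):

\[ w \cdot 2^{16} \;\to\; 2w \cdot 2^{8} \;\to\; 4w \cdot 2^{4} \;\to\; 8w \cdot 2^{2} \;\to\; 16w \cdot 2^{1} \text{ buckets}, \]

after which last_pass_kernel extracts one bucket per 1-bit window and final_accumulation_kernel combines them.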
@@ -803,8 +850,8 @@ namespace msm {
    NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
    // launch the double and add kernel, a single thread per batch element
    final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
      final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm,
      nof_empty_bms_per_batch, c);
      final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size,
      nof_final_results_per_msm, c);
    CHK_IF_RETURN(cudaFreeAsync(final_results, stream));

    if (!are_results_on_device)
@@ -829,13 +876,7 @@ namespace msm {
    const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize;
    cudaStream_t& stream = config.ctx.stream;

    unsigned c = config.batch_size > 1 ? ((config.c == 0) ? get_optimal_c(msm_size) : config.c) : 16;
    // reduce c to closest power of two (from below) if not using big_triangle reduction logic
    // TODO: support arbitrary values of c
    if (!config.is_big_triangle) {
      while ((c & (c - 1)) != 0)
        c &= (c - 1);
    }
    unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;

    return CHK_STICKY(bucket_method_msm(
      bitsize, c, scalars, points, config.batch_size, msm_size,
@@ -846,7 +887,33 @@ namespace msm {
  }

  template <typename A, typename P>
  cudaError_t precompute_msm_bases(
  cudaError_t precompute_msm_points(A* points, int msm_size, MSMConfig& config, A* output_points)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t& stream = config.ctx.stream;
    unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;

    CHK_IF_RETURN(cudaMemcpyAsync(
      output_points, points, sizeof(A) * config.points_size,
      config.are_points_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream));

    unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1;
    unsigned shift = c * ((total_nof_bms - 1) / config.precompute_factor + 1);

    unsigned NUM_THREADS = 1 << 8;
    unsigned NUM_BLOCKS = (config.points_size + NUM_THREADS - 1) / NUM_THREADS;
    for (int i = 1; i < config.precompute_factor; i++) {
      left_shift_kernel<A, P><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
        &output_points[(i - 1) * config.points_size], shift, config.points_size,
        &output_points[i * config.points_size]);
    }

    return CHK_LAST();
  }

  template <typename A, typename P>
  [[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
    A* bases,
    int bases_size,
    int precompute_factor,
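A worked example of the shift computation above, assuming a 254-bit scalar field with c = 15 and precompute_factor = 4 (illustrative numbers):

\[ \mathrm{total\_nof\_bms} = \left\lfloor \frac{254 - 1}{15} \right\rfloor + 1 = 17, \qquad \mathrm{shift} = 15 \cdot \left( \left\lfloor \frac{17 - 1}{4} \right\rfloor + 1 \right) = 15 \cdot 5 = 75, \]

so the output buffer holds \( P_i,\; 2^{75}P_i,\; 2^{150}P_i,\; 2^{225}P_i \), each block produced from the previous one by left_shift_kernel.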
@@ -1,3 +1,9 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include "msm.cu"

#include <chrono>
@@ -9,7 +15,7 @@
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

using namespace bn254;
// using namespace bn254;

class Dummy_Scalar
{
@@ -111,20 +117,34 @@ public:

// switch between dummy and real:

typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;

typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;

int main()
int main(int argc, char** argv)
{
  int batch_size = 1;
  cudaEvent_t start, stop;
  float msm_time;

  int msm_log_size = (argc > 1) ? atoi(argv[1]) : 17;
  int msm_size = 1 << msm_log_size;
  int batch_size = (argc > 2) ? atoi(argv[2]) : 4;
  // unsigned msm_size = 1<<21;
  int msm_size = 12180757;
  int N = batch_size * msm_size;
  int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1;
  int user_c = (argc > 4) ? atoi(argv[4]) : 15;

  printf(
    "running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n", CURVE_ID, msm_log_size, batch_size,
    precomp_factor, user_c);

  test_scalar* scalars = new test_scalar[N];
  test_affine* points = new test_affine[N];
@@ -136,7 +156,8 @@ int main()

  // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
  // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
  test_projective large_res[2];
  test_projective res[batch_size];
  test_projective ref[batch_size];
  // test_projective batched_large_res[batch_size];
  // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
  // fake_point batched_large_res[256];
@@ -149,13 +170,17 @@ int main()

  test_scalar* scalars_d;
  test_affine* points_d;
  test_projective* large_res_d;
  test_affine* precomp_points_d;
  test_projective* res_d;
  test_projective* ref_d;

  cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
  cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
  cudaMalloc(&large_res_d, sizeof(test_projective));
  cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
  cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
  cudaMalloc(&scalars_d, sizeof(test_scalar) * N);
  cudaMalloc(&points_d, sizeof(test_affine) * N);
  cudaMalloc(&precomp_points_d, sizeof(test_affine) * N * precomp_factor);
  cudaMalloc(&res_d, sizeof(test_projective) * batch_size);
  cudaMalloc(&ref_d, sizeof(test_projective) * batch_size);
  cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(points_d, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice);

  std::cout << "finished copying" << std::endl;
@@ -170,65 +195,93 @@ int main()
    0, // mempool
  };
  msm::MSMConfig config = {
    ctx,   // DeviceContext
    0,     // points_size
    1,     // precompute_factor
    0,     // c
    0,     // bitsize
    10,    // large_bucket_factor
    1,     // batch_size
    false, // are_scalars_on_device
    false, // are_scalars_montgomery_form
    false, // are_points_on_device
    false, // are_points_montgomery_form
    true,  // are_results_on_device
    false, // is_big_triangle
    true,  // is_async
    ctx,            // DeviceContext
    N,              // points_size
    precomp_factor, // precompute_factor
    user_c,         // c
    0,              // bitsize
    10,             // large_bucket_factor
    batch_size,     // batch_size
    false,          // are_scalars_on_device
    false,          // are_scalars_montgomery_form
    true,           // are_points_on_device
    false,          // are_points_montgomery_form
    true,           // are_results_on_device
    false,          // is_big_triangle
    true,           // is_async
    // false, // segments_reduction
  };

  auto begin1 = std::chrono::high_resolution_clock::now();
  msm::msm<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
  cudaEvent_t msm_end_event;
  cudaEventCreate(&msm_end_event);
  auto end1 = std::chrono::high_resolution_clock::now();
  auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  if (precomp_factor > 1)
    msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);

  // warm up
  msm::msm<test_scalar, test_affine, test_projective>(
    scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
  cudaDeviceSynchronize();

  // auto begin1 = std::chrono::high_resolution_clock::now();
  cudaEventRecord(start, stream);
  msm::msm<test_scalar, test_affine, test_projective>(
    scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
  cudaEventRecord(stop, stream);
  cudaStreamSynchronize(stream);
  cudaEventElapsedTime(&msm_time, start, stop);
  // cudaEvent_t msm_end_event;
  // cudaEventCreate(&msm_end_event);
  // auto end1 = std::chrono::high_resolution_clock::now();
  // auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("msm time : %.3f ms.\n", msm_time);

  // reference
  config.c = 16;
  config.precompute_factor = 1;
  config.is_big_triangle = true;
  config.are_results_on_device = false;
  cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
  std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1])
            << std::endl;
  auto begin = std::chrono::high_resolution_clock::now();
  msm::msm<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
  config.batch_size = 1;
  config.points_size = msm_size;
  // config.segments_reduction = false;
  for (int i = 0; i < batch_size; i++) {
    msm::msm<test_scalar, test_affine, test_projective>(
      scalars + i * msm_size, points_d + i * msm_size, msm_size, config, ref_d + i);
  }

  // config.are_results_on_device = false;
  // std::cout << test_projective::to_affine(large_res[0]) << std::endl;
  // auto begin = std::chrono::high_resolution_clock::now();
  // msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
  // test_reduce_triangle(scalars);
  // test_reduce_rectangle(scalars);
  // test_reduce_single(scalars);
  // test_reduce_var(scalars);
  auto end = std::chrono::high_resolution_clock::now();
  auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
  // auto end = std::chrono::high_resolution_clock::now();
  // auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  // printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);

  // std::cout << test_projective::to_affine(large_res[0]) << std::endl;

  cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
  cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);

  // reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);

  // std::cout<<"final results batched large"<<std::endl;
  // bool success = true;
  // for (unsigned i = 0; i < batch_size; i++)
  // {
  //   std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
  //   if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
  //     std::cout<<"good"<<std::endl;
  //   }
  //   else{
  //     std::cout<<"miss"<<std::endl;
  //     std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
  //     success = false;
  //   }
  // }
  // if (success){
  //   std::cout<<"success!"<<std::endl;
  // }
  bool success = true;
  for (unsigned i = 0; i < batch_size; i++) {
    std::cout << test_projective::to_affine(res[i]) << std::endl;
    if (test_projective::to_affine(res[i]) == test_projective::to_affine(ref[i])) {
      std::cout << "good" << std::endl;
    } else {
      std::cout << "miss" << std::endl;
      std::cout << test_projective::to_affine(ref[i]) << std::endl;
      success = false;
    }
  }
  if (success) { std::cout << "success!" << std::endl; }

  // std::cout<<batched_large_res[0]<<std::endl;
  // std::cout<<batched_large_res[1]<<std::endl;
@@ -239,4 +292,4 @@ int main()
  // std::cout<<pr<<std::endl;

  return 0;
}
}
@@ -116,13 +116,13 @@ func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfi
    return scalars.AsUnsafePointer(), points.AsUnsafePointer(), results.AsUnsafePointer(), size, unsafe.Pointer(cfg)
}

func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
func PrecomputePointsCheck(points HostOrDeviceSlice, cfg *MSMConfig, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
    outputBasesLength, pointsLength := outputBases.Len(), points.Len()
    if outputBasesLength != pointsLength*int(precomputeFactor) {
    if outputBasesLength != pointsLength*int(cfg.PrecomputeFactor) {
        errorString := fmt.Sprintf(
            "Precompute factor is probably incorrect: expected %d but got %d",
            outputBasesLength/pointsLength,
            precomputeFactor,
            cfg.PrecomputeFactor,
        )
        panic(errorString)
    }
@@ -131,5 +131,8 @@ func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outp
        points.(DeviceSlice).CheckDevice()
    }

    cfg.pointsSize = int32(pointsLength)
    cfg.arePointsOnDevice = points.IsOnDevice()

    return points.AsUnsafePointer(), outputBases.AsUnsafePointer()
}
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bls12_377_g2_msm_cuda(const scalar_t* scalars, const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_377_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_377_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
    return err
}

// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
    cfg := G2GetDefaultMSMConfig()
    cfg.PrecomputeFactor = precomputeFactor
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

    cPoints := (*C.g2_affine_t)(pointsPointer)
    cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
    err := (cr.CudaError)(__ret)
    return err
}

func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

    cPoints := (*C.g2_affine_t)(pointsPointer)
    cMsmSize := (C.int)(msmSize)
    cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
    cOutputBases := (*C.g2_affine_t)(outputBasesPointer)

    __ret := C.bls12_377_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
    err := (cr.CudaError)(__ret)
    return err
}
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bls12_377_msm_cuda(const scalar_t* scalars, const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_377_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_377_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
    return err
}

// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
    cfg := GetDefaultMSMConfig()
    cfg.PrecomputeFactor = precomputeFactor
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

    cPoints := (*C.affine_t)(pointsPointer)
    cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
    err := (cr.CudaError)(__ret)
    return err
}

func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

    cPoints := (*C.affine_t)(pointsPointer)
    cMsmSize := (C.int)(msmSize)
    cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
    cOutputBases := (*C.affine_t)(outputBasesPointer)

    __ret := C.bls12_377_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
    err := (cr.CudaError)(__ret)
    return err
}
@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
    }
}

func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
    cfg := g2.G2GetDefaultMSMConfig()
    const precomputeFactor = 8
    cfg.PrecomputeFactor = precomputeFactor

    for _, power := range []int{10, 16} {
        for _, batchSize := range []int{1, 3, 16} {
            size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {

            var precomputeOut core.DeviceSlice
            _, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

            e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
            assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
            e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
            assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

            var p g2.G2Projective
            var out core.DeviceSlice
            _, e = out.Malloc(batchSize*p.Size(), p.Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

            cfg.PrecomputeFactor = precomputeFactor
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

            e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
            assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
            assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
            outHost := make(core.HostSlice[g2.G2Projective], batchSize)
            outHost.CopyFromDevice(&out)
            out.Free()

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
    }
}

func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
    cfg := msm.GetDefaultMSMConfig()
    const precomputeFactor = 8
    cfg.PrecomputeFactor = precomputeFactor

    for _, power := range []int{10, 16} {
        for _, batchSize := range []int{1, 3, 16} {
            size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {

            var precomputeOut core.DeviceSlice
            _, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

            e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
            assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
            e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
            assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

            var p icicleBls12_377.Projective
            var out core.DeviceSlice
            _, e = out.Malloc(batchSize*p.Size(), p.Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

            cfg.PrecomputeFactor = precomputeFactor
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

            e = msm.Msm(scalars, precomputeOut, &cfg, out)
            assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
            assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
            outHost := make(core.HostSlice[icicleBls12_377.Projective], batchSize)
            outHost.CopyFromDevice(&out)
            out.Free()
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bls12_381_g2_msm_cuda(const scalar_t* scalars, const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_381_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_381_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
    return err
}

// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
    cfg := G2GetDefaultMSMConfig()
    cfg.PrecomputeFactor = precomputeFactor
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

    cPoints := (*C.g2_affine_t)(pointsPointer)
    cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
    err := (cr.CudaError)(__ret)
    return err
}

func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

    cPoints := (*C.g2_affine_t)(pointsPointer)
    cMsmSize := (C.int)(msmSize)
    cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
    cOutputBases := (*C.g2_affine_t)(outputBasesPointer)

    __ret := C.bls12_381_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
    err := (cr.CudaError)(__ret)
    return err
}
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bls12_381_msm_cuda(const scalar_t* scalars, const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_381_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_381_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
    return err
}

// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
    cfg := GetDefaultMSMConfig()
    cfg.PrecomputeFactor = precomputeFactor
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

    cPoints := (*C.affine_t)(pointsPointer)
    cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
    err := (cr.CudaError)(__ret)
    return err
}

func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
    pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

    cPoints := (*C.affine_t)(pointsPointer)
    cMsmSize := (C.int)(msmSize)
    cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
    cOutputBases := (*C.affine_t)(outputBasesPointer)

    __ret := C.bls12_381_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
    err := (cr.CudaError)(__ret)
    return err
}
@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
    }
}

func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
    cfg := g2.G2GetDefaultMSMConfig()
    const precomputeFactor = 8
    cfg.PrecomputeFactor = precomputeFactor

    for _, power := range []int{10, 16} {
        for _, batchSize := range []int{1, 3, 16} {
            size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {

            var precomputeOut core.DeviceSlice
            _, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

            e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
            assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
            e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
            assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

            var p g2.G2Projective
            var out core.DeviceSlice
            _, e = out.Malloc(batchSize*p.Size(), p.Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

            cfg.PrecomputeFactor = precomputeFactor
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

            e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
            assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
            assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
            outHost := make(core.HostSlice[g2.G2Projective], batchSize)
            outHost.CopyFromDevice(&out)
            out.Free()

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
    }
}

func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
    cfg := msm.GetDefaultMSMConfig()
    const precomputeFactor = 8
    cfg.PrecomputeFactor = precomputeFactor

    for _, power := range []int{10, 16} {
        for _, batchSize := range []int{1, 3, 16} {
            size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {

            var precomputeOut core.DeviceSlice
            _, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

            e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
            assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
            e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
            assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

            var p icicleBls12_381.Projective
            var out core.DeviceSlice
            _, e = out.Malloc(batchSize*p.Size(), p.Size())
            assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

            cfg.PrecomputeFactor = precomputeFactor
            assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

            e = msm.Msm(scalars, precomputeOut, &cfg, out)
            assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
            assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
            outHost := make(core.HostSlice[icicleBls12_381.Projective], batchSize)
            outHost.CopyFromDevice(&out)
            out.Free()
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
|
||||
|
||||
cudaError_t bn254_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
|
||||
cudaError_t bn254_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
|
||||
cudaError_t bn254_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}

// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}

func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)

__ret := C.bn254_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}
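For callers migrating off the deprecated entry point, the change is mechanical: the precompute factor moves into the MSM config, and the size of a single MSM replaces the old (precomputeFactor, c, ctx) arguments. A minimal sketch, assuming `points`, `scalars`, `precomputeOut` and `out` are prepared as in the tests in this commit, and `msmSize` stands for the size of one MSM in the batch:

// Before (deprecated):
//   e := g2.G2PrecomputeBases(points, 8, 0, &cfg.Ctx, precomputeOut)
// After: set the factor on the config, and reuse the same config for the
// MSM itself so the precomputation and the MSM agree on the window setup.
cfg := g2.G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = 8
e := g2.G2PrecomputePoints(points, msmSize, &cfg, precomputeOut)
if e == cr.CudaSuccess {
	e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
}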

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bn254_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bn254_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bn254_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}

// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}

func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)

__ret := C.bn254_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}

func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}

func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p icicleBn254.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBn254.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bw6_761_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bw6_761_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bw6_761_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}

// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}

func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)

__ret := C.bw6_761_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t bw6_761_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bw6_761_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bw6_761_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}

// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}

func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)

__ret := C.bw6_761_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

@@ -170,9 +170,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}

func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBaseG2(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}

func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p icicleBw6_761.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBw6_761.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t grumpkin_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t grumpkin_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t grumpkin_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);

#ifdef __cplusplus
}

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}

// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}

func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)

__ret := C.grumpkin_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

@@ -64,9 +64,11 @@ func TestMSMBatch(t *testing.T) {
}
}

func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -76,20 +78,18 @@ func TestPrecomputeBase(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p icicleGrumpkin.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleGrumpkin.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -28,8 +28,13 @@ func {{.CurvePrefix}}Msm(scalars core.HostOrDeviceSlice, points core.HostOrDevic
return err
}

// Deprecated: {{.CurvePrefix}}PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
// {{.CurvePrefix}}PrecomputePoints should be used instead.
func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := {{.CurvePrefix}}GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)

cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFa
err := (cr.CudaError)(__ret)
return err
}

func {{.CurvePrefix}}PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)

cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.{{toCName .CurvePrefix}}affine_t)(outputBasesPointer)

__ret := C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;

cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_msm_cuda(const scalar_t* scalars,const {{toCName .CurvePrefix}}affine_t* points, int count, MSMConfig* config, {{toCName .CurvePrefix}}projective_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_bases_cuda({{toCName .CurvePrefix}}affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, {{toCName .CurvePrefix}}affine_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda({{toCName .CurvePrefix}}affine_t* points, int msm_size, MSMConfig* config, {{toCName .CurvePrefix}}affine_t* out);

#ifdef __cplusplus
}

@@ -251,9 +251,11 @@ func TestMSM{{.CurvePrefix}}Batch(t *testing.T) {
}
}

func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {
func TestPrecomputePoints{{.CurvePrefix}}(t *testing.T) {
cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor

for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -263,20 +265,18 @@ func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {

var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")

e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")

var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")

cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")

e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

@@ -105,6 +105,13 @@ pub trait MSM<C: Curve> {
ctx: &DeviceContext,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;

fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;
}

/// Computes the multi-scalar multiplication, or MSM: `s1*P1 + s2*P2 + ... + sn*Pn`, or a batch of several MSMs.
@@ -188,7 +195,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `bases` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
/// * `points` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `precompute_factor` - The number of total precomputed points for each base (including the base itself).
///
@@ -201,6 +208,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
#[deprecated(since = "2.5.0", note = "Please use `precompute_points` instead")]
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
precompute_factor: i32,
@@ -220,6 +228,55 @@ pub fn precompute_bases<C: Curve + MSM<C>>(
C::precompute_bases_unchecked(points, precompute_factor, _c, ctx, output_bases)
}

/// A function that precomputes MSM bases by extending them with their shifted copies.
/// e.g.:
/// Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `points` - Base points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `msm_size` - The size of a single MSM to compute.
///
/// * `cfg` - config used to specify extra arguments of the MSM. NOTE: You should update the precompute_factor
/// and potentially the c value inside this config. This config should be the same config used in msm.
///
/// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
pub fn precompute_points<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()> {
let precompute_factor = cfg.precompute_factor;
assert_eq!(
output_bases.len(),
points.len() * (precompute_factor as usize),
"Precompute factor is probably incorrect: expected {} but got {}",
output_bases.len() / points.len(),
precompute_factor
);
assert!(output_bases.is_on_device());

let ctx_device_id = cfg
.ctx
.device_id;
if let Some(device_id) = points.device_id() {
assert_eq!(
device_id, ctx_device_id,
"Device ids in points and context are different"
);
}
check_device(ctx_device_id);
let mut local_cfg = cfg.clone();
local_cfg.points_size = points.len() as i32;
local_cfg.are_points_on_device = points.is_on_device();

C::precompute_points_unchecked(points, msm_size, &local_cfg, output_bases)
}
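To make the doc comment above concrete, here is an illustrative reading (consistent with the assert in `precompute_points`, not itself part of the patch): for `n` input points and `cfg.precompute_factor = k`, the output buffer is laid out as `k` blocks of `n` points,

\f$ [\, P_0, \dots, P_{n-1},\; 2^{l}P_0, \dots, 2^{l}P_{n-1},\; \dots,\; 2^{(k-1)l}P_0, \dots, 2^{(k-1)l}P_{n-1} \,] \f$

so `output_bases.len()` must equal \f$ n \cdot k \f$, which is exactly what the `assert_eq!` checks. The shift \f$ l \f$ depends on the window size `c`, which is why the same config (same `c`, same `precompute_factor`) has to be passed to the subsequent `msm` call.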

#[macro_export]
macro_rules! impl_msm {
(
@@ -250,6 +307,14 @@ macro_rules! impl_msm {
ctx: &DeviceContext,
output_bases: *mut Affine<$curve>,
) -> CudaError;

#[link_name = concat!($curve_prefix, "_precompute_msm_points_cuda")]
pub(crate) fn precompute_points_cuda(
points: *const Affine<$curve>,
msm_size: i32,
cfg: &MSMConfig,
output_bases: *mut Affine<$curve>,
) -> CudaError;
}
}

@@ -292,6 +357,23 @@ macro_rules! impl_msm {
.wrap()
}
}

fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<$curve>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<$curve>>,
) -> IcicleResult<()> {
unsafe {
$curve_prefix_indent::precompute_points_cuda(
points.as_ptr(),
msm_size,
cfg,
output_bases.as_mut_ptr(),
)
.wrap()
}
}
}
};
}
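One note on the FFI wiring above: the `link_name` attribute is built with `concat!`, so for a curve instantiated with, say, `$curve_prefix = "bn254_g2"` (an illustrative value), `precompute_points_cuda` links against the `bn254_g2_precompute_msm_points_cuda` symbol declared in the C header earlier in this commit; each curve crate therefore binds to its own CUDA entry point without any per-curve Rust code.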

@@ -363,7 +445,7 @@ macro_rules! impl_msm_bench {
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);

use icicle_core::msm::precompute_bases;
use icicle_core::msm::precompute_points;
use icicle_core::msm::tests::generate_random_affine_points_with_zeroes;
use icicle_cuda_runtime::stream::CudaStream;

@@ -396,11 +478,11 @@ macro_rules! impl_msm_bench {
let points = generate_random_affine_points_with_zeroes(test_size, 10);
for precompute_factor in [1, 4, 8] {
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();
@@ -419,7 +501,6 @@ macro_rules! impl_msm_bench {
let scalars_h = HostSlice::from_slice(&scalars);

let mut msm_results = DeviceVec::<Projective<C>>::cuda_malloc(batch_size).unwrap();
cfg.precompute_factor = precompute_factor as i32;

let bench_descr = format!(
" {} x {} with precomp = {:?}",

@@ -1,5 +1,5 @@
use crate::curve::{Affine, Curve, Projective};
use crate::msm::{msm, precompute_bases, MSMConfig, MSM};
use crate::msm::{msm, precompute_points, MSMConfig, MSM};
use crate::traits::{FieldImpl, GenerateRandom};
use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
@@ -120,16 +120,16 @@ where
cfg.is_async = true;
cfg.large_bucket_factor = 5;
cfg.c = 4;
let precompute_factor = 8;
warmup(&stream).unwrap();
for test_size in test_sizes {
let precompute_factor = 8;
let points = generate_random_affine_points_with_zeroes(test_size, 10);
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();