Merge branch 'dev' into temp/stas/muli-gpu-example

Stas authored 2024-02-20 19:07:48 -06:00, committed by GitHub
15 changed files with 374 additions and 273 deletions

View File

@@ -16,10 +16,11 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s
// Include NTT template
#include "appUtils/ntt/ntt.cu"
using namespace curve_config;
using namespace ntt;
// Configure NTT
ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
NTTConfig<S> config=DefaultNTTConfig<S>();
// Call NTT
ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
```
## Running the example
@@ -28,5 +29,10 @@ ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the size of the example
2. Initialize input
3. Run Radix2 NTT
4. Run MixedRadix NTT
5. Validate the data output
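A minimal sketch of these steps (assuming `S` and `E` are both `scalar_t` as in the example, that `input` and `output` are host buffers of `ntt_size` elements, and omitting error handling) could look like:
```
// 1. Define the size of the example
const unsigned log_ntt_size = 20;
const unsigned ntt_size = 1 << log_ntt_size;
// 2. Initialize input (fill `input` with field elements)
// 3-4. Initialize the domain, then run both algorithms on the same input
auto ctx = device_context::get_default_device_context();
InitDomain(S::omega(log_ntt_size), ctx);
NTTConfig<S> config = DefaultNTTConfig<S>();
config.ntt_algorithm = NttAlgorithm::Radix2;
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
config.ntt_algorithm = NttAlgorithm::MixedRadix;
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
// 5. Validate the data output (e.g. compare each run against a reference implementation)
```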

View File

@@ -7,6 +7,7 @@
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
using namespace curve_config;
using namespace ntt;
// Operate on scalars
typedef scalar_t S;
@@ -58,6 +59,11 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
return nof_errors;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
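// Usage (as in main() below): START_TIMER(MixedRadix); ... END_TIMER(MixedRadix, "MixedRadix NTT");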
int main(int argc, char* argv[])
{
std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
@@ -78,24 +84,30 @@ int main(int argc, char* argv[])
output = (E*)malloc(sizeof(E) * batch_size);
std::cout << "Running NTT with on-host data" << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
// Create a device context
auto ctx = device_context::get_default_device_context();
// the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}};
ntt::InitDomain(rou, ctx);
const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
InitDomain(basic_root, ctx);
// Create an NTTConfig instance
ntt::NTTConfig<S> config = ntt::DefaultNTTConfig<S>();
NTTConfig<S> config = DefaultNTTConfig<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
config.ctx.stream = stream;
auto begin0 = std::chrono::high_resolution_clock::now();
cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
auto end0 = std::chrono::high_resolution_clock::now();
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
START_TIMER(MixedRadix);
cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
cudaStreamDestroy(stream);
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
std::cout << "Cleaning-up memory" << std::endl;
free(input);
free(output);
return 0;

View File

@@ -1,4 +1,4 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -I. -I../.. tests/msm_test.cu
nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu
work/test_msm

View File

@@ -30,6 +30,53 @@ namespace msm {
unsigned get_optimal_c(int bitsize) { return max((unsigned)ceil(log2(bitsize)) - 4, 1U); }
template <typename E>
__global__ void normalize_kernel(E* inout, E factor, int n)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) inout[tid] = (inout[tid] + factor - 1) / factor;
}
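// i.e. an element-wise ceiling division, inout[tid] = ceil(inout[tid] / factor); used in the large-bucket
// path below to rescale bucket sizes and prefix sums after reduction steps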
// a kernel that writes to bucket_indices, which enables large-bucket accumulation to happen afterwards.
// specifically, we map thread indices to the buckets that those threads will handle during accumulation.
template <typename P>
__global__ void initialize_large_bucket_indices(
unsigned* sorted_bucket_sizes_sum,
unsigned nof_pts_per_thread,
unsigned nof_large_buckets,
// log_nof_large_buckets should be equal to ceil(log2(nof_large_buckets))
unsigned log_nof_large_buckets,
unsigned* bucket_indices)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= nof_large_buckets) { return; }
unsigned start = (sorted_bucket_sizes_sum[tid] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid;
unsigned end = (sorted_bucket_sizes_sum[tid + 1] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid + 1;
for (unsigned i = start; i < end; i++) {
// this just concatenates two pieces of data - large bucket index and (i - start)
bucket_indices[i] = tid | ((i - start) << log_nof_large_buckets);
}
}
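// illustrative example of the packed format above: with log_nof_large_buckets = 4, segment 3 of large bucket 5
// is stored as 5 | (3 << 4) = 0x35; accumulate_large_buckets_kernel later recovers segment = 0x35 >> 4 = 3 and
// bucket = 0x35 & 0xF = 5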
// this function provides a single step of reduction across buckets whose sizes
// are given by the bucket_sizes pointer
template <typename P>
__global__ void sum_reduction_variable_size_kernel(
P* v,
unsigned* bucket_sizes_sum,
unsigned* bucket_sizes,
unsigned* large_bucket_thread_indices,
unsigned num_of_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
unsigned large_bucket_tid = large_bucket_thread_indices[tid];
unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
unsigned large_bucket_size = bucket_sizes[large_bucket_tid];
if (segment_ind < (large_bucket_size >> 1)) { v[tid] = v[tid] + v[tid + ((large_bucket_size + 1) >> 1)]; }
}
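// each call halves every segment: a thread whose position i within its segment satisfies i < floor(len / 2)
// absorbs the element at i + ceil(len / 2); repeated calls (see the reduction loop further below, which also
// rescales the sizes via normalize_kernel) shrink each large bucket to a single point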
template <typename P>
__global__ void single_stage_multi_reduction_kernel(
const P* v,
@@ -37,23 +84,27 @@ namespace msm {
unsigned block_size,
unsigned write_stride,
unsigned write_phase,
unsigned padding,
unsigned step,
unsigned num_of_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
// we need a shifted tid because we don't want to be reducing into zero buckets; this allows us to skip them.
// for write_phase==1, the read pattern is different, so we don't skip over anything.
const int shifted_tid = write_phase ? tid : tid + (tid + step) / step;
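// e.g. with step = 3 the mapping is tid 0,1,2,3,4,... -> 1,2,3,5,6,..., so every index that is a multiple of
// step + 1 (a zero bucket) is skipped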
const int jump = block_size / 2;
const int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
const int block_id = tid_p / jump;
const int block_tid = tid_p % jump;
const unsigned read_ind = block_size * block_id + block_tid;
const unsigned write_ind = tid;
const int block_id = shifted_tid / jump;
// here the reason for shifting is the same as for shifted_tid, but we skip over entire blocks; this happens
// only for write_phase=1 because of its read pattern.
const int shifted_block_id = write_phase ? block_id + (block_id + step) / step : block_id;
const int block_tid = shifted_tid % jump;
const unsigned read_ind = block_size * shifted_block_id + block_tid;
const unsigned write_ind = jump * shifted_block_id + block_tid;
const unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
: write_ind;
v_r[v_r_key] = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
: v[read_ind] + v[read_ind + jump];
v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
}
// this kernel performs single scalar multiplication
@@ -135,8 +186,8 @@ namespace msm {
buckets_indices[current_index] =
(msm_index << (c + bm_bitsize)) | (bm << c) |
bucket_index; // the bucket module number and the msm number are appended at the msbs
if (scalar == S::zero() || bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped
point_indices[current_index] = tid % points_size; // the point index is saved for later
if (bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped
point_indices[current_index] = tid % points_size; // the point index is saved for later
#endif
}
}
@@ -158,19 +209,6 @@ namespace msm {
if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; }
}
template <typename S>
__global__ void
find_max_size(unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, unsigned* largest_bucket_size)
{
for (int i = 0;; i++) {
if (single_bucket_indices[i] & ((1 << c) - 1)) {
largest_bucket_size[0] = bucket_sizes[i];
largest_bucket_size[1] = i;
break;
}
}
}
// this kernel adds up the points in each bucket
template <typename P, typename A>
__global__ void accumulate_buckets_kernel(
@@ -188,9 +226,6 @@ namespace msm {
// constexpr unsigned sign_mask = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets_to_compute) return;
if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) {
return; // skip zero buckets
}
#ifdef SIGNED_DIG // todo - fix
const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c;
@@ -198,7 +233,8 @@ namespace msm {
msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1));
#else
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
const unsigned single_bucket_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
unsigned bucket_index = msm_index * nof_buckets + single_bucket_index;
#endif
const unsigned bucket_offset = bucket_offsets[tid];
const unsigned bucket_size = bucket_sizes[tid];
@@ -215,7 +251,8 @@ namespace msm {
#else
A point = points[point_ind];
#endif
bucket = i ? bucket + point : P::from_affine(point);
bucket =
i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point));
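// points equal to the affine identity are skipped so that zero (padding) points do not corrupt the bucket sum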
}
buckets[bucket_index] = bucket;
}
@@ -225,43 +262,40 @@ namespace msm {
P* __restrict__ buckets,
unsigned* __restrict__ bucket_offsets,
unsigned* __restrict__ bucket_sizes,
unsigned* __restrict__ single_bucket_indices,
const unsigned* __restrict__ point_indices,
unsigned* __restrict__ large_bucket_thread_indices,
unsigned* __restrict__ point_indices,
A* __restrict__ points,
const unsigned nof_buckets,
const unsigned nof_buckets_to_compute,
const unsigned msm_idx_shift,
const unsigned c,
const unsigned threads_per_bucket,
const unsigned max_run_length)
const int points_per_thread,
// log_nof_buckets_to_compute should be equal to ceil(log(nof_buckets_to_compute))
const unsigned log_nof_buckets_to_compute,
const unsigned nof_threads)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned large_bucket_index = tid / threads_per_bucket;
unsigned bucket_segment_index = tid % threads_per_bucket;
if (tid >= nof_buckets_to_compute * threads_per_bucket) { return; }
if ((single_bucket_indices[large_bucket_index] & ((1 << c) - 1)) == 0) { // don't need
return; // skip zero buckets
}
unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index;
const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index * max_run_length;
const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index * max_run_length
? bucket_sizes[large_bucket_index] - bucket_segment_index * max_run_length
: 0;
if (tid >= nof_threads) return;
int bucket_segment_index = large_bucket_thread_indices[tid] >> log_nof_buckets_to_compute;
large_bucket_thread_indices[tid] &= ((1 << log_nof_buckets_to_compute) - 1);
int bucket_ind = large_bucket_thread_indices[tid];
const unsigned bucket_offset = bucket_offsets[bucket_ind] + bucket_segment_index * points_per_thread;
const unsigned bucket_size = max(0, (int)bucket_sizes[bucket_ind] - bucket_segment_index * points_per_thread);
P bucket;
unsigned run_length = min(bucket_size, max_run_length);
unsigned run_length = min(bucket_size, points_per_thread);
for (unsigned i = 0; i < run_length;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
A point = points[point_ind];
bucket = i ? bucket + point : P::from_affine(point); // init empty buckets
bucket =
i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point));
}
buckets[write_bucket_index] = run_length ? bucket : P::zero();
buckets[tid] = run_length ? bucket : P::zero();
}
template <typename P>
__global__ void distribute_large_buckets_kernel(
const P* large_buckets,
P* buckets,
const unsigned* sorted_bucket_sizes_sum,
const unsigned* single_bucket_indices,
const unsigned size,
const unsigned nof_buckets,
@@ -272,7 +306,8 @@ namespace msm {
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
buckets[bucket_index] = large_buckets[tid];
unsigned large_bucket_index = sorted_bucket_sizes_sum[tid] + tid;
buckets[bucket_index] = large_buckets[large_bucket_index];
}
// this kernel sums the entire bucket module
@@ -370,7 +405,6 @@ namespace msm {
}
S* d_scalars;
A* d_points;
if (!are_scalars_on_device) {
// copy scalars to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * nof_scalars, stream));
@@ -378,15 +412,7 @@ namespace msm {
} else {
d_scalars = scalars;
}
cudaStream_t stream_points;
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points));
if (!are_points_on_device) {
// copy points to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
} else {
d_points = points;
}
if (are_scalars_montgomery_form) {
if (are_scalars_on_device) {
S* d_mont_scalars;
@@ -396,6 +422,99 @@ namespace msm {
} else
CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_scalars));
}
unsigned nof_bms_per_msm = (bitsize + c - 1) / c;
unsigned* bucket_indices;
unsigned* point_indices;
unsigned* sorted_bucket_indices;
unsigned* sorted_point_indices;
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm));
// split scalars into digits
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices, point_indices, d_scalars, nof_scalars, nof_points, single_msm_size, nof_bms_per_msm, bm_bitsize,
c);
// sort indices - the indices are sorted from smallest to largest in order to group together the points that
// belong to each bucket
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
// more info
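// (As with the other CUB calls below, the first call with a null temp-storage pointer only computes the
// required temp-storage size; the actual sort is performed by the second call after the allocation.)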
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream));
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
// more info
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(point_indices, stream));
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size;
#ifdef SIGNED_DIG
const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits
#else
// minus nof_bms_per_msm because zero bucket is not included in each bucket module
const unsigned nof_buckets = (nof_bms_per_msm << c) - nof_bms_per_msm;
#endif
const unsigned total_nof_buckets = nof_buckets * batch_size;
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* nof_buckets_to_compute;
// +1 here and in other places because the zero index, corresponding to the zero bucket, is still present at this point
CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * (total_nof_buckets + 1), stream));
CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * (total_nof_buckets + 1), stream));
CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream));
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_indices, stream));
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * (total_nof_buckets + 1), stream));
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream));
CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream));
CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream));
A* d_points;
cudaStream_t stream_points;
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points));
if (!are_points_on_device) {
// copy points to gpu
CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points));
CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
} else {
d_points = points;
}
if (are_points_montgomery_form) {
if (are_points_on_device) {
A* d_mont_points;
@@ -412,111 +531,26 @@ namespace msm {
}
P* buckets;
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms_per_msm = (bitsize + c - 1) / c;
unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm));
unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size;
#ifdef SIGNED_DIG
const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits
#else
const unsigned nof_buckets = nof_bms_per_msm << c;
#endif
const unsigned total_nof_buckets = nof_buckets * batch_size;
CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream));
CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * (total_nof_buckets + nof_bms_in_batch), stream));
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);
unsigned* bucket_indices;
unsigned* point_indices;
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream));
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream));
// split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices + nof_scalars, point_indices + nof_scalars, d_scalars, nof_scalars, nof_points, single_msm_size,
nof_bms_per_msm, bm_bitsize, c); //+nof_scalars - leaving the first bm free for the out of place sort later
NUM_BLOCKS = (total_nof_buckets + nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets + nof_bms_in_batch);
// sort indices - the indices are sorted from smallest to largest in order to group together the points that
// belong to each bucket
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
// more info
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + nof_scalars, bucket_indices,
point_indices + nof_scalars, point_indices, nof_scalars, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream));
for (unsigned i = 0; i < nof_bms_per_msm; i++) {
unsigned offset_out = i * nof_scalars;
unsigned offset_in = offset_out + nof_scalars;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
// more info
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in,
bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, nof_scalars, 0,
sizeof(unsigned) * 8, stream));
}
CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream));
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* nof_buckets_to_compute;
CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream));
CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream));
CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream));
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream));
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream));
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream));
CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream));
// removing zero bucket, if it exists
unsigned smallest_bucket_index;
CHK_IF_RETURN(cudaMemcpyAsync(
&smallest_bucket_index, single_bucket_indices, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
// the zero bucket may be empty after all; in that case zero_bucket_offset is set to 0
unsigned zero_bucket_offset = (smallest_bucket_index == 0) ? 1 : 0;
// sort by bucket sizes
unsigned h_nof_buckets_to_compute;
CHK_IF_RETURN(cudaMemcpyAsync(
&h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
// if there are no buckets to compute (e.g. all scalars are zero), just return the zero point
if (h_nof_buckets_to_compute == 0) {
if (!are_results_on_device) {
for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) {
final_result[batch_element] = P::zero();
}
} else {
P* h_final_result = (P*)malloc(sizeof(P) * batch_size);
for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) {
h_final_result[batch_element] = P::zero();
}
CHK_IF_RETURN(
cudaMemcpyAsync(final_result, h_final_result, sizeof(P) * batch_size, cudaMemcpyHostToDevice, stream));
}
return CHK_LAST();
}
CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream));
h_nof_buckets_to_compute -= zero_bucket_offset;
unsigned* sorted_bucket_sizes;
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned) * h_nof_buckets_to_compute, stream));
@@ -525,13 +559,16 @@ namespace msm {
unsigned* sort_offsets_temp_storage{};
size_t sort_offsets_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0,
sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0,
sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaFreeAsync(sort_offsets_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream));
unsigned* sorted_single_bucket_indices;
CHK_IF_RETURN(
@@ -539,92 +576,128 @@ namespace msm {
unsigned* sort_single_temp_storage{};
size_t sort_single_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes,
single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8,
stream));
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices,
h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream));
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes,
single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8,
stream));
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices,
h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
CHK_IF_RETURN(cudaFreeAsync(sort_single_temp_storage, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream));
CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream));
// find large buckets
unsigned avarage_size = single_msm_size / (1 << c);
unsigned bucket_th = large_bucket_factor * avarage_size;
unsigned average_bucket_size = single_msm_size / (1 << c);
// how large a bucket must be to qualify as a "large bucket"
unsigned bucket_th = large_bucket_factor * average_bucket_size;
unsigned* nof_large_buckets;
CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream));
CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned)));
unsigned TOTAL_THREADS = 129000; // todo - device dependent
unsigned TOTAL_THREADS = 129000; // TODO: device dependent
unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS);
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length;
NUM_THREADS = min(1 << 5, cutoff_nof_runs);
NUM_THREADS = 1 << 5;
NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
find_cutoff_kernel<S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets);
if (h_nof_buckets_to_compute > 0 && bucket_th > 0)
find_cutoff_kernel<S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets);
unsigned h_nof_large_buckets;
CHK_IF_RETURN(
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
unsigned* max_res;
CHK_IF_RETURN(cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream));
find_max_size<S><<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, max_res);
unsigned h_max_res[2];
CHK_IF_RETURN(cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream));
unsigned h_largest_bucket_size = h_max_res[0];
unsigned h_nof_zero_large_buckets = h_max_res[1];
unsigned large_buckets_to_compute =
h_nof_large_buckets > h_nof_zero_large_buckets ? h_nof_large_buckets - h_nof_zero_large_buckets : 0;
CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream));
if (!are_points_on_device || are_points_montgomery_form) {
// by this point, points need to be already uploaded and un-Montgomeried
CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_points_uploaded));
CHK_IF_RETURN(cudaEventDestroy(event_points_uploaded));
CHK_IF_RETURN(cudaStreamDestroy(stream_points));
}
cudaStream_t stream_large_buckets;
cudaEvent_t event_large_buckets_accumulated;
P* large_buckets;
if (large_buckets_to_compute > 0 && bucket_th > 0) {
// this is where handling of large buckets happens (if there are any)
if (h_nof_large_buckets > 0 && bucket_th > 0) {
CHK_IF_RETURN(cudaStreamCreate(&stream_large_buckets));
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_large_buckets_accumulated, cudaEventDisableTiming));
unsigned threads_per_bucket =
1 << (unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); // global param
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
unsigned total_large_buckets_size = large_buckets_to_compute * threads_per_bucket;
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * total_large_buckets_size, stream));
unsigned* sorted_bucket_sizes_sum;
CHK_IF_RETURN(cudaMallocAsync(
&sorted_bucket_sizes_sum, sizeof(unsigned) * (h_nof_large_buckets + 1), stream_large_buckets));
CHK_IF_RETURN(cudaMemsetAsync(sorted_bucket_sizes_sum, 0, sizeof(unsigned), stream_large_buckets));
unsigned* large_bucket_temp_storage{};
size_t large_bucket_temp_storage_bytes = 0;
CHK_IF_RETURN(cub::DeviceScan::InclusiveSum(
large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1,
h_nof_large_buckets, stream_large_buckets));
CHK_IF_RETURN(
cudaMallocAsync(&large_bucket_temp_storage, large_bucket_temp_storage_bytes, stream_large_buckets));
CHK_IF_RETURN(cub::DeviceScan::InclusiveSum(
large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1,
h_nof_large_buckets, stream_large_buckets));
CHK_IF_RETURN(cudaFreeAsync(large_bucket_temp_storage, stream_large_buckets));
unsigned h_nof_pts_in_large_buckets;
CHK_IF_RETURN(cudaMemcpyAsync(
&h_nof_pts_in_large_buckets, sorted_bucket_sizes_sum + h_nof_large_buckets, sizeof(unsigned),
cudaMemcpyDeviceToHost, stream_large_buckets));
unsigned h_largest_bucket;
CHK_IF_RETURN(cudaMemcpyAsync(
&h_largest_bucket, sorted_bucket_sizes, sizeof(unsigned), cudaMemcpyDeviceToHost, stream_large_buckets));
NUM_THREADS = min(1 << 8, total_large_buckets_size);
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
// the number of threads for large buckets has an extra h_nof_large_buckets term to account for bucket sizes
// that are not evenly divisible by average_bucket_size. There are similar corrections elsewhere when accessing
// large buckets
unsigned large_buckets_nof_threads =
(h_nof_pts_in_large_buckets + average_bucket_size - 1) / average_bucket_size + h_nof_large_buckets;
unsigned log_nof_large_buckets = (unsigned)ceil(log2(h_nof_large_buckets));
unsigned* large_bucket_indices;
CHK_IF_RETURN(cudaMallocAsync(&large_bucket_indices, sizeof(unsigned) * large_buckets_nof_threads, stream));
NUM_THREADS = min(1 << 8, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_large_bucket_indices<P><<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets, log_nof_large_buckets,
large_bucket_indices);
P* large_buckets;
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * large_buckets_nof_threads, stream_large_buckets));
NUM_THREADS = min(1 << 8, large_buckets_nof_threads);
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, sorted_bucket_offsets + h_nof_zero_large_buckets,
sorted_bucket_sizes + h_nof_zero_large_buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets,
point_indices, d_points, nof_buckets, large_buckets_to_compute, c + bm_bitsize, c, threads_per_bucket,
max_bucket_size_run_length);
large_buckets, sorted_bucket_offsets, sorted_bucket_sizes, large_bucket_indices, sorted_point_indices,
d_points, h_nof_large_buckets, c, average_bucket_size, log_nof_large_buckets, large_buckets_nof_threads);
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
// normalization is needed to update bucket sizes and offsets due to the reduction that has already taken place
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets);
// reduce
for (int s = total_large_buckets_size >> 1; s > large_buckets_to_compute - 1; s >>= 1) {
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, large_buckets, s * 2, 0, 0, 0, s);
for (int s = h_largest_bucket; s > 1; s = ((s + 1) >> 1)) {
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
sorted_bucket_sizes, s == h_largest_bucket ? average_bucket_size : 2, h_nof_large_buckets);
NUM_THREADS = min(MAX_TH, large_buckets_nof_threads);
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
sum_reduction_variable_size_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, sorted_bucket_sizes_sum, sorted_bucket_sizes, large_bucket_indices,
large_buckets_nof_threads);
}
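// after the loop each large bucket has been reduced to a single point located at the start of its segment
// (index sorted_bucket_sizes_sum[i] + i), which is what distribute_large_buckets_kernel reads below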
CHK_IF_RETURN(cudaFreeAsync(large_bucket_indices, stream_large_buckets));
// distribute
NUM_THREADS = min(MAX_TH, large_buckets_to_compute);
NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS;
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
large_buckets, buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, large_buckets_to_compute,
nof_buckets, c + bm_bitsize);
large_buckets, buckets, sorted_bucket_sizes_sum, sorted_single_bucket_indices, h_nof_large_buckets,
nof_buckets + nof_bms_per_msm, c + bm_bitsize);
CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream_large_buckets));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes_sum, stream_large_buckets));
CHK_IF_RETURN(cudaEventRecord(event_large_buckets_accumulated, stream_large_buckets));
CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets));
} else {
h_nof_large_buckets = 0;
}
// launch the accumulation kernel with maximum threads
@@ -633,13 +706,18 @@ namespace msm {
NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets,
sorted_single_bucket_indices + h_nof_large_buckets, point_indices, d_points, nof_buckets,
h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
sorted_single_bucket_indices + h_nof_large_buckets, sorted_point_indices, d_points,
nof_buckets + nof_bms_per_msm, h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
}
if (large_buckets_to_compute > 0 && bucket_th > 0)
CHK_IF_RETURN(cudaFreeAsync(sorted_point_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream));
if (h_nof_large_buckets > 0 && bucket_th > 0) {
// all the large buckets need to be accumulated before the final summation
CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_large_buckets_accumulated));
CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets));
}
#ifdef SSM_SUM
// sum each bucket
@@ -676,7 +754,7 @@ namespace msm {
unsigned source_bits_count = c;
// bool odd_source_c = source_bits_count % 2;
unsigned source_windows_count = nof_bms_per_msm;
unsigned source_buckets_count = nof_buckets;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
unsigned target_windows_count = 0;
P* source_buckets = buckets;
buckets = nullptr;
@@ -698,20 +776,19 @@ namespace msm {
for (unsigned j = 0; j < target_bits_count; j++) {
const bool is_first_iter = (j == 0);
const bool is_last_iter = (j == target_bits_count - 1);
unsigned nof_threads = (source_buckets_count >> (1 + j)) * batch_size;
unsigned nof_threads =
(((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
0 /*=padding*/, nof_threads);
(1 << target_bits_count) - 1, nof_threads);
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
0 /*=padding*/, nof_threads);
(1 << target_bits_count) - 1, nof_threads);
}
}
if (target_bits_count == 1) {
@@ -761,24 +838,10 @@ namespace msm {
cudaMemcpyAsync(final_result, d_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream));
// free memory
if (!are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream));
if (!are_points_on_device) CHK_IF_RETURN(cudaFreeAsync(d_points, stream));
if (!are_scalars_on_device || are_scalars_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream));
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_points, stream));
if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream));
CHK_IF_RETURN(cudaFreeAsync(buckets, stream));
#ifndef PHASE1_TEST
CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(point_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream));
CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream));
CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream));
#endif
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream));
CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream));
CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(max_res, stream));
if (large_buckets_to_compute > 0 && bucket_th > 0) CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream));
if (!is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));

View File

@@ -196,7 +196,7 @@ int main()
printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
config.is_big_triangle = true;
config.are_results_on_device = false;
// std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
std::cout << test_projective::to_affine(large_res[0]) << std::endl;
auto begin = std::chrono::high_resolution_clock::now();
msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
// test_reduce_triangle(scalars);

View File

@@ -121,7 +121,8 @@ namespace poseidon {
*/
// Stas: I have an issue with the number of arguments
template <typename S>
cudaError_t init_optimized_poseidon_constants(device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
cudaError_t
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
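// hypothetical call site for the updated signature (assuming a DeviceContext `ctx` and a PoseidonConstants<S>
// `constants` already exist): init_optimized_poseidon_constants(2 /*arity*/, ctx, &constants);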
/**
* Compute the poseidon hash over a sequence of preimages.

View File

@@ -11,6 +11,8 @@ public:
static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine ToMontgomery(const Affine& point)
{
return {FF::ToMontgomery(point.x), FF::ToMontgomery(point.y)};

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-core"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = ["Ingonyama"]
description = "A library for GPU ZK acceleration by Ingonyama"

View File

@@ -5,6 +5,8 @@ use crate::traits::{FieldImpl, MontgomeryConvertible};
use ark_ec::models::CurveConfig as ArkCurveConfig;
#[cfg(feature = "arkworks")]
use ark_ec::short_weierstrass::{Affine as ArkAffine, Projective as ArkProjective, SWCurveConfig};
#[cfg(feature = "arkworks")]
use ark_ec::AffineRepr;
use icicle_cuda_runtime::error::CudaError;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
use std::fmt::Debug;
@@ -147,13 +149,17 @@ where
type ArkEquivalent = ArkAffine<C::ArkSWConfig>;
fn to_ark(&self) -> Self::ArkEquivalent {
let proj_x = self
.x
.to_ark();
let proj_y = self
.y
.to_ark();
Self::ArkEquivalent::new_unchecked(proj_x, proj_y)
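// the affine identity is represented here as (0, 0), which is generally not a valid curve point, so it is
// mapped to the arkworks zero explicitly below rather than passed through new_unchecked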
if *self == Self::zero() {
Self::ArkEquivalent::zero()
} else {
let ark_x = self
.x
.to_ark();
let ark_y = self
.y
.to_ark();
Self::ArkEquivalent::new_unchecked(ark_x, ark_y)
}
}
fn from_ark(ark: Self::ArkEquivalent) -> Self {

View File

@@ -16,6 +16,15 @@ use ark_ec::VariableBaseMSM;
#[cfg(feature = "arkworks")]
use ark_std::{rand::Rng, test_rng, UniformRand};
fn generate_random_affine_points_with_zeroes<C: Curve>(size: usize, num_zeroes: usize) -> Vec<Affine<C>> {
let rng = &mut test_rng();
let mut points = C::generate_random_affine_points(size);
for _ in 0..num_zeroes {
points[rng.gen_range(0..size)] = Affine::<C>::zero();
}
points
}
pub fn check_msm<C: Curve + MSM<C>>()
where
<C::ScalarField as FieldImpl>::Config: GenerateRandom<C::ScalarField>,
@@ -36,7 +45,7 @@ where
let test_sizes = [4, 8, 16, 32, 64, 128, 256, 1000, 1 << 18];
let mut msm_results = HostOrDeviceSlice::cuda_malloc(1).unwrap();
for test_size in test_sizes {
let points = C::generate_random_affine_points(test_size);
let points = generate_random_affine_points_with_zeroes(test_size, 2);
let scalars = <C::ScalarField as FieldImpl>::Config::generate_random(test_size);
let points_ark: Vec<_> = points
.iter()
@@ -101,7 +110,7 @@ where
let batch_sizes = [1, 3, 1 << 4];
for test_size in test_sizes {
for batch_size in batch_sizes {
let points = C::generate_random_affine_points(test_size);
let points = generate_random_affine_points_with_zeroes(test_size, 10);
let scalars = <C::ScalarField as FieldImpl>::Config::generate_random(test_size * batch_size);
// a version of batched msm without using `cfg.points_size`, which requires copying the bases
let points_cloned: Vec<Affine<C>> = std::iter::repeat(points.clone())
@@ -123,6 +132,7 @@ where
cfg.ctx
.stream = &stream;
cfg.is_async = true;
cfg.large_bucket_factor = 2;
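// a smaller large_bucket_factor lowers the large-bucket threshold (bucket_th = large_bucket_factor *
// average_bucket_size), so the large-bucket path is exercised even at these test sizes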
msm(&scalars_h, &points_h, &cfg, &mut msm_results_1).unwrap();
msm(&scalars_h, &points_d, &cfg, &mut msm_results_2).unwrap();
@@ -170,15 +180,16 @@ where
C::ScalarField: ArkConvertible<ArkEquivalent = <C::ArkSWConfig as ArkCurveConfig>::ScalarField>,
C::BaseField: ArkConvertible<ArkEquivalent = <C::ArkSWConfig as ArkCurveConfig>::BaseField>,
{
let test_sizes = [1 << 6, 1000];
let test_threshold = 1 << 8;
let batch_sizes = [1, 3, 1 << 8];
let test_sizes = [1 << 10, 10000];
let test_threshold = 1 << 11;
let batch_sizes = [1, 3, 1 << 4];
let rng = &mut test_rng();
for test_size in test_sizes {
for batch_size in batch_sizes {
let points = C::generate_random_affine_points(test_size * batch_size);
let points = generate_random_affine_points_with_zeroes(test_size * batch_size, 100);
let mut scalars = vec![C::ScalarField::zero(); test_size * batch_size];
for _ in 0..(test_size * batch_size / 2) {
for _ in 0..(test_size * batch_size) {
scalars[rng.gen_range(0..test_size * batch_size)] = C::ScalarField::one();
}
for _ in test_threshold..test_size {

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-cuda-runtime"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "Ingonyama's Rust wrapper of CUDA runtime"

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-bls12-377"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "Rust wrapper for the CUDA implementation of BLS12-377 pairing friendly elliptic curve by Ingonyama"

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-bls12-381"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "Rust wrapper for the CUDA implementation of BLS12-381 pairing friendly elliptic curve by Ingonyama"

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-bn254"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "Rust wrapper for the CUDA implementation of BN254 pairing friendly elliptic curve by Ingonyama"

View File

@@ -1,6 +1,6 @@
[package]
name = "icicle-bw6-761"
version = "1.3.0"
version = "1.4.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "Rust wrapper for the CUDA implementation of BW6-761 pairing friendly elliptic curve by Ingonyama"