Compare commits

...

42 Commits

Author SHA1 Message Date
hadaringonyama
e80c42a0dd clean msm 2023-07-19 15:00:59 +03:00
hadaringonyama
3c8a1e6f31 fixed unsigned overflow 2023-07-18 12:52:11 +03:00
hadaringonyama
995af2b290 fixed unsigned overflow 2023-07-18 12:49:30 +03:00
hadaringonyama
b591301f64 number of large buckets fix, debug prints 2023-07-18 00:57:04 +03:00
hadaringonyama
b791381641 find_cutoff_kernel fix 2023-07-17 18:58:20 +03:00
hadaringonyama
00019166d1 streaming large buckets 2023-07-17 16:55:44 +03:00
hadaringonyama
4c0791eb98 small fix for size <2^16 2023-07-17 16:26:17 +03:00
hadaringonyama
dd4fa8e5f2 large buckets feature works (for relevant sizes) 2023-07-17 16:10:20 +03:00
hadaringonyama
659c883de2 adding cutoff kernel 2023-07-17 09:35:05 +03:00
hadaringonyama
8cc6e32fea sorting by size works 2023-07-13 22:46:50 +03:00
hadaringonyama
bfcfa3807c fix rust test 2023-07-13 13:14:09 +03:00
hadaringonyama
df0bdb82fb fix for msm with size not a power of 2 2023-07-13 12:56:25 +03:00
hadaringonyama
12eb53250c compue ones separately - works 2023-07-11 18:21:50 +03:00
hadaringonyama
d273bf6ffc zero scalars and zero buckets removed 2023-07-10 16:26:16 +03:00
hadaringonyama
61ddc32310 small fix to accumulation kernel 2023-07-09 18:21:54 +03:00
hadaringonyama
ac5047d0e4 ignore 2023-07-09 09:35:43 +03:00
hadaringonyama
ecb36fefeb passing rust test 2^24 with correctness, c=16 2023-07-06 13:13:06 +03:00
hadaringonyama
1785e793f1 Merge branch 'msm-fix16' into msm-performance 2023-07-06 10:38:07 +03:00
hadaringonyama
b39b529463 no device sync 2023-07-05 11:07:07 +03:00
hadaringonyama
86bee3af42 works without the sort - temp fix 2023-07-05 10:55:59 +03:00
hadaringonyama
dbd5cd4cbb fixing streams - WIP 2023-07-05 09:50:04 +03:00
hadaringonyama
a767c93564 any c -WIP 2023-07-05 09:01:06 +03:00
hadaringonyama
655f014dc2 merge with dev passes all tests 2023-07-02 16:13:51 +03:00
hadaringonyama
18c7cad89c Merge remote-tracking branch 'origin/dev' into msm-performance 2023-07-02 14:49:19 +03:00
hadaringonyama
1e9f628235 c=16 works 2023-07-02 12:07:46 +03:00
hadaringonyama
2e45ed1bd4 reduction - simple implementation, c=8 works 2023-06-29 19:12:53 +03:00
hadaringonyama
e828d1da2a c=8 works 2023-06-28 16:23:13 +03:00
hadaringonyama
233927668c triangle block mix fix 2023-06-28 12:31:38 +03:00
guy-ingo
1866df60f1 new kernels implementation, no correctness 2023-06-26 18:25:38 +03:00
guy-ingo
ccc8892a52 rectangle kernel works for powers of 2 2023-06-26 17:56:02 +03:00
guy-ingo
67e4ee2864 triangle kernel works as expected 2023-06-26 16:51:42 +03:00
guy-ingo
6c5fe47e55 big triangle replacment works 2023-06-25 15:28:56 +03:00
hadaringonyama
ed9de3d1e9 signed digits working 2023-06-20 14:48:13 +03:00
hadaringonyama
d01e0dbfb1 signed + top bm - WIP 2023-06-18 18:10:11 +03:00
hadaringonyama
6aa6fe0c1c adding sort by bucket size 2023-06-18 11:27:07 +03:00
hadaringonyama
a64df640de temp 2023-05-30 09:37:15 +03:00
hadaringonyama
1b2b9f2826 code cleaning 2023-05-29 09:22:10 +03:00
hadaringonyama
0a36a545bf remove short msm from extern call 2023-05-28 16:24:04 +03:00
hadaringonyama
407273dee3 bugs fixed 2023-05-28 16:13:47 +03:00
hadaringonyama
f55bd30e13 bugs fixed 2023-05-28 15:53:02 +03:00
HadarIngonyama
071c24ce5a supporting new curves (#74)
* Fix for local machines GoogleTest and CMake (#70)

GoogleTest fix, updated readme

* Supporting Additional Curves (#72)

* init commit - changes for supporting new curves

* refactor + additional curve (bls12-377 works, bn254 - not yet)

* general refactor + curves script + fixing bn245

* revert unnecessary changes + refactor new curve script

* add README and fix limbs_p=limbs_q case in python script

---------

Co-authored-by: Vitalii Hnatyk <vhnatyk@gmail.com>
Co-authored-by: guy-ingo <106763145+guy-ingo@users.noreply.github.com>
2023-05-15 15:51:48 +03:00
DmytroTym
08c34a5183 Fix for local machines GoogleTest and CMake (#70) (#73)
GoogleTest fix, updated readme

Co-authored-by: Vitalii Hnatyk <vhnatyk@gmail.com>
2023-05-15 15:23:06 +03:00
15 changed files with 2880 additions and 119 deletions

.gitignore vendored
View File

@@ -5,6 +5,8 @@
*.cubin
*.bin
*.fatbin
*.nsys-rep
*.ncu-rep
**/target
**/.vscode
**/.*lock*csv#

View File

@@ -1,4 +1,9 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -I. tests/msm_test.cu
work/test_msm
work/test_msm
test_msm_debug:
mkdir -p work
nvcc -o work/test_msm_debug -I. tests/msm_test.cu -g -G
work/test_msm_debug

File diff suppressed because it is too large

View File

@@ -3,7 +3,7 @@
#pragma once
template <typename S, typename P, typename A>
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream);
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream);
template <typename S, typename P, typename A>
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);
@@ -12,7 +12,7 @@ template <typename S, typename P, typename A>
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);
template <typename S, typename P, typename A>
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream);
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream);
template <typename S, typename P, typename A>
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);

View File

@@ -0,0 +1,899 @@
#ifndef MSM
#define MSM
#pragma once
#include <stdexcept>
#include <cuda.h>
#include <cooperative_groups.h>
#include "../../primitives/affine.cuh"
#include <iostream>
#include <vector>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_run_length_encode.cuh>
#include <cub/device/device_scan.cuh>
#include "../../utils/cuda_utils.cuh"
#include "../../primitives/projective.cuh"
#include "../../primitives/field.cuh"
#include "msm.cuh"
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define BIG_TRIANGLE
// #define SSM_SUM //WIP
template <typename P>
__global__ void single_stage_multi_reduction_kernel(P *v, P *v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tid_p = padding? (tid/(2*padding))*padding + tid%padding: tid;
int jump =block_size/2;
int block_id = tid_p/jump;
int block_tid = tid_p%jump;
unsigned read_ind = block_size*block_id + block_tid;
unsigned write_ind = tid;
v_r[write_stride? ((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride : write_ind] = padding? (tid%(2*padding)<padding)? v[read_ind] + v[read_ind + jump] : P::zero() :v[read_ind] + v[read_ind + jump];
}
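// Illustrative host-side sketch (simplified; plain ints stand in for points): one
// call of the kernel above adds element i to element i + block_size/2 inside every
// block; driving it repeatedly with block_size halved each time, as done later in
// this file, reduces each block to a single sum.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> v = {1, 2, 3, 4, 5, 6, 7, 8};      // a single block of size 8
  for (size_t block_size = v.size(); block_size > 1; block_size /= 2) {
    size_t jump = block_size / 2;
    for (size_t tid = 0; tid < jump; tid++)           // one "thread" per output element
      v[tid] = v[tid] + v[tid + jump];
  }
  printf("reduced sum = %d\n", v[0]);                 // prints 36
  return 0;
}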
//this kernel performs single scalar multiplication
//each thread multiplies a single scalar by a single point
template <typename P, typename S>
__global__ void ssm_kernel(S *scalars, P *points, P *results, unsigned N) {
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) results[tid] = scalars[tid]*points[tid];
}
//this kernel sums all the elements in a given vector using multiple threads
template <typename P>
__global__ void sum_reduction_kernel(P *v, P* v_r) {
unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
// Start at 1/2 block stride and divide by two each iteration
for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
// Each thread does work unless it is further than the stride
if (threadIdx.x < s) {
v[tid] = v[tid] + v[tid + s];
}
__syncthreads();
}
// Let thread 0 of this block write the final result
if (threadIdx.x == 0) {
v_r[blockIdx.x] = v[tid];
}
}
//this kernel initializes the buckets with zero points
//each thread initializes a different bucket
template <typename P>
__global__ void initialize_buckets_kernel(P *buckets, unsigned N) {
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) buckets[tid] = P::zero(); //zero point
}
//this kernel splits the scalars into digits of size c
//each thread splits a single scalar into nof_bms digits
template <typename S>
__global__ void split_scalars_kernel(unsigned *buckets_indices, unsigned *point_indices, S *scalars, unsigned total_size, unsigned msm_log_size, unsigned nof_bms, unsigned bm_bitsize, unsigned c, unsigned top_bm_nof_missing_bits){
constexpr unsigned sign_mask = 0x80000000;
// constexpr unsigned trash_bucket = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned bucket_index;
unsigned bucket_index2;
unsigned current_index;
unsigned msm_index = tid >> msm_log_size;
unsigned borrow = 0;
if (tid < total_size){
S scalar = scalars[tid];
for (unsigned bm = 0; bm < nof_bms; bm++)
{
bucket_index = scalar.get_scalar_digit(bm, c);
#ifdef SIGNED_DIG
bucket_index += borrow;
borrow = 0;
unsigned sign = 0;
if (bucket_index > (1<<(c-1))) {
bucket_index = (1 << c) - bucket_index;
borrow = 1;
sign = sign_mask;
}
#endif
current_index = bm * total_size + tid;
#ifdef SIGNED_DIG
point_indices[current_index] = sign | tid; //the point index is saved for later
#else
buckets_indices[current_index] = (msm_index<<(c+bm_bitsize)) | (bm<<c) | bucket_index; //the bucket module number and the msm number are appended at the msbs
if (scalar == S::zero() || scalar == S::one() || bucket_index==0) buckets_indices[current_index] = 0; //will be skipped
point_indices[current_index] = tid; //the point index is saved for later
#endif
}
}
}
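// Illustrative host-side sketch (simplified to a 32-bit toy scalar and msm_index = 0):
// how a scalar is cut into c-bit window digits and how each digit is packed into a
// bucket index together with its window number, mirroring
// (msm_index << (c + bm_bitsize)) | (bm << c) | digit used above.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned c = 8;             // window (digit) size in bits
  const unsigned bm_bitsize = 2;    // bits reserved for the window number
  uint32_t scalar = 0x12345678u;    // toy scalar
  for (unsigned bm = 0; bm < 32 / c; bm++) {
    uint32_t digit = (scalar >> (bm * c)) & ((1u << c) - 1u);
    uint32_t packed = (0u << (c + bm_bitsize)) | (bm << c) | digit;  // msm_index = 0
    printf("window %u: digit 0x%02x -> bucket index 0x%04x\n",
           bm, (unsigned)digit, (unsigned)packed);
  }
  return 0;
}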
template <typename P, typename A, typename S>
__global__ void add_ones_kernel(A *points, S* scalars, P* results, const unsigned msm_size, const unsigned run_length){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (msm_size + run_length - 1)/run_length; //129256
if (tid>=nof_threads) {
results[tid] = P::zero();
return;
}
const unsigned start_index = tid*run_length;
P sum = P::zero();
for (int i=start_index;i<min(start_index+run_length,msm_size);i++){
if (scalars[i] == S::one()) sum = sum + points[i];
}
results[tid] = sum;
}
__global__ void find_cutoff_kernel(unsigned *v, unsigned size, unsigned cutoff, unsigned run_length, unsigned *result){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (size + run_length - 1)/run_length;
if (tid>=nof_threads) {
return;
}
const unsigned start_index = tid*run_length;
for (int i=start_index;i<min(start_index+run_length,size-1);i++){
if (v[i] > cutoff && v[i+1] <= cutoff) {
result[0] = i+1;
return;
}
if (i == size - 1) {
result[0] = 0;
}
}
}
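// Illustrative host-side sketch (simplified to a single thread): the search above
// runs over bucket sizes sorted in descending order and records the index of the
// first entry at or below the threshold, i.e. how many "large" buckets take the
// multi-threaded accumulation path.
#include <cstdio>

int main() {
  unsigned sizes[] = {900, 400, 130, 90, 40, 7, 3, 1};  // sorted descending
  const unsigned n = sizeof(sizes) / sizeof(sizes[0]);
  const unsigned cutoff = 100;
  unsigned nof_large_buckets = 0;
  for (unsigned i = 0; i + 1 < n; i++) {
    if (sizes[i] > cutoff && sizes[i + 1] <= cutoff) {
      nof_large_buckets = i + 1;
      break;
    }
  }
  printf("buckets above the cutoff: %u\n", nof_large_buckets);  // prints 3
  return 0;
}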
__global__ void find_max_size(unsigned *bucket_sizes,unsigned *single_bucket_indices,unsigned c, unsigned *largest_bucket_size){
for (int i=0;;i++){
if (single_bucket_indices[i]&((1<<c)-1)){
largest_bucket_size[0] = bucket_sizes[i];
largest_bucket_size[1] = i;
break;
}
}
}
//this kernel adds up the points in each bucket
// __global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets,
// unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__ point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){
template <typename P, typename A>
__global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets, unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, const unsigned *__restrict__ point_indices, A *__restrict__ points, const unsigned nof_buckets, const unsigned nof_buckets_to_compute, const unsigned msm_idx_shift, const unsigned c){
constexpr unsigned sign_mask = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>=nof_buckets_to_compute){
return;
}
if ((single_bucket_indices[tid]&((1<<c)-1))==0)
{
return; //skip zero buckets
}
#ifdef SIGNED_DIG //todo - fix
const unsigned msm_index = single_bucket_indices[tid]>>msm_idx_shift;
const unsigned bm_index = (single_bucket_indices[tid]&((1<<msm_idx_shift)-1))>>c;
const unsigned bucket_index = msm_index * nof_buckets + bm_index * ((1<<(c-1))+1) + (single_bucket_indices[tid]&((1<<c)-1));
#else
unsigned msm_index = single_bucket_indices[tid]>>msm_idx_shift;
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid]&((1<<msm_idx_shift)-1));
#endif
const unsigned bucket_offset = bucket_offsets[tid];
const unsigned bucket_size = bucket_sizes[tid];
P bucket; //no explicit init needed - the first loop iteration assigns it, and zero-size buckets never reach this kernel
for (unsigned i = 0; i < bucket_size; i++) //add the relevant points starting from the relevant offset up to the bucket size
{
unsigned point_ind = point_indices[bucket_offset+i];
#ifdef SIGNED_DIG
unsigned sign = point_ind & sign_mask;
point_ind &= ~sign_mask;
A point = points[point_ind];
if (sign) point = A::neg(point);
#else
A point = points[point_ind];
#endif
bucket = i? bucket + point : P::from_affine(point);
}
buckets[bucket_index] = bucket;
}
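// Illustrative host-side sketch (simplified; ints stand in for curve points): after
// the radix sort and run-length encode, each bucket is described by an offset into
// the sorted point-index array and a size, and accumulation is a serial sum over
// that slice - what one thread of the kernel above does for one bucket.
#include <cstdio>

int main() {
  int points[] = {10, 20, 30, 40, 50};
  unsigned point_indices[]  = {2, 0, 4, 1, 3};   // point indices grouped by bucket
  unsigned bucket_offsets[] = {0, 3};            // where each bucket's slice starts
  unsigned bucket_sizes[]   = {3, 2};            // how many points each bucket owns
  for (unsigned b = 0; b < 2; b++) {
    int bucket = 0;
    for (unsigned i = 0; i < bucket_sizes[b]; i++)
      bucket += points[point_indices[bucket_offsets[b] + i]];
    printf("bucket %u sum = %d\n", b, bucket);   // prints 90 and 60
  }
  return 0;
}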
template <typename P, typename A>
__global__ void accumulate_large_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets, unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, const unsigned *__restrict__ point_indices, A *__restrict__ points, const unsigned nof_buckets, const unsigned nof_buckets_to_compute, const unsigned msm_idx_shift, const unsigned c, const unsigned threads_per_bucket, const unsigned max_run_length){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned large_bucket_index = tid/threads_per_bucket;
unsigned bucket_segment_index = tid%threads_per_bucket;
if (tid>=nof_buckets_to_compute*threads_per_bucket){
return;
}
if ((single_bucket_indices[large_bucket_index]&((1<<c)-1))==0) //not strictly needed - zero buckets are already skipped
{
return; //skip zero buckets
}
unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index;
const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index*max_run_length;
const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index*max_run_length? bucket_sizes[large_bucket_index] - bucket_segment_index*max_run_length :0;
P bucket;
unsigned run_length = min(bucket_size,max_run_length);
for (unsigned i = 0; i < run_length; i++) //add the relevant points starting from the relevant offset up to the bucket size
{
unsigned point_ind = point_indices[bucket_offset+i];
A point = points[point_ind];
bucket = i? bucket + point : P::from_affine(point); //initialize the bucket on the first iteration
}
buckets[write_bucket_index] = run_length? bucket : P::zero();
}
template <typename P>
__global__ void distribute_large_buckets_kernel(P* large_buckets, P* buckets, unsigned *single_bucket_indices, unsigned size){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>=size){
return;
}
buckets[single_bucket_indices[tid]] = large_buckets[tid];
}
//this kernel sums the entire bucket module
//each thread deals with a single bucket module
template <typename P>
__global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>=nof_bms) return;
#ifdef SIGNED_DIG
unsigned buckets_in_bm = (1<<c)+1;
#else
unsigned buckets_in_bm = (1<<c);
#endif
P line_sum = buckets[(tid+1)*buckets_in_bm-1];
final_sums[tid] = line_sum;
for (unsigned i = buckets_in_bm-2; i >0; i--)
{
line_sum = line_sum + buckets[tid*buckets_in_bm + i]; //using the running sum method
final_sums[tid] = final_sums[tid] + line_sum;
}
}
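// Illustrative host-side sketch (simplified; ints stand in for bucket points): the
// "running sum" method used above. To get sum_i i * B_i over buckets B_1..B_{n-1},
// keep a suffix sum and add it back each step; each B_i then contributes i times.
#include <cstdio>

int main() {
  const unsigned n = 1 << 4;                       // buckets per window (1 << c)
  int buckets[n];
  for (unsigned i = 0; i < n; i++) buckets[i] = (int)i + 1;  // toy bucket values

  int line_sum = buckets[n - 1];
  int window_sum = line_sum;
  for (unsigned i = n - 2; i > 0; i--) {
    line_sum += buckets[i];                        // suffix sum B_i + ... + B_{n-1}
    window_sum += line_sum;
  }

  int expected = 0;
  for (unsigned i = 1; i < n; i++) expected += (int)i * buckets[i];
  printf("running sum %d, direct %d\n", window_sum, expected);  // both 1360
  return 0;
}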
//this kernel uses single scalar multiplication to multiply each bucket by its index
//each thread deals with a single bucket
template <typename P, typename S>
__global__ void ssm_buckets_kernel(P* buckets, unsigned* single_bucket_indices, unsigned nof_buckets, unsigned c){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>nof_buckets) return;
unsigned bucket_index = single_bucket_indices[tid];
S scalar_bucket_multiplier;
scalar_bucket_multiplier = {bucket_index&((1<<c)-1), 0, 0, 0, 0, 0, 0, 0}; //the index without the bucket module index
buckets[bucket_index] = scalar_bucket_multiplier*buckets[bucket_index];
}
template <typename P>
__global__ void last_pass_kernel(P*final_buckets, P*final_sums, unsigned num_sums){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>num_sums) return;
final_sums[tid] = final_buckets[2*tid+1];
}
//this kernel computes the final result using the double and add algorithm
//it is done by a single thread
template <typename P, typename S>
__global__ void final_accumulation_kernel(P* final_sums, P* ones_result, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid>nof_msms) return;
P final_result = P::zero();
for (unsigned i = nof_bms; i >1; i--)
{
final_result = final_result + final_sums[i-1 + tid*nof_bms]; //add
for (unsigned j=0; j<c; j++) //double
{
final_result = final_result + final_result;
}
}
final_results[tid] = final_result + final_sums[tid*nof_bms] + ones_result[0];
// final_results[tid] = final_result + final_sums[tid*nof_bms];
}
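// Illustrative host-side sketch (simplified; integers stand in for points): the
// final accumulation is Horner's rule over the window sums s_0..s_{k-1}:
// result = ((s_{k-1} * 2^c + s_{k-2}) * 2^c + ...) + s_0, where multiplying by 2^c
// is done as c point doublings in the kernel above.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned c = 4, nof_bms = 3;
  uint64_t sums[nof_bms] = {5, 9, 2};       // toy per-window sums s_0, s_1, s_2

  uint64_t result = 0;
  for (unsigned i = nof_bms; i > 1; i--) {
    result += sums[i - 1];                  // add the next window sum
    for (unsigned j = 0; j < c; j++)        // double c times == multiply by 2^c
      result += result;
  }
  result += sums[0];

  uint64_t expected = sums[0] + (sums[1] << c) + (sums[2] << (2 * c));
  printf("horner %llu, direct %llu\n",      // both print 661
         (unsigned long long)result, (unsigned long long)expected);
  return 0;
}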
//this function computes msm using the bucket method
template <typename S, typename P, typename A>
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream) {
S *d_scalars;
A *d_points;
if (!on_device) {
//copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
cudaMallocAsync(&d_points, sizeof(A) * size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
}
else {
d_scalars = scalars;
d_points = points;
}
P *buckets;
//compute number of bucket modules and number of buckets in each module
unsigned nof_bms = bitsize/c;
unsigned msm_log_size = ceil(log2(size));
unsigned bm_bitsize = ceil(log2(nof_bms));
if (bitsize%c){
nof_bms++;
}
unsigned top_bm_nof_missing_bits = c*nof_bms - bitsize;
#ifdef SIGNED_DIG
unsigned nof_buckets = nof_bms*((1<<(c-1))+1); //signed digits
#else
unsigned nof_buckets = nof_bms<<c;
#endif
cudaMallocAsync(&buckets, sizeof(P) * nof_buckets, stream);
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, nof_buckets);
//accumulate ones
P *ones_results; //fix whole division, in last run in kernel too
const unsigned nof_runs = msm_log_size > 10? (1<<(msm_log_size-6)) : 16;
const unsigned run_length = (size + nof_runs -1)/nof_runs;
cudaMallocAsync(&ones_results, sizeof(P) * nof_runs, stream);
NUM_THREADS = min(1 << 8,nof_runs);
NUM_BLOCKS = (nof_runs + NUM_THREADS - 1) / NUM_THREADS;
add_ones_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_points, d_scalars, ones_results, size, run_length);
for (int s=nof_runs>>1;s>0;s>>=1){
NUM_THREADS = min(MAX_TH,s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(ones_results,ones_results,s*2,0,0,0);
}
unsigned *bucket_indices;
unsigned *point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms+1), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms+1), stream);
//split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (size * (nof_bms+1) + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size,
nof_bms, bm_bitsize, c, top_bm_nof_missing_bits); //+size - leaving the first bm free for the out of place sort later
//sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
unsigned *sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices,
point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
for (unsigned i = 0; i < nof_bms; i++) {
unsigned offset_out = i * size;
unsigned offset_in = offset_out + size;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in, bucket_indices + offset_out,
point_indices + offset_in, point_indices + offset_out, size, 0, sizeof(unsigned) * 8, stream);
}
cudaFreeAsync(sort_indices_temp_storage, stream);
//find bucket_sizes
unsigned *single_bucket_indices;
unsigned *bucket_sizes;
unsigned *nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*nof_buckets, stream);
cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned *encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms*size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms*size, stream);
cudaFreeAsync(encode_temp_storage, stream);
//get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
//sort by bucket sizes
unsigned h_nof_buckets_to_compute;
cudaMemcpyAsync(&h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
unsigned* sorted_bucket_sizes;
cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned)*h_nof_buckets_to_compute, stream);
unsigned* sorted_bucket_offsets;
cudaMallocAsync(&sorted_bucket_offsets, sizeof(unsigned)*h_nof_buckets_to_compute, stream);
unsigned* sort_offsets_temp_storage{};
size_t sort_offsets_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes,
sorted_bucket_sizes, bucket_offsets, sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes,
sorted_bucket_sizes, bucket_offsets, sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_offsets_temp_storage, stream);
unsigned* sorted_single_bucket_indices;
cudaMallocAsync(&sorted_single_bucket_indices, sizeof(unsigned)*h_nof_buckets_to_compute, stream);
unsigned* sort_single_temp_storage{};
size_t sort_single_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes,
sorted_bucket_sizes, single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes,
sorted_bucket_sizes, single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_single_temp_storage, stream);
//find large buckets
unsigned avarage_size = size/(1<<c);
// printf("avarage_size %u\n", avarage_size);
float large_bucket_factor = 10; //global param
unsigned bucket_th = ceil(large_bucket_factor*avarage_size);
// printf("bucket_th %u\n", bucket_th);
unsigned *nof_large_buckets;
cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream);
unsigned TOTAL_THREADS = 129000; //todo - device dependent
unsigned cutoff_run_length = max(2,h_nof_buckets_to_compute/TOTAL_THREADS);
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length -1)/cutoff_run_length;
NUM_THREADS = min(1 << 5,cutoff_nof_runs);
NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
find_cutoff_kernel<<<NUM_BLOCKS,NUM_THREADS,0,stream>>>(sorted_bucket_sizes,h_nof_buckets_to_compute,bucket_th,cutoff_run_length,nof_large_buckets);
unsigned h_nof_large_buckets;
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
unsigned *max_res;
cudaMallocAsync(&max_res, sizeof(unsigned)*2, stream);
find_max_size<<<1,1,0,stream>>>(sorted_bucket_sizes,sorted_single_bucket_indices,c,max_res);
unsigned h_max_res[2];
cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned)*2, cudaMemcpyDeviceToHost, stream);
// printf("h_nof_large_buckets %u\n", h_nof_large_buckets);
unsigned h_largest_bucket_size = h_max_res[0];
unsigned h_nof_zero_large_buckets = h_max_res[1];
// printf("h_largest_bucket_size %u\n", h_largest_bucket_size);
// printf("h_nof_zero_large_buckets %u\n", h_nof_zero_large_buckets);
unsigned large_buckets_to_compute = h_nof_large_buckets>h_nof_zero_large_buckets? h_nof_large_buckets-h_nof_zero_large_buckets : 0;
cudaStream_t stream2;
cudaStreamCreate(&stream2);
P* large_buckets;
if (large_buckets_to_compute>0 && bucket_th>0){
unsigned threads_per_bucket = 1<<(unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); //global param
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
// printf("threads_per_bucket %u\n", threads_per_bucket);
// printf("max_bucket_size_run_length %u\n", max_bucket_size_run_length);
unsigned total_large_buckets_size = large_buckets_to_compute*threads_per_bucket;
cudaMallocAsync(&large_buckets, sizeof(P)*total_large_buckets_size, stream);
NUM_THREADS = min(1 << 8,total_large_buckets_size);
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(large_buckets, sorted_bucket_offsets+h_nof_zero_large_buckets, sorted_bucket_sizes+h_nof_zero_large_buckets, sorted_single_bucket_indices+h_nof_zero_large_buckets, point_indices,
d_points, nof_buckets, large_buckets_to_compute, c+bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);
//reduce
for (int s=total_large_buckets_size>>1;s>large_buckets_to_compute-1;s>>=1){
NUM_THREADS = min(MAX_TH,s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream2>>>(large_buckets,large_buckets,s*2,0,0,0);
}
//distribute
NUM_THREADS = min(MAX_TH,large_buckets_to_compute);
NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS;
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream2>>>(large_buckets,buckets,sorted_single_bucket_indices+h_nof_zero_large_buckets,large_buckets_to_compute);
}
else{
h_nof_large_buckets = 0;
}
//launch the accumulation kernel with maximum threads
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute-h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, sorted_bucket_offsets+h_nof_large_buckets, sorted_bucket_sizes+h_nof_large_buckets, sorted_single_bucket_indices+h_nof_large_buckets, point_indices,
d_points, nof_buckets, h_nof_buckets_to_compute-h_nof_large_buckets, c+bm_bitsize, c);
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream2);
cudaDeviceSynchronize();
#ifdef SSM_SUM
//sum each bucket
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
ssm_buckets_kernel<fake_point, fake_scalar><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);
//sum each bucket module
P* final_results;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 1<<c;
NUM_BLOCKS = nof_bms;
sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(buckets, final_results);
#endif
P* final_results;
if (big_triangle){
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
//launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = nof_bms;
NUM_BLOCKS = 1;
#ifdef SIGNED_DIG
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c-1); //signed digits
#else
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c);
#endif
}
else{
unsigned source_bits_count = c;
bool odd_source_c = source_bits_count%2;
unsigned source_windows_count = nof_bms;
unsigned source_buckets_count = nof_buckets;
P *source_buckets = buckets;
buckets = nullptr;
P *target_buckets;
P *temp_buckets1;
P *temp_buckets2;
for (unsigned i = 0;; i++) {
// printf("round %u \n" ,i);
const unsigned target_bits_count = (source_bits_count + 1) >> 1; //c/2=8
// printf("target_bits_count %u \n" ,target_bits_count);
const unsigned target_windows_count = source_windows_count << 1; //nof bms*2 = 32
const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count,stream); //32*2^8*2^7 buckets
cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets
cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets
if (source_bits_count>0){
for(unsigned j=0;j<target_bits_count;j++){
unsigned last_j = target_bits_count-1;
NUM_THREADS = min(MAX_TH,(source_buckets_count>>(1+j)));
NUM_BLOCKS = ((source_buckets_count>>(1+j)) + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets1,j==target_bits_count-1? target_buckets: temp_buckets1,1<<(source_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,0,0);
unsigned nof_threads = (source_buckets_count>>(1+j));
NUM_THREADS = min(MAX_TH,nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS,0,stream>>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1<<target_bits_count: 0,1,0);
}
}
if (target_bits_count == 1) {
nof_bms = bitsize;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(target_buckets,final_results,nof_bms);
c = 1;
cudaFreeAsync(source_buckets,stream);
cudaFreeAsync(target_buckets,stream);
cudaFreeAsync(temp_buckets1,stream);
cudaFreeAsync(temp_buckets2,stream);
break;
}
cudaFreeAsync(source_buckets,stream);
cudaFreeAsync(temp_buckets1,stream);
cudaFreeAsync(temp_buckets2,stream);
source_buckets = target_buckets;
target_buckets = nullptr;
temp_buckets1 = nullptr;
temp_buckets2 = nullptr;
source_bits_count = target_bits_count;
odd_source_c = source_bits_count%2;
source_windows_count = target_windows_count;
source_buckets_count = target_buckets_count;
}
}
P* d_final_result;
if (!on_device)
cudaMallocAsync(&d_final_result, sizeof(P), stream);
//launch the double and add kernel, a single thread
final_accumulation_kernel<P, S><<<1,1,0,stream>>>(final_results, ones_results, on_device ? final_result : d_final_result, 1, nof_bms, c);
cudaStreamSynchronize(stream);
if (!on_device)
cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
//free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_result, stream);
}
cudaFreeAsync(buckets, stream);
#ifndef PHASE1_TEST
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
#endif
cudaFreeAsync(sorted_bucket_sizes,stream);
cudaFreeAsync(sorted_bucket_offsets,stream);
cudaFreeAsync(sorted_single_bucket_indices,stream);
cudaFreeAsync(nof_large_buckets,stream);
cudaFreeAsync(max_res,stream);
if (large_buckets_to_compute>0 && bucket_th>0) cudaFreeAsync(large_buckets,stream);
cudaFreeAsync(final_results, stream);
cudaFreeAsync(ones_results, stream);
cudaStreamSynchronize(stream);
}
//this function computes multiple msms using the bucket method - currently not working on this branch
template <typename S, typename P, typename A>
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream){
unsigned total_size = batch_size * msm_size;
S *d_scalars;
A *d_points;
if (!on_device) {
//copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream);
cudaMallocAsync(&d_points, sizeof(A) * total_size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice, stream);
}
else {
d_scalars = scalars;
d_points = points;
}
P *buckets;
//compute number of bucket modules and number of buckets in each module
unsigned nof_bms = bitsize/c;
if (bitsize%c){
nof_bms++;
}
unsigned msm_log_size = ceil(log2(msm_size));
unsigned bm_bitsize = ceil(log2(nof_bms));
unsigned nof_buckets = (nof_bms<<c);
unsigned total_nof_buckets = nof_buckets*batch_size;
cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream);
//launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);
unsigned *bucket_indices;
unsigned *point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
//split scalars into digits
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size,
msm_log_size, nof_bms, bm_bitsize, c,0); //+size - leaving the first bm free for the out of place sort later
//sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket
unsigned *sorted_bucket_indices;
unsigned *sorted_point_indices;
cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
unsigned *sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more info
cub::DeviceRadixSort::SortPairs(sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned)*8, stream);
cudaFreeAsync(sort_indices_temp_storage, stream);
//find bucket_sizes
unsigned *single_bucket_indices;
unsigned *bucket_sizes;
unsigned *total_nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned)*total_nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned)*total_nof_buckets, stream);
cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned *encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms*total_size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms*total_size, stream);
cudaFreeAsync(encode_temp_storage, stream);
//get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned)*total_nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
//launch the accumulation kernel with maximum threads
// NUM_THREADS = 1 << 8;
// NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
// accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices,
// d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize,c);
// #ifdef SSM_SUM
// //sum each bucket
// NUM_THREADS = 1 << 10;
// NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
// ssm_buckets_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS>>>(buckets, single_bucket_indices, nof_buckets, c);
// //sum each bucket module
// P* final_results;
// cudaMalloc(&final_results, sizeof(P) * nof_bms);
// NUM_THREADS = 1<<c;
// NUM_BLOCKS = nof_bms;
// sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(buckets, final_results);
// #endif
// #ifdef BIG_TRIANGLE
P* bm_sums;
cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream);
//launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = 1<<8;
NUM_BLOCKS = (nof_bms*batch_size + NUM_THREADS - 1) / NUM_THREADS;
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bm_sums, nof_bms*batch_size, c);
// #endif
P* d_final_results;
if (!on_device)
cudaMallocAsync(&d_final_results, sizeof(P)*batch_size, stream);
//launch the double and add kernel, a single thread for each msm
NUM_THREADS = 1<<8;
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS, 0, stream>>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c);
final_accumulation_kernel<P, S><<<NUM_BLOCKS,NUM_THREADS>>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c);
//copy final result to host
if (!on_device)
cudaMemcpyAsync(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost, stream);
//free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_results, stream);
}
cudaFreeAsync(buckets, stream);
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(sorted_bucket_indices, stream);
cudaFreeAsync(sorted_point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(total_nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
cudaFreeAsync(bm_sums, stream);
cudaStreamSynchronize(stream);
}
//this kernel converts affine points to projective points
//each thread deals with a single point
template <typename P, typename A>
__global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N){
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) proj_points[tid] = P::from_affine(affine_points[tid]);
}
//this function computes msm using ssm (single scalar multiplication)
template <typename S, typename P, typename A>
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream){ //works up to 2^8
S *scalars;
A *a_points;
P *p_points;
P *results;
cudaMallocAsync(&scalars, sizeof(S) * size, stream);
cudaMallocAsync(&a_points, sizeof(A) * size, stream);
cudaMallocAsync(&p_points, sizeof(P) * size, stream);
cudaMallocAsync(&results, sizeof(P) * size, stream);
//copy inputs to device
cudaMemcpyAsync(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
//convert to projective representation and multiply each point by its scalar using single scalar multiplication
unsigned NUM_THREADS = size;
to_proj_kernel<<<1,NUM_THREADS, 0, stream>>>(a_points, p_points, size);
ssm_kernel<<<1,NUM_THREADS, 0, stream>>>(scalars, p_points, results, size);
P *final_result;
cudaMallocAsync(&final_result, sizeof(P), stream);
//assuming msm size is a power of 2
//sum all the ssm results
NUM_THREADS = size;
sum_reduction_kernel<<<1,NUM_THREADS, 0, stream>>>(results, final_result);
//copy result to host
cudaStreamSynchronize(stream);
cudaMemcpyAsync(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
//free memory
cudaFreeAsync(scalars, stream);
cudaFreeAsync(a_points, stream);
cudaFreeAsync(p_points, stream);
cudaFreeAsync(results, stream);
cudaFreeAsync(final_result, stream);
}
//this function computes msm on the host using the naive method
template <typename A, typename S, typename P>
void reference_msm(S* scalars, A* a_points, unsigned size){
P *points = new P[size];
// P points[size];
for (unsigned i = 0; i < size ; i++)
{
points[i] = P::from_affine(a_points[i]);
}
P res = P::zero();
for (unsigned i = 0; i < size; i++)
{
res = res + scalars[i]*points[i];
}
std::cout<<"reference results"<<std::endl;
std::cout<<P::to_affine(res)<<std::endl;
}
unsigned get_optimal_c(const unsigned size) {
if (size < 17)
return 1;
// return 17;
return ceil(log2(size))-4;
}
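// Illustrative host-side sketch (assumes only the heuristic above): the window size
// c grows with the MSM size as roughly log2(size) - 4.
#include <cmath>
#include <cstdio>

int main() {
  unsigned sizes[] = {1u << 10, 1u << 16, 1u << 20, 1u << 26};
  for (unsigned s : sizes) {
    unsigned c = s < 17 ? 1 : (unsigned)std::ceil(std::log2((double)s)) - 4;
    printf("size %u -> c = %u\n", s, c);    // c = 6, 12, 16, 22
  }
  return 0;
}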
//this function is used to compute msms of size larger than 256
template <typename S, typename P, typename A>
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream){
unsigned c = 16;
unsigned bitsize = S::NBITS;
bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, big_triangle, stream);
}
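// Illustrative usage sketch (names follow the declarations in this file and the
// test below; treat it as an assumption, not a fixed API): large_msm drives the
// bucket method with a fixed window size c = 16 and takes host or device pointers,
// a big_triangle flag selecting the reduction strategy, and a CUDA stream.
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   large_msm<scalar_t, projective_t, affine_t>(
//       scalars, points, size, &result,
//       /*on_device=*/false, /*big_triangle=*/false, stream);
//   cudaStreamSynchronize(stream);
//   cudaStreamDestroy(stream);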
// this function is used to compute batches of msms of size larger than 256 - currently not working on this branch
template <typename S, typename P, typename A>
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream){
unsigned c = get_optimal_c(msm_size);
unsigned bitsize = 255;
batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream);
}
#endif

View File

@@ -1,19 +1,33 @@
#define G2_DEFINED
#include <iostream>
#include <chrono>
#include <vector>
#include "msm.cu"
#include "msm_clean.cu"
#include "../../utils/cuda_utils.cuh"
#include "../../primitives/projective.cuh"
#include "../../primitives/field.cuh"
#include "../../curves/bls12_381/curve_config.cuh"
// #include "../../curves/bls12_377/curve_config.cuh"
#include "../../curves/bn254/curve_config.cuh"
using namespace BLS12_381;
// using namespace BLS12_377;
using namespace BN254;
class Dummy_Scalar {
public:
static constexpr unsigned NBITS = 32;
unsigned x;
// unsigned p = 10;
unsigned p = 1<<30;
static HOST_DEVICE_INLINE Dummy_Scalar zero() {
return {0};
}
static HOST_DEVICE_INLINE Dummy_Scalar one() {
return {1};
}
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) {
os << scalar.x;
@@ -25,7 +39,7 @@ class Dummy_Scalar {
}
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) {
return {p1.x+p2.x};
return {(p1.x+p2.x)%p1.p};
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) {
@@ -36,11 +50,13 @@ class Dummy_Scalar {
return (p1.x == p2);
}
// static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) {
// return {Dummy_Scalar::neg(point.x)};
// }
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) {
return {scalar.p-scalar.x};
}
static HOST_INLINE Dummy_Scalar rand_host() {
return {(unsigned)rand()};
return {(unsigned)rand()%(1<<30)};
// return {(unsigned)rand()%10};
// return {(unsigned)rand()};
}
};
@@ -53,6 +69,10 @@ class Dummy_Projective {
return {0};
}
static HOST_DEVICE_INLINE Dummy_Projective one() {
return {1};
}
static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) {
return {point.x};
}
@@ -61,9 +81,9 @@ class Dummy_Projective {
return {point.x};
}
// static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) {
// return {Dummy_Scalar::neg(point.x)};
// }
static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) {
return {Dummy_Scalar::neg(point.x)};
}
friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) {
return {p1.x+p2.x};
@@ -103,7 +123,8 @@ class Dummy_Projective {
}
static HOST_INLINE Dummy_Projective rand_host() {
return {(unsigned)rand()};
return {(unsigned)rand()%10};
// return {(unsigned)rand()};
}
};
@@ -113,68 +134,110 @@ typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
// typedef scalar_t test_scalar;
// typedef g2_projective_t test_projective;
// typedef g2_affine_t test_affine;
// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;
int main()
{
unsigned batch_size = 4;
unsigned msm_size = 1<<15;
bool on_device = false;
unsigned batch_size = 1;
unsigned msm_size = (1<<24) -1;
// unsigned msm_size = 9215384;
// unsigned msm_size = (1<<10) - 456;
// unsigned msm_size = 20;
// unsigned msm_size = 6075005;
unsigned N = batch_size*msm_size;
test_scalar *scalars = new test_scalar[N];
test_affine *points = new test_affine[N];
for (unsigned i=0;i<N;i++){
scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
// scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
points[i] = (i%msm_size < 10)? test_projective::to_affine(test_projective::rand_host()): points[i-10];
// scalars[i] = test_scalar::rand_host();
// scalars[i] = i >462560? test_scalar::rand_host() : (test_scalar::one() + test_scalar::one());
scalars[i] = i >20? test_scalar::rand_host() : i>50? (test_scalar::one() + test_scalar::one()) : (test_scalar::one() + test_scalar::one()+ test_scalar::one());
// points[i] = test_projective::to_affine(test_projective::rand_host());
}
std::cout<<"finished generating"<<std::endl;
test_scalar *d_scalars;
test_affine *d_points;
if (on_device) {
//copy scalars and point to gpu
cudaMalloc(&d_scalars, sizeof(test_scalar) * N);
cudaMalloc(&d_points, sizeof(test_affine) * N);
cudaMemcpy(d_scalars, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_points, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice);
}
// projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
// test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
test_projective large_res[batch_size];
test_projective batched_large_res[batch_size];
test_projective large_res[batch_size*2];
test_projective *d_large_res;
cudaMalloc(&d_large_res, sizeof(test_projective) * batch_size*2);
// test_projective batched_large_res[batch_size];
// fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
// fake_point batched_large_res[256];
// short_msm<scalar_t, projective_t, affine_t>(scalars, points, N, short_res);
for (unsigned i=0;i<batch_size;i++){
large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
// for (unsigned i=0;i<batch_size;i++){
// large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
// std::cout<<"final result large"<<std::endl;
// std::cout<<test_projective::to_affine(*large_res)<<std::endl;
}
// }
auto begin = std::chrono::high_resolution_clock::now();
batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
// large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false);
// batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
// large_msm<test_scalar, test_projective, test_affine>(on_device? d_scalars : scalars, on_device? d_points : points, msm_size, on_device? d_large_res : large_res, on_device, true,stream1);
// std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
large_msm<test_scalar, test_projective, test_affine>(on_device? d_scalars : scalars, on_device? d_points : points, msm_size, on_device? d_large_res+1 : large_res+1, on_device, false,stream2);
// test_reduce_triangle(scalars);
// test_reduce_rectangle(scalars);
// test_reduce_single(scalars);
// test_reduce_var(scalars);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
if (on_device)
cudaMemcpy(large_res, d_large_res, sizeof(test_projective) * batch_size*2, cudaMemcpyDeviceToHost);
// std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
std::cout<<test_projective::to_affine(large_res[1])<<std::endl;
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
std::cout<<"final results batched large"<<std::endl;
bool success = true;
for (unsigned i = 0; i < batch_size; i++)
{
std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
std::cout<<"good"<<std::endl;
}
else{
std::cout<<"miss"<<std::endl;
std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
success = false;
}
}
if (success){
std::cout<<"success!"<<std::endl;
}
// std::cout<<"final results batched large"<<std::endl;
// bool success = true;
// for (unsigned i = 0; i < batch_size; i++)
// {
// std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
// if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
// std::cout<<"good"<<std::endl;
// }
// else{
// std::cout<<"miss"<<std::endl;
// std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
// success = false;
// }
// }
// if (success){
// std::cout<<"success!"<<std::endl;
// }
// std::cout<<batched_large_res[0]<<std::endl;
// std::cout<<batched_large_res[1]<<std::endl;

View File

@@ -12,7 +12,7 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[
{
try
{
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, stream);
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -53,7 +53,7 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
{
try
{
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true, stream);
large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true, false, stream);
cudaStreamSynchronize(stream);
return 0;
}

View File

@@ -12,7 +12,7 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[
{
try
{
large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, stream);
large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381:
{
try
{
large_msm(d_scalars, d_points, count, d_out, true, stream);
large_msm(d_scalars, d_points, count, d_out, true, false, stream);
cudaStreamSynchronize(stream);
return 0;
}

View File

@@ -2,4 +2,4 @@
#include "lde.cu"
#include "msm.cu"
#include "ve_mod_mult.cu"
#include "poseidon.cu"
#include "poseidon.cu"

View File

@@ -2,6 +2,9 @@
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#if defined(G2_DEFINED)
#include "../../primitives/extension_field.cuh"
#endif
#include "params.cuh"

View File

@@ -12,7 +12,7 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[],
{
try
{
large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, stream);
large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po
{
try
{
large_msm(d_scalars, d_points, count, d_out, true, stream);
large_msm(d_scalars, d_points, count, d_out, true, false, stream);
cudaStreamSynchronize(stream);
return 0;
}

View File

@@ -16,4 +16,4 @@ extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective
!((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) &&
!((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero()));
}
#endif
#endif

View File

@@ -11,7 +11,7 @@ int msm_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *out, ${CURVE_NAME_U}
{
try
{
large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, stream);
large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, false, stream);
return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* out
{
try
{
large_msm(d_scalars, d_points, count, d_out, true, stream);
large_msm(d_scalars, d_points, count, d_out, true, false, stream);
cudaStreamSynchronize(stream);
return 0;
}

View File

@@ -16,4 +16,4 @@ extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1,
!((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) &&
!((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero()));
}
#endif
#endif

View File

@@ -756,12 +756,55 @@ pub fn generate_random_points_bn254(
.collect()
}
pub fn generate_random_points100_bn254(
count: usize,
mut rng: Box<dyn RngCore>,
) -> Vec<PointAffineNoInfinity_BN254> {
let mut res = Vec::new();
for i in 0..count{
if (i<100) {
res.push(Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z());
}
else {
res.push(res[i-100]);
}
}
return res;
}
pub fn generate_random_points_proj_bn254(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point_BN254> {
(0..count)
.map(|_| Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)))
.collect()
}
use rand::Rng;
pub fn generate_random_scalars_skewed_bn254(
count: usize,
mut rng: Box<dyn RngCore>,
) -> Vec<ScalarField_BN254> {
let mut res = Vec::new();
let mut nof_repeating_scalars;
let mut rng2 = rand::thread_rng();
nof_repeating_scalars = rng2.gen_range(5..50);
println!("{} nof_repeating_scalars", nof_repeating_scalars);
for i in 0..count{
if (i<nof_repeating_scalars) {
res.push(ScalarField_BN254::from_ark(Fr_BN254::rand(&mut rng).into_repr()));
}
else if i < 200000 {
res.push(res[i-nof_repeating_scalars]);
}
else {
res.push(ScalarField_BN254::from_ark(Fr_BN254::rand(&mut rng).into_repr()));
}
}
return res;
}
pub fn generate_random_scalars_bn254(count: usize, mut rng: Box<dyn RngCore>) -> Vec<ScalarField_BN254> {
(0..count)
.map(|_| ScalarField_BN254::from_ark(Fr_BN254::rand(&mut rng).into_repr()))
@@ -868,13 +911,16 @@ pub(crate) mod tests_bn254 {
#[test]
fn test_msm() {
let test_sizes = [6, 9];
let test_sizes = [131071,9215384, 6075005, 6699232, 12180757];
for pow2 in test_sizes {
let count = 1 << pow2;
for count in test_sizes {
// for pow2 in test_sizes {
// let count = 1 << pow2;
let seed = None; // set Some to provide seed
let points = generate_random_points_bn254(count, get_rng_bn254(seed));
let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed));
// let points = generate_random_points_bn254(count, get_rng_bn254(seed));
let points = generate_random_points100_bn254(count, get_rng_bn254(seed));
// let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed));
let scalars = generate_random_scalars_skewed_bn254(count, get_rng_bn254(seed));
let msm_result = msm_bn254(&points, &scalars, 0);
@@ -920,10 +966,11 @@ pub(crate) mod tests_bn254 {
#[test]
fn test_commit() {
let test_size = 1 << 8;
let test_size = 1 << 24;
let seed = Some(0);
let (mut scalars, mut d_scalars, _) = set_up_scalars_bn254(test_size, 0, false);
let mut points = generate_random_points_bn254(test_size, get_rng_bn254(seed));
// let mut points = generate_random_points_bn254(test_size, get_rng_bn254(seed));
let mut points = generate_random_points100_bn254(test_size, get_rng_bn254(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm_bn254(&points, &scalars, 0);