#ifndef MSM
#define MSM
#pragma once
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/error_handler.cuh"
#include "msm.cuh"
#include <cooperative_groups.h>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_run_length_encode.cuh>
#include <cub/device/device_scan.cuh>
#include <cuda.h>
#include <iostream>
#include <stdexcept>
#include <vector>
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define BIG_TRIANGLE
// #define SSM_SUM //WIP
template <typename P>
__global__ void single_stage_multi_reduction_kernel(
P* v,
P* v_r,
unsigned block_size,
unsigned write_stride,
unsigned write_phase,
unsigned padding,
unsigned num_of_threads)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
int jump = block_size / 2;
int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
int block_id = tid_p / jump;
int block_tid = tid_p % jump;
unsigned read_ind = block_size * block_id + block_tid;
unsigned write_ind = tid;
unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind;
P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
: v[read_ind] + v[read_ind + jump];
v_r[v_r_key] = v_r_value;
}
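// Illustrative note (not part of the original code): with padding = 0 and write_stride = 0 this kernel
// performs one level of a pairwise tree reduction - thread i adds the two elements of its block that
// lie block_size/2 apart. Reducing an 8-element array in place would look like:
//
//   for (int s = 8 >> 1; s > 0; s >>= 1)                                 // s = 4, 2, 1
//     single_stage_multi_reduction_kernel<<<1, s, 0, stream>>>(v, v, s * 2, 0, 0, 0, s);
//
// after which v[0] holds the total. Non-zero write_stride/write_phase interleave the outputs of two
// half-size reductions, which is how the bucket-reduction loop further below splits each c-bit window
// into two narrower windows of (c + 1) / 2 bits each.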
// this kernel performs single scalar multiplication
// each thread multiplies a single scalar by a single point
template <typename P, typename S>
__global__ void ssm_kernel(S* scalars, P* points, P* results, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) results[tid] = scalars[tid] * points[tid];
}
// this kernel sums the elements of a vector within each thread block - one partial sum per block
template <typename P>
__global__ void sum_reduction_kernel(P* v, P* v_r)
{
unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
// Start at 1/2 block stride and divide by two each iteration
for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
// Each thread does work unless it is further than the stride
if (threadIdx.x < s) { v[tid] = v[tid] + v[tid + s]; }
__syncthreads();
}
// Let the thread 0 for this block write the final result
if (threadIdx.x == 0) { v_r[blockIdx.x] = v[tid]; }
}
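// Illustrative note (not part of the original code): this reduction assumes blockDim.x is a power of two
// and that v holds at least gridDim.x * blockDim.x elements; v is modified in place and v_r receives one
// partial sum per block. A minimal sketch for a 256-element device array d_values:
//
//   sum_reduction_kernel<<<1, 256, 0, stream>>>(d_values, d_result);   // d_result[0] = sum of d_values
//
// In this file it is only launched with a single block (short_msm) or with one block per bucket module
// (the disabled SSM_SUM path), so no second reduction pass over v_r is needed.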
// this kernel initializes the buckets with zero points
// each thread initializes a different bucket
template <typename P>
__global__ void initialize_buckets_kernel(P* buckets, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) buckets[tid] = P::zero(); // zero point
}
// this kernel splits the scalars into digits of size c
// each thread splits a single scalar into nof_bms digits
template <typename S>
__global__ void split_scalars_kernel(
unsigned* buckets_indices,
unsigned* point_indices,
S* scalars,
unsigned total_size,
unsigned msm_log_size,
unsigned nof_bms,
unsigned bm_bitsize,
unsigned c)
{
constexpr unsigned sign_mask = 0x80000000;
// constexpr unsigned trash_bucket = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned bucket_index;
unsigned bucket_index2;
unsigned current_index;
unsigned msm_index = tid >> msm_log_size;
unsigned borrow = 0;
if (tid < total_size) {
S scalar = scalars[tid];
for (unsigned bm = 0; bm < nof_bms; bm++) {
bucket_index = scalar.get_scalar_digit(bm, c);
#ifdef SIGNED_DIG
bucket_index += borrow;
borrow = 0;
unsigned sign = 0;
if (bucket_index > (1 << (c - 1))) {
bucket_index = (1 << c) - bucket_index;
borrow = 1;
sign = sign_mask;
}
#endif
current_index = bm * total_size + tid;
#ifdef SIGNED_DIG
point_indices[current_index] = sign | tid; // the point index is saved for later
#else
buckets_indices[current_index] =
(msm_index << (c + bm_bitsize)) | (bm << c) |
bucket_index; // the bucket module number and the msm number are appended at the msbs
if (scalar == S::zero() || scalar == S::one() || bucket_index == 0)
buckets_indices[current_index] = 0; // will be skipped
point_indices[current_index] = tid; // the point index is saved for later
#endif
}
}
}
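// Illustrative note (not part of the original code): get_scalar_digit(bm, c) extracts bits
// [bm*c, (bm+1)*c) of the scalar, so every scalar contributes one c-bit digit per bucket module. For
// example, with c = 16 and bm_bitsize = 4, digit 0x1234 of window bm = 3 in msm 0 is encoded as
//
//   (0 << (16 + 4)) | (3 << 16) | 0x1234 == 0x31234
//
// which is the key used by the radix sort below: after sorting, all entries that fall into the same
// bucket of the same bucket module (and the same msm in the batched case) are contiguous. Entries whose
// digit is zero, or whose scalar is 0 or 1, get key 0, sort to the front and are skipped later.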
template <typename P, typename A, typename S>
__global__ void add_ones_kernel(A* points, S* scalars, P* results, const unsigned msm_size, const unsigned run_length)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (msm_size + run_length - 1) / run_length; // ceil(msm_size / run_length)
if (tid >= nof_threads) {
results[tid] = P::zero();
return;
}
const unsigned start_index = tid * run_length;
P sum = P::zero();
for (int i = start_index; i < min(start_index + run_length, msm_size); i++) {
if (scalars[i] == S::one()) sum = sum + points[i];
}
results[tid] = sum;
}
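// Illustrative note (not part of the original code): scalars equal to one are excluded from the bucket
// pipeline (split_scalars_kernel maps them to key 0), so their points are summed directly here instead.
// Each thread scans a run of run_length inputs and accumulates the points whose scalar is 1; the
// per-thread partial sums in `results` are then folded with single_stage_multi_reduction_kernel, e.g.
//
//   for (int s = nof_runs >> 1; s > 0; s >>= 1)
//     single_stage_multi_reduction_kernel<<<(s + MAX_TH - 1) / MAX_TH, min(MAX_TH, s), 0, stream>>>(
//       results, results, s * 2, 0, 0, 0, s);
//
// leaving the sum of all "ones" points in results[0] for final_accumulation_kernel.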
template <typename S>
__global__ void
find_cutoff_kernel(unsigned* v, unsigned size, unsigned cutoff, unsigned run_length, S* fake_param, unsigned* result)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (size + run_length - 1) / run_length;
if (tid >= nof_threads) { return; }
const unsigned start_index = tid * run_length;
for (int i = start_index; i < min(start_index + run_length, size - 1); i++) {
if (v[i] > cutoff && v[i + 1] <= cutoff) {
result[0] = i + 1;
return;
}
}
if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; }
}
template <typename S>
__global__ void find_max_size(
unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, S* fake_param, unsigned* largest_bucket_size)
{
for (int i = 0;; i++) {
if (single_bucket_indices[i] & ((1 << c) - 1)) {
largest_bucket_size[0] = bucket_sizes[i];
largest_bucket_size[1] = i;
break;
}
}
}
// this kernel adds up the points in each bucket
// __global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets,
// unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__
// point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){
template <typename P, typename A>
__global__ void accumulate_buckets_kernel(
P* __restrict__ buckets,
unsigned* __restrict__ bucket_offsets,
unsigned* __restrict__ bucket_sizes,
unsigned* __restrict__ single_bucket_indices,
const unsigned* __restrict__ point_indices,
A* __restrict__ points,
const unsigned nof_buckets,
const unsigned nof_buckets_to_compute,
const unsigned msm_idx_shift,
const unsigned c)
{
constexpr unsigned sign_mask = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets_to_compute) return;
if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) {
return; // skip zero buckets
}
#ifdef SIGNED_DIG // todo - fix
const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c;
const unsigned bucket_index =
msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1));
#else
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
#endif
const unsigned bucket_offset = bucket_offsets[tid];
const unsigned bucket_size = bucket_sizes[tid];
P bucket; // get rid of init buckets? no.. because what about buckets with no points
for (unsigned i = 0; i < bucket_size;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
#ifdef SIGNED_DIG
unsigned sign = point_ind & sign_mask;
point_ind &= ~sign_mask;
A point = points[point_ind];
if (sign) point = A::neg(point);
#else
A point = points[point_ind];
#endif
bucket = i ? bucket + point : P::from_affine(point);
}
buckets[bucket_index] = bucket;
}
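// Illustrative note (not part of the original code): with msm_idx_shift = c + bm_bitsize, the sort key
// 0x31234 from the example above decodes to msm_index = 0 and a flat bucket index of
// 0 * nof_buckets + ((3 << 16) | 0x1234), i.e. bucket 0x1234 of bucket module 3. Each thread owns one
// occupied bucket: it walks its run of sorted point indices and accumulates the corresponding points,
// while empty buckets keep the zero point written by initialize_buckets_kernel.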
template <typename P, typename A>
__global__ void accumulate_large_buckets_kernel(
P* __restrict__ buckets,
unsigned* __restrict__ bucket_offsets,
unsigned* __restrict__ bucket_sizes,
unsigned* __restrict__ single_bucket_indices,
const unsigned* __restrict__ point_indices,
A* __restrict__ points,
const unsigned nof_buckets,
const unsigned nof_buckets_to_compute,
const unsigned msm_idx_shift,
const unsigned c,
const unsigned threads_per_bucket,
const unsigned max_run_length)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned large_bucket_index = tid / threads_per_bucket;
unsigned bucket_segment_index = tid % threads_per_bucket;
if (tid >= nof_buckets_to_compute * threads_per_bucket) { return; }
if ((single_bucket_indices[large_bucket_index] & ((1 << c) - 1)) == 0) { // don't need
return; // skip zero buckets
}
unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index;
const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index * max_run_length;
const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index * max_run_length
? bucket_sizes[large_bucket_index] - bucket_segment_index * max_run_length
: 0;
P bucket;
unsigned run_length = min(bucket_size, max_run_length);
for (unsigned i = 0; i < run_length;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
A point = points[point_ind];
bucket = i ? bucket + point : P::from_affine(point); // init empty buckets
}
buckets[write_bucket_index] = run_length ? bucket : P::zero();
}
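// Illustrative note (not part of the original code): a "large" bucket (one whose size exceeds the
// cutoff chosen in bucket_method_msm) is split across threads_per_bucket threads, each summing at most
// max_run_length of its points. The partial sums are stored segment-major,
//
//   write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index
//
// so the pairwise reduction loop that follows folds all segments of every bucket together and leaves
// the final sums in the first nof_buckets_to_compute slots, ready for distribute_large_buckets_kernel.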
template <typename P>
__global__ void
distribute_large_buckets_kernel(P* large_buckets, P* buckets, unsigned* single_bucket_indices, unsigned size)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= size) { return; }
buckets[single_bucket_indices[tid]] = large_buckets[tid];
}
// this kernel computes the weighted sum (sum of index * bucket) of each bucket module
// each thread deals with a single bucket module
template <typename P>
__global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_bms) return;
#ifdef SIGNED_DIG
unsigned buckets_in_bm = (1 << c) + 1;
#else
unsigned buckets_in_bm = (1 << c);
#endif
P line_sum = buckets[(tid + 1) * buckets_in_bm - 1];
final_sums[tid] = line_sum;
for (unsigned i = buckets_in_bm - 2; i > 0; i--) {
line_sum = line_sum + buckets[tid * buckets_in_bm + i]; // using the running sum method
final_sums[tid] = final_sums[tid] + line_sum;
}
}
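// Illustrative note (not part of the original code): the running sum relies on the identity
//
//   sum_{i=1}^{n-1} i * B_i  =  sum_{k=1}^{n-1} (B_k + B_{k+1} + ... + B_{n-1}),   n = buckets_in_bm
//
// line_sum holds the suffix sum B_i + ... + B_{n-1} as i walks down from n-1 to 1, and final_sums[tid]
// accumulates these suffix sums, so every bucket ends up weighted by its index using only point
// additions - no per-bucket scalar multiplications are needed.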
// this kernel uses single scalar multiplication to multiply each bucket by its index
// each thread deals with a single bucket
template <typename P, typename S>
__global__ void ssm_buckets_kernel(P* buckets, unsigned* single_bucket_indices, unsigned nof_buckets, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets) return;
unsigned bucket_index = single_bucket_indices[tid];
S scalar_bucket_multiplier;
scalar_bucket_multiplier = {
bucket_index & ((1 << c) - 1), 0, 0, 0, 0, 0, 0, 0}; // the index without the bucket module index
buckets[bucket_index] = scalar_bucket_multiplier * buckets[bucket_index];
}
template <typename P>
__global__ void last_pass_kernel(P* final_buckets, P* final_sums, unsigned num_sums)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= num_sums) return;
final_sums[tid] = final_buckets[2 * tid + 1];
}
// this kernel computes the final result using the double and add algorithm
// each msm is processed by a single thread
template <typename P, typename S>
__global__ void final_accumulation_kernel(
P* final_sums, P* ones_result, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c, bool add_ones)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_msms) return;
P final_result = P::zero();
for (unsigned i = nof_bms; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
for (unsigned j = 0; j < c; j++) // double
{
final_result = final_result + final_result;
}
}
if (add_ones)
final_results[tid] = final_result + final_sums[tid * nof_bms] + ones_result[0];
else
final_results[tid] = final_result + final_sums[tid * nof_bms];
}
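// Illustrative note (not part of the original code): this is Horner's rule over the bucket-module sums,
//
//   result = ((S_{m-1} * 2^c + S_{m-2}) * 2^c + ...) * 2^c + S_0 = sum_{w=0}^{m-1} S_w * 2^(c*w)
//
// with m = nof_bms: each iteration adds the next most significant window sum and then doubles c times,
// so only c * (m - 1) doublings and m point additions are needed per msm (plus the separately
// accumulated sum of points whose scalar is one, when add_ones is set).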
// this function computes msm using the bucket method
template <typename S, typename P, typename A>
void bucket_method_msm(
unsigned bitsize,
unsigned c,
S* scalars,
A* points,
unsigned size,
P* final_result,
bool on_device,
bool big_triangle,
unsigned large_bucket_factor,
cudaStream_t stream)
{
S* d_scalars;
A* d_points;
if (!on_device) {
// copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
cudaMallocAsync(&d_points, sizeof(A) * size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
} else {
d_scalars = scalars;
d_points = points;
}
P* buckets;
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms = (bitsize + c - 1) / c;
unsigned msm_log_size = ceil(log2(size));
unsigned bm_bitsize = ceil(log2(nof_bms));
#ifdef SIGNED_DIG
unsigned nof_buckets = nof_bms * ((1 << (c - 1)) + 1); // signed digits
#else
unsigned nof_buckets = nof_bms << c;
#endif
cudaMallocAsync(&buckets, sizeof(P) * nof_buckets, stream);
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, nof_buckets);
// accumulate ones
P* ones_results; // TODO: handle sizes that are not a whole multiple of the run length (also in the last run inside the kernel)
const unsigned nof_runs = msm_log_size > 10 ? (1 << (msm_log_size - 6)) : 16;
const unsigned run_length = (size + nof_runs - 1) / nof_runs;
cudaMallocAsync(&ones_results, sizeof(P) * nof_runs, stream);
NUM_THREADS = min(1 << 8, nof_runs);
NUM_BLOCKS = (nof_runs + NUM_THREADS - 1) / NUM_THREADS;
add_ones_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_points, d_scalars, ones_results, size, run_length);
for (int s = nof_runs >> 1; s > 0; s >>= 1) {
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
ones_results, ones_results, s * 2, 0, 0, 0, s);
}
unsigned* bucket_indices;
unsigned* point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms + 1), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms + 1), stream);
// split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (size * (nof_bms + 1) + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, nof_bms, bm_bitsize, c);
//+size - leaving the first bm free for the out of place sort later
// sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to
// each bucket
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices,
point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
for (unsigned i = 0; i < nof_bms; i++) {
unsigned offset_out = i * size;
unsigned offset_in = offset_out + size;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in,
bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, size, 0, sizeof(unsigned) * 8,
stream);
}
cudaFreeAsync(sort_indices_temp_storage, stream);
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * nof_buckets, stream);
cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms * size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms * size, stream);
cudaFreeAsync(encode_temp_storage, stream);
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
// sort by bucket sizes
unsigned h_nof_buckets_to_compute;
cudaMemcpyAsync(&h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
// if there are no buckets to compute, the result is the zero point
if (h_nof_buckets_to_compute == 0) {
if (!on_device)
final_result[0] = P::zero();
else {
P* h_final_result = (P*)malloc(sizeof(P));
h_final_result[0] = P::zero();
cudaMemcpyAsync(final_result, h_final_result, sizeof(P), cudaMemcpyHostToDevice, stream);
}
return;
}
unsigned* sorted_bucket_sizes;
cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sorted_bucket_offsets;
cudaMallocAsync(&sorted_bucket_offsets, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sort_offsets_temp_storage{};
size_t sort_offsets_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_offsets_temp_storage, stream);
unsigned* sorted_single_bucket_indices;
cudaMallocAsync(&sorted_single_bucket_indices, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sort_single_temp_storage{};
size_t sort_single_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, single_bucket_indices,
sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, single_bucket_indices,
sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_single_temp_storage, stream);
// find large buckets
unsigned average_size = size / (1 << c);
unsigned bucket_th = large_bucket_factor * average_size;
unsigned* nof_large_buckets;
cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream);
cudaMemsetAsync(nof_large_buckets, 0, sizeof(unsigned), stream);
unsigned TOTAL_THREADS = 129000; // todo - device dependent
unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS);
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length;
NUM_THREADS = min(1 << 5, cutoff_nof_runs);
NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
find_cutoff_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, d_scalars, nof_large_buckets);
unsigned h_nof_large_buckets;
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
unsigned* max_res;
cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream);
find_max_size<<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, d_scalars, max_res);
unsigned h_max_res[2];
cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream);
unsigned h_largest_bucket_size = h_max_res[0];
unsigned h_nof_zero_large_buckets = h_max_res[1];
unsigned large_buckets_to_compute =
h_nof_large_buckets > h_nof_zero_large_buckets ? h_nof_large_buckets - h_nof_zero_large_buckets : 0;
cudaStream_t stream2;
cudaStreamCreate(&stream2);
P* large_buckets;
if (large_buckets_to_compute > 0 && bucket_th > 0) {
unsigned threads_per_bucket =
1 << (unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); // global param
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
unsigned total_large_buckets_size = large_buckets_to_compute * threads_per_bucket;
cudaMallocAsync(&large_buckets, sizeof(P) * total_large_buckets_size, stream);
NUM_THREADS = min(1 << 8, total_large_buckets_size);
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, sorted_bucket_offsets + h_nof_zero_large_buckets, sorted_bucket_sizes + h_nof_zero_large_buckets,
sorted_single_bucket_indices + h_nof_zero_large_buckets, point_indices, d_points, nof_buckets,
large_buckets_to_compute, c + bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);
// reduce
for (int s = total_large_buckets_size >> 1; s > large_buckets_to_compute - 1; s >>= 1) {
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, large_buckets, s * 2, 0, 0, 0, s);
CHECK_LAST_CUDA_ERROR();
}
// distribute
NUM_THREADS = min(MAX_TH, large_buckets_to_compute);
NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS;
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, large_buckets_to_compute);
} else {
h_nof_large_buckets = 0;
}
// launch the accumulation kernel with maximum threads
if (h_nof_buckets_to_compute > h_nof_large_buckets) {
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets,
sorted_single_bucket_indices + h_nof_large_buckets, point_indices, d_points, nof_buckets,
h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
}
// all the large buckets need to be accumulated before the final summation
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream2);
#ifdef SSM_SUM
// sum each bucket
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
ssm_buckets_kernel<fake_point, fake_scalar>
<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);
// sum each bucket module
P* final_results;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 1 << c;
NUM_BLOCKS = nof_bms;
sum_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results);
#endif
P* d_final_result;
if (!on_device) cudaMallocAsync(&d_final_result, sizeof(P), stream);
P* final_results;
if (big_triangle) {
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
// launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = nof_bms;
NUM_BLOCKS = 1;
#ifdef SIGNED_DIG
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, final_results, nof_bms, c - 1); // signed digits
#else
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c);
#endif
} else {
unsigned source_bits_count = c;
bool odd_source_c = source_bits_count % 2;
unsigned source_windows_count = nof_bms;
unsigned source_buckets_count = nof_buckets;
P* source_buckets = buckets;
buckets = nullptr;
P* target_buckets;
P* temp_buckets1;
P* temp_buckets2;
for (unsigned i = 0;; i++) {
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
const unsigned target_windows_count = source_windows_count << 1; // nof bms*2 = 32
const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count, stream); // target_windows_count * 2^target_bits_count buckets
cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count / 2, stream); // scratch for one half-window reduction
cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count / 2, stream); // scratch for the other half-window reduction
if (source_bits_count > 0) {
for (unsigned j = 0; j < target_bits_count; j++) {
unsigned last_j = target_bits_count - 1;
unsigned nof_threads = (source_buckets_count >> (1 + j));
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, nof_threads);
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads);
}
}
if (target_bits_count == 1) {
nof_bms = bitsize;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms);
c = 1;
cudaFreeAsync(source_buckets, stream);
cudaFreeAsync(target_buckets, stream);
cudaFreeAsync(temp_buckets1, stream);
cudaFreeAsync(temp_buckets2, stream);
break;
}
cudaFreeAsync(source_buckets, stream);
cudaFreeAsync(temp_buckets1, stream);
cudaFreeAsync(temp_buckets2, stream);
source_buckets = target_buckets;
target_buckets = nullptr;
temp_buckets1 = nullptr;
temp_buckets2 = nullptr;
source_bits_count = target_bits_count;
odd_source_c = source_bits_count % 2;
source_windows_count = target_windows_count;
source_buckets_count = target_buckets_count;
}
}
// launch the double and add kernel, a single thread
final_accumulation_kernel<P, S>
<<<1, 1, 0, stream>>>(final_results, ones_results, on_device ? final_result : d_final_result, 1, nof_bms, c, true);
cudaFreeAsync(final_results, stream);
cudaStreamSynchronize(stream);
if (!on_device) cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
// free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_result, stream);
}
cudaFreeAsync(buckets, stream);
#ifndef PHASE1_TEST
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
#endif
cudaFreeAsync(sorted_bucket_sizes, stream);
cudaFreeAsync(sorted_bucket_offsets, stream);
cudaFreeAsync(sorted_single_bucket_indices, stream);
cudaFreeAsync(nof_large_buckets, stream);
cudaFreeAsync(max_res, stream);
if (large_buckets_to_compute > 0 && bucket_th > 0) cudaFreeAsync(large_buckets, stream);
cudaFreeAsync(ones_results, stream);
cudaStreamSynchronize(stream);
}
// this function computes multiple msms using the bucket method
template <typename S, typename P, typename A>
void batched_bucket_method_msm(
unsigned bitsize,
unsigned c,
S* scalars,
A* points,
unsigned batch_size,
unsigned msm_size,
P* final_results,
bool on_device,
cudaStream_t stream)
{
unsigned total_size = batch_size * msm_size;
S* d_scalars;
A* d_points;
if (!on_device) {
// copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream);
cudaMallocAsync(&d_points, sizeof(A) * total_size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice, stream);
} else {
d_scalars = scalars;
d_points = points;
}
P* buckets;
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms = (bitsize + c - 1) / c;
unsigned msm_log_size = ceil(log2(msm_size));
unsigned bm_bitsize = ceil(log2(nof_bms));
unsigned nof_buckets = (nof_bms << c);
unsigned total_nof_buckets = nof_buckets * batch_size;
cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream);
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);
unsigned* bucket_indices;
unsigned* point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
// split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, msm_log_size, nof_bms, bm_bitsize, c);
//+msm_size - leaving the first bm free for the out of place sort later
// sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to
// each bucket
unsigned* sorted_bucket_indices;
unsigned* sorted_point_indices;
cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_indices_temp_storage, stream);
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* total_nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream);
cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms * total_size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms * total_size, stream);
cudaFreeAsync(encode_temp_storage, stream);
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
unsigned h_nof_buckets_to_compute;
cudaMemcpyAsync(
&h_nof_buckets_to_compute, total_nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
// launch the accumulation kernel with maximum threads
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, d_points, nof_buckets,
h_nof_buckets_to_compute, c + bm_bitsize, c);
// #ifdef SSM_SUM
// //sum each bucket
// NUM_THREADS = 1 << 10;
// NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
// ssm_buckets_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS>>>(buckets, single_bucket_indices, nof_buckets, c);
// //sum each bucket module
// P* final_results;
// cudaMalloc(&final_results, sizeof(P) * nof_bms);
// NUM_THREADS = 1<<c;
// NUM_BLOCKS = nof_bms;
// sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(buckets, final_results);
// #endif
// #ifdef BIG_TRIANGLE
P* bm_sums;
cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream);
// launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (nof_bms * batch_size + NUM_THREADS - 1) / NUM_THREADS;
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bm_sums, nof_bms * batch_size, c);
// #endif
P* d_final_results;
if (!on_device) cudaMallocAsync(&d_final_results, sizeof(P) * batch_size, stream);
// launch the double and add kernel, a single thread for each msm
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bm_sums, bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c, false);
// copy final result to host
if (!on_device)
cudaMemcpyAsync(final_results, d_final_results, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream);
// free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_results, stream);
}
cudaFreeAsync(buckets, stream);
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(sorted_bucket_indices, stream);
cudaFreeAsync(sorted_point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(total_nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
cudaFreeAsync(bm_sums, stream);
cudaStreamSynchronize(stream);
}
// this kernel converts affine points to projective points
// each thread deals with a single point
template <typename P, typename A>
__global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) proj_points[tid] = P::from_affine(affine_points[tid]);
}
// this function computes msm using single scalar multiplication (ssm)
template <typename S, typename P, typename A>
void short_msm(S* h_scalars, A* h_points, unsigned size, P* h_final_result, cudaStream_t stream)
{ // works up to 2^8
S* scalars;
A* a_points;
P* p_points;
P* results;
cudaMallocAsync(&scalars, sizeof(S) * size, stream);
cudaMallocAsync(&a_points, sizeof(A) * size, stream);
cudaMallocAsync(&p_points, sizeof(P) * size, stream);
cudaMallocAsync(&results, sizeof(P) * size, stream);
// copy inputs to device
cudaMemcpyAsync(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
// convert to projective representation and multiply each point by its scalar using single scalar multiplication
unsigned NUM_THREADS = size;
to_proj_kernel<<<1, NUM_THREADS, 0, stream>>>(a_points, p_points, size);
ssm_kernel<<<1, NUM_THREADS, 0, stream>>>(scalars, p_points, results, size);
P* final_result;
cudaMallocAsync(&final_result, sizeof(P), stream);
// assuming msm size is a power of 2
// sum all the ssm results
NUM_THREADS = size;
sum_reduction_kernel<<<1, NUM_THREADS, 0, stream>>>(results, final_result);
// copy result to host
cudaStreamSynchronize(stream);
cudaMemcpyAsync(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
// free memory
cudaFreeAsync(scalars, stream);
cudaFreeAsync(a_points, stream);
cudaFreeAsync(p_points, stream);
cudaFreeAsync(results, stream);
cudaFreeAsync(final_result, stream);
}
// this function computes msm on the host using the naive method and prints the result
template <typename A, typename S, typename P>
void reference_msm(S* scalars, A* a_points, unsigned size)
{
P* points = new P[size];
// P points[size];
for (unsigned i = 0; i < size; i++) {
points[i] = P::from_affine(a_points[i]);
}
P res = P::zero();
for (unsigned i = 0; i < size; i++) {
res = res + scalars[i] * points[i];
}
std::cout << "reference results" << std::endl;
std::cout << P::to_affine(res) << std::endl;
delete[] points;
}
unsigned get_optimal_c(const unsigned size)
{
if (size < 17) return 1;
// return 17;
return ceil(log2(size)) - 4;
}
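// Illustrative note (not part of the original code): for example get_optimal_c(1 << 20) returns
// ceil(log2(1 << 20)) - 4 = 16. Only batched_large_msm below uses this heuristic; large_msm pins c = 16.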
// this function is used to compute msms of size larger than 256
template <typename S, typename P, typename A>
void large_msm(
S* scalars,
A* points,
unsigned size,
P* result,
bool on_device,
bool big_triangle,
unsigned large_bucket_factor,
cudaStream_t stream)
{
unsigned c = 16;
unsigned bitsize = S::NBITS;
bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, big_triangle, large_bucket_factor, stream);
}
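// Illustrative usage sketch (not part of the original code; scalar_t, affine_t and projective_t are
// placeholder names for whatever concrete scalar/affine/projective types the instantiating curve
// provides):
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   projective_t result;
//   // scalars/points may live on the host or the device; on_device tells the function which.
//   large_msm<scalar_t, projective_t, affine_t>(
//     scalars, points, msm_size, &result, /*on_device=*/false, /*big_triangle=*/false,
//     /*large_bucket_factor=*/10, stream);
//   cudaStreamSynchronize(stream);
//   cudaStreamDestroy(stream);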
// this function is used to compute batches of msms of size larger than 256 - currently isn't working on this branch
template <typename S, typename P, typename A>
void batched_large_msm(
S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream)
{
unsigned c = get_optimal_c(msm_size);
unsigned bitsize = S::NBITS;
batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream);
}
#endif