#ifndef MSM
#define MSM
#pragma once
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/error_handler.cuh"
#include "msm.cuh"
#include <cooperative_groups.h>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_run_length_encode.cuh>
#include <cub/device/device_scan.cuh>
#include <cuda.h>
#include <iostream>
#include <stdexcept>
#include <vector>
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define BIG_TRIANGLE
// #define SSM_SUM //WIP
template <typename P>
__global__ void single_stage_multi_reduction_kernel(
P* v,
P* v_r,
unsigned block_size,
unsigned write_stride,
unsigned write_phase,
unsigned padding,
unsigned num_of_threads)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
int jump = block_size / 2;
int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
int block_id = tid_p / jump;
int block_tid = tid_p % jump;
unsigned read_ind = block_size * block_id + block_tid;
unsigned write_ind = tid;
unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride : write_ind;
P v_r_value = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
: v[read_ind] + v[read_ind + jump];
v_r[v_r_key] = v_r_value;
}
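// Illustrative note (not part of the original code): with padding = 0 and write_stride = 0 this kernel
// performs one level of a pairwise tree reduction - thread i adds the two elements of its block that
// lie block_size/2 apart. Reducing an 8-element array in place would look like:
//
//   for (int s = 8 >> 1; s > 0; s >>= 1)                                 // s = 4, 2, 1
//     single_stage_multi_reduction_kernel<<<1, s, 0, stream>>>(v, v, s * 2, 0, 0, 0, s);
//
// after which v[0] holds the total. Non-zero write_stride/write_phase interleave the outputs of two
// half-size reductions, which is how the bucket-reduction loop further below splits each c-bit window
// into two narrower windows of (c + 1) / 2 bits each.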
// this kernel performs single scalar multiplication
// each thread multiplies a single scalar by a single point
template <typename P, typename S>
__global__ void ssm_kernel(S* scalars, P* points, P* results, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) results[tid] = scalars[tid] * points[tid];
}
// this kernel sums the elements of a vector within each thread block - one partial sum per block
template <typename P>
__global__ void sum_reduction_kernel(P* v, P* v_r)
{
unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
// Start at 1/2 block stride and divide by two each iteration
for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {
// Each thread does work unless it is further than the stride
if (threadIdx.x < s) { v[tid] = v[tid] + v[tid + s]; }
__syncthreads();
}
// Let the thread 0 for this block write the final result
if (threadIdx.x == 0) { v_r[blockIdx.x] = v[tid]; }
}
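// Illustrative note (not part of the original code): this reduction assumes blockDim.x is a power of two
// and that v holds at least gridDim.x * blockDim.x elements; v is modified in place and v_r receives one
// partial sum per block. A minimal sketch for a 256-element device array d_values:
//
//   sum_reduction_kernel<<<1, 256, 0, stream>>>(d_values, d_result);   // d_result[0] = sum of d_values
//
// In this file it is only launched with a single block (short_msm) or with one block per bucket module
// (the disabled SSM_SUM path), so no second reduction pass over v_r is needed.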
// this kernel initializes the buckets with zero points
// each thread initializes a different bucket
template <typename P>
__global__ void initialize_buckets_kernel(P* buckets, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) buckets[tid] = P::zero(); // zero point
}
// this kernel splits the scalars into digits of size c
// each thread splits a single scalar into nof_bms digits
template <typename S>
__global__ void split_scalars_kernel(
unsigned* buckets_indices,
unsigned* point_indices,
S* scalars,
unsigned total_size,
unsigned msm_log_size,
unsigned nof_bms,
unsigned bm_bitsize,
unsigned c)
{
constexpr unsigned sign_mask = 0x80000000;
// constexpr unsigned trash_bucket = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned bucket_index;
unsigned bucket_index2;
unsigned current_index;
unsigned msm_index = tid >> msm_log_size;
unsigned borrow = 0;
if (tid < total_size) {
S scalar = scalars[tid];
for (unsigned bm = 0; bm < nof_bms; bm++) {
bucket_index = scalar.get_scalar_digit(bm, c);
#ifdef SIGNED_DIG
bucket_index += borrow;
borrow = 0;
unsigned sign = 0;
if (bucket_index > (1 << (c - 1))) {
bucket_index = (1 << c) - bucket_index;
borrow = 1;
sign = sign_mask;
}
#endif
current_index = bm * total_size + tid;
#ifdef SIGNED_DIG
point_indices[current_index] = sign | tid; // the point index is saved for later
#else
buckets_indices[current_index] =
(msm_index << (c + bm_bitsize)) | (bm << c) |
bucket_index; // the bucket module number and the msm number are appended at the msbs
if (scalar == S::zero() || scalar == S::one() || bucket_index == 0)
buckets_indices[current_index] = 0; // will be skipped
point_indices[current_index] = tid; // the point index is saved for later
#endif
}
}
}
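// Illustrative note (not part of the original code): get_scalar_digit(bm, c) extracts bits
// [bm*c, (bm+1)*c) of the scalar, so every scalar contributes one c-bit digit per bucket module. For
// example, with c = 16 and bm_bitsize = 4, digit 0x1234 of window bm = 3 in msm 0 is encoded as
//
//   (0 << (16 + 4)) | (3 << 16) | 0x1234 == 0x31234
//
// which is the key used by the radix sort below: after sorting, all entries that fall into the same
// bucket of the same bucket module (and the same msm in the batched case) are contiguous. Entries whose
// digit is zero, or whose scalar is 0 or 1, get key 0, sort to the front and are skipped later.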
template <typename P, typename A, typename S>
__global__ void add_ones_kernel(A* points, S* scalars, P* results, const unsigned msm_size, const unsigned run_length)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (msm_size + run_length - 1) / run_length; // ceil(msm_size / run_length)
if (tid >= nof_threads) {
results[tid] = P::zero();
return;
}
const unsigned start_index = tid * run_length;
P sum = P::zero();
for (int i = start_index; i < min(start_index + run_length, msm_size); i++) {
if (scalars[i] == S::one()) sum = sum + points[i];
}
results[tid] = sum;
}
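// Illustrative note (not part of the original code): scalars equal to one are excluded from the bucket
// pipeline (split_scalars_kernel maps them to key 0), so their points are summed directly here instead.
// Each thread scans a run of run_length inputs and accumulates the points whose scalar is 1; the
// per-thread partial sums in `results` are then folded with single_stage_multi_reduction_kernel, e.g.
//
//   for (int s = nof_runs >> 1; s > 0; s >>= 1)
//     single_stage_multi_reduction_kernel<<<(s + MAX_TH - 1) / MAX_TH, min(MAX_TH, s), 0, stream>>>(
//       results, results, s * 2, 0, 0, 0, s);
//
// leaving the sum of all "ones" points in results[0] for final_accumulation_kernel.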
template <typename S>
__global__ void
find_cutoff_kernel(unsigned* v, unsigned size, unsigned cutoff, unsigned run_length, S* fake_param, unsigned* result)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned nof_threads = (size + run_length - 1) / run_length;
if (tid >= nof_threads) { return; }
const unsigned start_index = tid * run_length;
for (int i = start_index; i < min(start_index + run_length, size - 1); i++) {
if (v[i] > cutoff && v[i + 1] <= cutoff) {
result[0] = i + 1;
return;
}
}
if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; }
}
template <typename S>
__global__ void find_max_size(
unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, S* fake_param, unsigned* largest_bucket_size)
{
for (int i = 0;; i++) {
if (single_bucket_indices[i] & ((1 << c) - 1)) {
largest_bucket_size[0] = bucket_sizes[i];
largest_bucket_size[1] = i;
break;
}
}
}
// this kernel adds up the points in each bucket
// __global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets,
// unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__
// point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){
template <typename P, typename A>
__global__ void accumulate_buckets_kernel(
P* __restrict__ buckets,
unsigned* __restrict__ bucket_offsets,
unsigned* __restrict__ bucket_sizes,
unsigned* __restrict__ single_bucket_indices,
const unsigned* __restrict__ point_indices,
A* __restrict__ points,
const unsigned nof_buckets,
const unsigned nof_buckets_to_compute,
const unsigned msm_idx_shift,
const unsigned c)
{
constexpr unsigned sign_mask = 0x80000000;
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets_to_compute) return;
if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) {
return; // skip zero buckets
}
#ifdef SIGNED_DIG // todo - fix
const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c;
const unsigned bucket_index =
msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1));
#else
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
#endif
const unsigned bucket_offset = bucket_offsets[tid];
const unsigned bucket_size = bucket_sizes[tid];
P bucket; // get rid of init buckets? no.. because what about buckets with no points
for (unsigned i = 0; i < bucket_size;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
#ifdef SIGNED_DIG
unsigned sign = point_ind & sign_mask;
point_ind &= ~sign_mask;
A point = points[point_ind];
if (sign) point = A::neg(point);
#else
A point = points[point_ind];
#endif
bucket = i ? bucket + point : P::from_affine(point);
}
buckets[bucket_index] = bucket;
}
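// Illustrative note (not part of the original code): with msm_idx_shift = c + bm_bitsize, the sort key
// 0x31234 from the example above decodes to msm_index = 0 and a flat bucket index of
// 0 * nof_buckets + ((3 << 16) | 0x1234), i.e. bucket 0x1234 of bucket module 3. Each thread owns one
// occupied bucket: it walks its run of sorted point indices and accumulates the corresponding points,
// while empty buckets keep the zero point written by initialize_buckets_kernel.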
template <typename P, typename A>
__global__ void accumulate_large_buckets_kernel(
P* __restrict__ buckets,
unsigned* __restrict__ bucket_offsets,
unsigned* __restrict__ bucket_sizes,
unsigned* __restrict__ single_bucket_indices,
const unsigned* __restrict__ point_indices,
A* __restrict__ points,
const unsigned nof_buckets,
const unsigned nof_buckets_to_compute,
const unsigned msm_idx_shift,
const unsigned c,
const unsigned threads_per_bucket,
const unsigned max_run_length)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
unsigned large_bucket_index = tid / threads_per_bucket;
unsigned bucket_segment_index = tid % threads_per_bucket;
if (tid >= nof_buckets_to_compute * threads_per_bucket) { return; }
if ((single_bucket_indices[large_bucket_index] & ((1 << c) - 1)) == 0) { // don't need
return; // skip zero buckets
}
unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index;
const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index * max_run_length;
const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index * max_run_length
? bucket_sizes[large_bucket_index] - bucket_segment_index * max_run_length
: 0;
P bucket;
unsigned run_length = min(bucket_size, max_run_length);
for (unsigned i = 0; i < run_length;
i++) { // add the relevant points starting from the relevant offset up to the bucket size
unsigned point_ind = point_indices[bucket_offset + i];
A point = points[point_ind];
bucket = i ? bucket + point : P::from_affine(point); // init empty buckets
}
buckets[write_bucket_index] = run_length ? bucket : P::zero();
}
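// Illustrative note (not part of the original code): a "large" bucket (one whose size exceeds the
// cutoff chosen in bucket_method_msm) is split across threads_per_bucket threads, each summing at most
// max_run_length of its points. The partial sums are stored segment-major,
//
//   write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index
//
// so the pairwise reduction loop that follows folds all segments of every bucket together and leaves
// the final sums in the first nof_buckets_to_compute slots, ready for distribute_large_buckets_kernel.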
template <typename P>
__global__ void
distribute_large_buckets_kernel(P* large_buckets, P* buckets, unsigned* single_bucket_indices, unsigned size)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= size) { return; }
buckets[single_bucket_indices[tid]] = large_buckets[tid];
}
// this kernel computes the weighted sum (sum of index * bucket) of each bucket module
// each thread deals with a single bucket module
template <typename P>
__global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_bms) return;
#ifdef SIGNED_DIG
unsigned buckets_in_bm = (1 << c) + 1;
#else
unsigned buckets_in_bm = (1 << c);
#endif
P line_sum = buckets[(tid + 1) * buckets_in_bm - 1];
final_sums[tid] = line_sum;
for (unsigned i = buckets_in_bm - 2; i > 0; i--) {
line_sum = line_sum + buckets[tid * buckets_in_bm + i]; // using the running sum method
final_sums[tid] = final_sums[tid] + line_sum;
}
}
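// Illustrative note (not part of the original code): the running sum relies on the identity
//
//   sum_{i=1}^{n-1} i * B_i  =  sum_{k=1}^{n-1} (B_k + B_{k+1} + ... + B_{n-1}),   n = buckets_in_bm
//
// line_sum holds the suffix sum B_i + ... + B_{n-1} as i walks down from n-1 to 1, and final_sums[tid]
// accumulates these suffix sums, so every bucket ends up weighted by its index using only point
// additions - no per-bucket scalar multiplications are needed.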
// this kernel uses single scalar multiplication to multiply each bucket by its index
// each thread deals with a single bucket
template <typename P, typename S>
__global__ void ssm_buckets_kernel(P* buckets, unsigned* single_bucket_indices, unsigned nof_buckets, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_buckets) return;
unsigned bucket_index = single_bucket_indices[tid];
S scalar_bucket_multiplier;
scalar_bucket_multiplier = {
bucket_index & ((1 << c) - 1), 0, 0, 0, 0, 0, 0, 0}; // the index without the bucket module index
buckets[bucket_index] = scalar_bucket_multiplier * buckets[bucket_index];
}
template <typename P>
__global__ void last_pass_kernel(P* final_buckets, P* final_sums, unsigned num_sums)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= num_sums) return;
final_sums[tid] = final_buckets[2 * tid + 1];
}
// this kernel computes the final result using the double and add algorithm
// each msm is processed by a single thread
template <typename P, typename S>
__global__ void final_accumulation_kernel(
P* final_sums, P* ones_result, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c, bool add_ones)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_msms) return;
P final_result = P::zero();
for (unsigned i = nof_bms; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
for (unsigned j = 0; j < c; j++) // double
{
final_result = final_result + final_result;
}
}
if (add_ones)
final_results[tid] = final_result + final_sums[tid * nof_bms] + ones_result[0];
else
final_results[tid] = final_result + final_sums[tid * nof_bms];
}
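// Illustrative note (not part of the original code): this is Horner's rule over the bucket-module sums,
//
//   result = ((S_{m-1} * 2^c + S_{m-2}) * 2^c + ...) * 2^c + S_0 = sum_{w=0}^{m-1} S_w * 2^(c*w)
//
// with m = nof_bms: each iteration adds the next most significant window sum and then doubles c times,
// so only c * (m - 1) doublings and m point additions are needed per msm (plus the separately
// accumulated sum of points whose scalar is one, when add_ones is set).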
// this function computes msm using the bucket method
template <typename S, typename P, typename A>
void bucket_method_msm(
unsigned bitsize,
unsigned c,
S* scalars,
A* points,
unsigned size,
P* final_result,
bool on_device,
bool big_triangle,
unsigned large_bucket_factor,
cudaStream_t stream)
{
S* d_scalars;
A* d_points;
if (!on_device) {
// copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * size, stream);
cudaMallocAsync(&d_points, sizeof(A) * size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
} else {
d_scalars = scalars;
d_points = points;
}
P* buckets;
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms = (bitsize + c - 1) / c;
unsigned msm_log_size = ceil(log2(size));
unsigned bm_bitsize = ceil(log2(nof_bms));
#ifdef SIGNED_DIG
unsigned nof_buckets = nof_bms * ((1 << (c - 1)) + 1); // signed digits
#else
unsigned nof_buckets = nof_bms << c;
#endif
cudaMallocAsync(&buckets, sizeof(P) * nof_buckets, stream);
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, nof_buckets);
// accumulate ones
P* ones_results; // TODO: handle sizes that are not a whole multiple of the run length (also in the last run inside the kernel)
const unsigned nof_runs = msm_log_size > 10 ? (1 << (msm_log_size - 6)) : 16;
const unsigned run_length = (size + nof_runs - 1) / nof_runs;
cudaMallocAsync(&ones_results, sizeof(P) * nof_runs, stream);
NUM_THREADS = min(1 << 8, nof_runs);
NUM_BLOCKS = (nof_runs + NUM_THREADS - 1) / NUM_THREADS;
add_ones_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_points, d_scalars, ones_results, size, run_length);
for (int s = nof_runs >> 1; s > 0; s >>= 1) {
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
ones_results, ones_results, s * 2, 0, 0, 0, s);
}
unsigned* bucket_indices;
unsigned* point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * size * (nof_bms + 1), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * size * (nof_bms + 1), stream);
// split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (size * (nof_bms + 1) + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, nof_bms, bm_bitsize, c);
//+size - leaving the first bm free for the out of place sort later
// sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to
// each bucket
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + size, bucket_indices,
point_indices + size, point_indices, size, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
for (unsigned i = 0; i < nof_bms; i++) {
unsigned offset_out = i * size;
unsigned offset_in = offset_out + size;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in,
bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, size, 0, sizeof(unsigned) * 8,
stream);
}
cudaFreeAsync(sort_indices_temp_storage, stream);
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * nof_buckets, stream);
cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms * size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
nof_buckets_to_compute, nof_bms * size, stream);
cudaFreeAsync(encode_temp_storage, stream);
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
// sort by bucket sizes
unsigned h_nof_buckets_to_compute;
cudaMemcpyAsync(&h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
// if there are no buckets to compute, the result is the zero point
if (h_nof_buckets_to_compute == 0) {
if (!on_device)
final_result[0] = P::zero();
else {
P* h_final_result = (P*)malloc(sizeof(P));
h_final_result[0] = P::zero();
cudaMemcpyAsync(final_result, h_final_result, sizeof(P), cudaMemcpyHostToDevice, stream);
}
return;
}
unsigned* sorted_bucket_sizes;
cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sorted_bucket_offsets;
cudaMallocAsync(&sorted_bucket_offsets, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sort_offsets_temp_storage{};
size_t sort_offsets_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_offsets_temp_storage, stream);
unsigned* sorted_single_bucket_indices;
cudaMallocAsync(&sorted_single_bucket_indices, sizeof(unsigned) * h_nof_buckets_to_compute, stream);
unsigned* sort_single_temp_storage{};
size_t sort_single_temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, single_bucket_indices,
sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream);
cub::DeviceRadixSort::SortPairsDescending(
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, single_bucket_indices,
sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_single_temp_storage, stream);
// find large buckets
unsigned average_size = size / (1 << c);
unsigned bucket_th = large_bucket_factor * average_size;
unsigned* nof_large_buckets;
cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream);
cudaMemsetAsync(nof_large_buckets, 0, sizeof(unsigned), stream);
unsigned TOTAL_THREADS = 129000; // todo - device dependent
unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS);
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length;
NUM_THREADS = min(1 << 5, cutoff_nof_runs);
NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
find_cutoff_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, d_scalars, nof_large_buckets);
unsigned h_nof_large_buckets;
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
unsigned* max_res;
cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream);
find_max_size<<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, d_scalars, max_res);
unsigned h_max_res[2];
cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream);
unsigned h_largest_bucket_size = h_max_res[0];
unsigned h_nof_zero_large_buckets = h_max_res[1];
unsigned large_buckets_to_compute =
h_nof_large_buckets > h_nof_zero_large_buckets ? h_nof_large_buckets - h_nof_zero_large_buckets : 0;
cudaStream_t stream2;
cudaStreamCreate(&stream2);
P* large_buckets;
if (large_buckets_to_compute > 0 && bucket_th > 0) {
unsigned threads_per_bucket =
1 << (unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); // global param
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
unsigned total_large_buckets_size = large_buckets_to_compute * threads_per_bucket;
cudaMallocAsync(&large_buckets, sizeof(P) * total_large_buckets_size, stream);
NUM_THREADS = min(1 << 8, total_large_buckets_size);
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, sorted_bucket_offsets + h_nof_zero_large_buckets, sorted_bucket_sizes + h_nof_zero_large_buckets,
sorted_single_bucket_indices + h_nof_zero_large_buckets, point_indices, d_points, nof_buckets,
large_buckets_to_compute, c + bm_bitsize, c, threads_per_bucket, max_bucket_size_run_length);
// reduce
for (int s = total_large_buckets_size >> 1; s > large_buckets_to_compute - 1; s >>= 1) {
NUM_THREADS = min(MAX_TH, s);
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, large_buckets, s * 2, 0, 0, 0, s);
CHECK_LAST_CUDA_ERROR();
}
// distribute
NUM_THREADS = min(MAX_TH, large_buckets_to_compute);
NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS;
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream2>>>(
large_buckets, buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, large_buckets_to_compute);
} else {
h_nof_large_buckets = 0;
}
// launch the accumulation kernel with maximum threads
if (h_nof_buckets_to_compute > h_nof_large_buckets) {
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets,
sorted_single_bucket_indices + h_nof_large_buckets, point_indices, d_points, nof_buckets,
h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
}
// all the large buckets need to be accumulated before the final summation
cudaStreamSynchronize(stream2);
cudaStreamDestroy(stream2);
#ifdef SSM_SUM
// sum each bucket
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
ssm_buckets_kernel<fake_point, fake_scalar>
<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, single_bucket_indices, nof_buckets, c);
// sum each bucket module
P* final_results;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 1 << c;
NUM_BLOCKS = nof_bms;
sum_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results);
#endif
P* d_final_result;
if (!on_device) cudaMallocAsync(&d_final_result, sizeof(P), stream);
P* final_results;
if (big_triangle) {
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
// launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = nof_bms;
NUM_BLOCKS = 1;
#ifdef SIGNED_DIG
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, final_results, nof_bms, c - 1); // signed digits
#else
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms, c);
#endif
} else {
unsigned source_bits_count = c;
bool odd_source_c = source_bits_count % 2;
unsigned source_windows_count = nof_bms;
unsigned source_buckets_count = nof_buckets;
P* source_buckets = buckets;
buckets = nullptr;
P* target_buckets;
P* temp_buckets1;
P* temp_buckets2;
for (unsigned i = 0;; i++) {
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
const unsigned target_windows_count = source_windows_count << 1; // nof bms*2 = 32
const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count, stream); // target_windows_count * 2^target_bits_count buckets
cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count / 2, stream); // scratch for one half-window reduction
cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count / 2, stream); // scratch for the other half-window reduction
if (source_bits_count > 0) {
for (unsigned j = 0; j < target_bits_count; j++) {
unsigned last_j = target_bits_count - 1;
unsigned nof_threads = (source_buckets_count >> (1 + j));
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets1, j == target_bits_count - 1 ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 0, 0, nof_threads);
NUM_THREADS = min(MAX_TH, nof_threads);
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
j == 0 ? source_buckets : temp_buckets2, j == target_bits_count - 1 ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), j == target_bits_count - 1 ? 1 << target_bits_count : 0, 1, 0, nof_threads);
}
}
if (target_bits_count == 1) {
nof_bms = bitsize;
cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream);
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms);
c = 1;
cudaFreeAsync(source_buckets, stream);
cudaFreeAsync(target_buckets, stream);
cudaFreeAsync(temp_buckets1, stream);
cudaFreeAsync(temp_buckets2, stream);
break;
}
cudaFreeAsync(source_buckets, stream);
cudaFreeAsync(temp_buckets1, stream);
cudaFreeAsync(temp_buckets2, stream);
source_buckets = target_buckets;
target_buckets = nullptr;
temp_buckets1 = nullptr;
temp_buckets2 = nullptr;
source_bits_count = target_bits_count;
odd_source_c = source_bits_count % 2;
source_windows_count = target_windows_count;
source_buckets_count = target_buckets_count;
}
}
// launch the double and add kernel, a single thread
final_accumulation_kernel<P, S>
<<<1, 1, 0, stream>>>(final_results, ones_results, on_device ? final_result : d_final_result, 1, nof_bms, c, true);
cudaFreeAsync(final_results, stream);
cudaStreamSynchronize(stream);
if (!on_device) cudaMemcpyAsync(final_result, d_final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
// free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_result, stream);
}
cudaFreeAsync(buckets, stream);
#ifndef PHASE1_TEST
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
#endif
cudaFreeAsync(sorted_bucket_sizes, stream);
cudaFreeAsync(sorted_bucket_offsets, stream);
cudaFreeAsync(sorted_single_bucket_indices, stream);
cudaFreeAsync(nof_large_buckets, stream);
cudaFreeAsync(max_res, stream);
if (large_buckets_to_compute > 0 && bucket_th > 0) cudaFreeAsync(large_buckets, stream);
cudaFreeAsync(ones_results, stream);
cudaStreamSynchronize(stream);
}
// this function computes multiple msms using the bucket method
template <typename S, typename P, typename A>
void batched_bucket_method_msm(
unsigned bitsize,
unsigned c,
S* scalars,
A* points,
unsigned batch_size,
unsigned msm_size,
P* final_results,
bool on_device,
cudaStream_t stream)
{
unsigned total_size = batch_size * msm_size;
S* d_scalars;
A* d_points;
if (!on_device) {
// copy scalars and points to gpu
cudaMallocAsync(&d_scalars, sizeof(S) * total_size, stream);
cudaMallocAsync(&d_points, sizeof(A) * total_size, stream);
cudaMemcpyAsync(d_scalars, scalars, sizeof(S) * total_size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_points, points, sizeof(A) * total_size, cudaMemcpyHostToDevice, stream);
} else {
d_scalars = scalars;
d_points = points;
}
P* buckets;
// compute number of bucket modules and number of buckets in each module
unsigned nof_bms = (bitsize + c - 1) / c;
unsigned msm_log_size = ceil(log2(msm_size));
unsigned bm_bitsize = ceil(log2(nof_bms));
unsigned nof_buckets = (nof_bms << c);
unsigned total_nof_buckets = nof_buckets * batch_size;
cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream);
// launch the bucket initialization kernel with maximum threads
unsigned NUM_THREADS = 1 << 10;
unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);
unsigned* bucket_indices;
unsigned* point_indices;
cudaMallocAsync(&bucket_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
cudaMallocAsync(&point_indices, sizeof(unsigned) * (total_size * nof_bms + msm_size), stream);
// split scalars into digits
NUM_THREADS = 1 << 10;
NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS;
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, msm_log_size, nof_bms, bm_bitsize, c);
//+msm_size - leaving the first bm free for the out of place sort later
// sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to
// each bucket
unsigned* sorted_bucket_indices;
unsigned* sorted_point_indices;
cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * (total_size * nof_bms), stream);
unsigned* sort_indices_temp_storage{};
size_t sort_indices_temp_storage_bytes;
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream);
cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream);
// The second to last parameter is the default value supplied explicitly to allow passing the stream
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for more
// info
cub::DeviceRadixSort::SortPairs(
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + msm_size, sorted_bucket_indices,
point_indices + msm_size, sorted_point_indices, total_size * nof_bms, 0, sizeof(unsigned) * 8, stream);
cudaFreeAsync(sort_indices_temp_storage, stream);
// find bucket_sizes
unsigned* single_bucket_indices;
unsigned* bucket_sizes;
unsigned* total_nof_buckets_to_compute;
cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream);
cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream);
cudaMallocAsync(&total_nof_buckets_to_compute, sizeof(unsigned), stream);
unsigned* encode_temp_storage{};
size_t encode_temp_storage_bytes = 0;
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms * total_size, stream);
cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream);
cub::DeviceRunLengthEncode::Encode(
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
total_nof_buckets_to_compute, nof_bms * total_size, stream);
cudaFreeAsync(encode_temp_storage, stream);
// get offsets - where does each new bucket begin
unsigned* bucket_offsets;
cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream);
unsigned* offsets_temp_storage{};
size_t offsets_temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream);
cub::DeviceScan::ExclusiveSum(
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream);
cudaFreeAsync(offsets_temp_storage, stream);
unsigned h_nof_buckets_to_compute;
cudaMemcpyAsync(
&h_nof_buckets_to_compute, total_nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream);
// launch the accumulation kernel with maximum threads
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, d_points, nof_buckets,
h_nof_buckets_to_compute, c + bm_bitsize, c);
// #ifdef SSM_SUM
// //sum each bucket
// NUM_THREADS = 1 << 10;
// NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
// ssm_buckets_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS>>>(buckets, single_bucket_indices, nof_buckets, c);
// //sum each bucket module
// P* final_results;
// cudaMalloc(&final_results, sizeof(P) * nof_bms);
// NUM_THREADS = 1<<c;
// NUM_BLOCKS = nof_bms;
// sum_reduction_kernel<<<NUM_BLOCKS,NUM_THREADS>>>(buckets, final_results);
// #endif
// #ifdef BIG_TRIANGLE
P* bm_sums;
cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream);
// launch the bucket module sum kernel - a thread for each bucket module
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (nof_bms * batch_size + NUM_THREADS - 1) / NUM_THREADS;
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, bm_sums, nof_bms * batch_size, c);
// #endif
P* d_final_results;
if (!on_device) cudaMallocAsync(&d_final_results, sizeof(P) * batch_size, stream);
// launch the double and add kernel, a single thread for each msm
NUM_THREADS = 1 << 8;
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
bm_sums, bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c, false);
// copy final result to host
if (!on_device)
cudaMemcpyAsync(final_results, d_final_results, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream);
// free memory
if (!on_device) {
cudaFreeAsync(d_points, stream);
cudaFreeAsync(d_scalars, stream);
cudaFreeAsync(d_final_results, stream);
}
cudaFreeAsync(buckets, stream);
cudaFreeAsync(bucket_indices, stream);
cudaFreeAsync(point_indices, stream);
cudaFreeAsync(sorted_bucket_indices, stream);
cudaFreeAsync(sorted_point_indices, stream);
cudaFreeAsync(single_bucket_indices, stream);
cudaFreeAsync(bucket_sizes, stream);
cudaFreeAsync(total_nof_buckets_to_compute, stream);
cudaFreeAsync(bucket_offsets, stream);
cudaFreeAsync(bm_sums, stream);
cudaStreamSynchronize(stream);
}
// this kernel converts affine points to projective points
// each thread deals with a single point
template <typename P, typename A>
__global__ void to_proj_kernel(A* affine_points, P* proj_points, unsigned N)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < N) proj_points[tid] = P::from_affine(affine_points[tid]);
}
// this function computes msm using single scalar multiplication (ssm)
template <typename S, typename P, typename A>
void short_msm(S* h_scalars, A* h_points, unsigned size, P* h_final_result, cudaStream_t stream)
{ // works up to 2^8
S* scalars;
A* a_points;
P* p_points;
P* results;
cudaMallocAsync(&scalars, sizeof(S) * size, stream);
cudaMallocAsync(&a_points, sizeof(A) * size, stream);
cudaMallocAsync(&p_points, sizeof(P) * size, stream);
cudaMallocAsync(&results, sizeof(P) * size, stream);
// copy inputs to device
cudaMemcpyAsync(scalars, h_scalars, sizeof(S) * size, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(a_points, h_points, sizeof(A) * size, cudaMemcpyHostToDevice, stream);
// convert to projective representation and multiply each point by its scalar using single scalar multiplication
unsigned NUM_THREADS = size;
to_proj_kernel<<<1, NUM_THREADS, 0, stream>>>(a_points, p_points, size);
ssm_kernel<<<1, NUM_THREADS, 0, stream>>>(scalars, p_points, results, size);
P* final_result;
cudaMallocAsync(&final_result, sizeof(P), stream);
// assuming msm size is a power of 2
// sum all the ssm results
NUM_THREADS = size;
sum_reduction_kernel<<<1, NUM_THREADS, 0, stream>>>(results, final_result);
// copy result to host
cudaStreamSynchronize(stream);
cudaMemcpyAsync(h_final_result, final_result, sizeof(P), cudaMemcpyDeviceToHost, stream);
// free memory
cudaFreeAsync(scalars, stream);
cudaFreeAsync(a_points, stream);
cudaFreeAsync(p_points, stream);
cudaFreeAsync(results, stream);
cudaFreeAsync(final_result, stream);
}
// this function computes msm on the host using the naive method and prints the result
template <typename A, typename S, typename P>
void reference_msm(S* scalars, A* a_points, unsigned size)
{
P* points = new P[size];
// P points[size];
for (unsigned i = 0; i < size; i++) {
points[i] = P::from_affine(a_points[i]);
}
P res = P::zero();
for (unsigned i = 0; i < size; i++) {
res = res + scalars[i] * points[i];
}
std::cout << "reference results" << std::endl;
std::cout << P::to_affine(res) << std::endl;
delete[] points;
}
unsigned get_optimal_c(const unsigned size)
{
if (size < 17) return 1;
// return 17;
return ceil(log2(size)) - 4;
}
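// Illustrative note (not part of the original code): for example get_optimal_c(1 << 20) returns
// ceil(log2(1 << 20)) - 4 = 16. Only batched_large_msm below uses this heuristic; large_msm pins c = 16.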
// this function is used to compute msms of size larger than 256
template <typename S, typename P, typename A>
void large_msm(
S* scalars,
A* points,
unsigned size,
P* result,
bool on_device,
bool big_triangle,
unsigned large_bucket_factor,
cudaStream_t stream)
{
unsigned c = 16;
unsigned bitsize = S::NBITS;
bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, big_triangle, large_bucket_factor, stream);
}
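// Illustrative usage sketch (not part of the original code; scalar_t, affine_t and projective_t are
// placeholder names for whatever concrete scalar/affine/projective types the instantiating curve
// provides):
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   projective_t result;
//   // scalars/points may live on the host or the device; on_device tells the function which.
//   large_msm<scalar_t, projective_t, affine_t>(
//     scalars, points, msm_size, &result, /*on_device=*/false, /*big_triangle=*/false,
//     /*large_bucket_factor=*/10, stream);
//   cudaStreamSynchronize(stream);
//   cudaStreamDestroy(stream);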
// this function is used to compute batches of msms of size larger than 256 - currently isn't working on this branch
template <typename S, typename P, typename A>
void batched_large_msm(
S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream)
{
unsigned c = get_optimal_c(msm_size);
unsigned bitsize = S::NBITS;
batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream);
}
#endif