@@ -30,6 +30,53 @@ namespace msm {
|
|
|
|
|
|
|
|
|
|
unsigned get_optimal_c(int bitsize) { return max((unsigned)ceil(log2(bitsize)) - 4, 1U); }
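// Illustrative only (values assumed, not taken from any caller): for a 255-bit scalar
// field, ceil(log2(255)) = 8, so get_optimal_c returns max(8 - 4, 1) = 4; for
// bitsize = 12, ceil(log2(12)) = 4 and the result clamps to 1. The subtraction is done
// in unsigned arithmetic, so this helper assumes ceil(log2(bitsize)) >= 4 (otherwise
// the value would wrap before the max).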
|
|
|
|
|
|
|
|
|
|
template <typename E>
|
|
|
|
|
__global__ void normalize_kernel(E* inout, E factor, int n)
|
|
|
|
|
{
|
|
|
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (tid < n) inout[tid] = (inout[tid] + factor - 1) / factor;
|
|
|
|
|
}
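// A minimal usage sketch (illustrative values): normalize_kernel performs an in-place
// ceiling division, inout[i] = ceil(inout[i] / factor). For example, with
// inout[i] = 10 and factor = 4, (10 + 4 - 1) / 4 = 3. It is used below to shrink the
// per-bucket partial-sum counts after each large-bucket reduction pass.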
|
|
|
|
|
|
|
|
|
|
// a kernel that writes to bucket_indices, which enables large-bucket accumulation to happen afterwards.
|
|
|
|
|
// specifically, we map each thread index to the bucket that thread will handle during accumulation.
|
|
|
|
|
template <typename P>
|
|
|
|
|
__global__ void initialize_large_bucket_indices(
|
|
|
|
|
unsigned* sorted_bucket_sizes_sum,
|
|
|
|
|
unsigned nof_pts_per_thread,
|
|
|
|
|
unsigned nof_large_buckets,
|
|
|
|
|
// log_nof_large_buckets should be equal to ceil(log2(nof_large_buckets))
|
|
|
|
|
unsigned log_nof_large_buckets,
|
|
|
|
|
unsigned* bucket_indices)
|
|
|
|
|
{
|
|
|
|
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (tid >= nof_large_buckets) { return; }
|
|
|
|
|
unsigned start = (sorted_bucket_sizes_sum[tid] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid;
|
|
|
|
|
unsigned end = (sorted_bucket_sizes_sum[tid + 1] + nof_pts_per_thread - 1) / nof_pts_per_thread + tid + 1;
|
|
|
|
|
for (unsigned i = start; i < end; i++) {
|
|
|
|
|
// this just concatenates two pieces of data - the large bucket index (low bits) and (i - start) (high bits)
|
|
|
|
|
bucket_indices[i] = tid | ((i - start) << log_nof_large_buckets);
|
|
|
|
|
}
|
|
|
|
|
}
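// Worked example (assumed values): with nof_pts_per_thread = 8, log_nof_large_buckets = 4,
// sorted_bucket_sizes_sum[3] = 20 and sorted_bucket_sizes_sum[4] = 44, thread tid = 3 gets
// start = ceil(20/8) + 3 = 6 and end = ceil(44/8) + 4 = 10, so it writes
// bucket_indices[6..9] = {3, 3 | (1 << 4), 3 | (2 << 4), 3 | (3 << 4)} = {3, 19, 35, 51}.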
|
|
|
|
|
|
|
|
|
|
// this kernel performs a single step of reduction across buckets whose sizes
|
|
|
|
|
// are given by the bucket_sizes pointer
|
|
|
|
|
template <typename P>
|
|
|
|
|
__global__ void sum_reduction_variable_size_kernel(
|
|
|
|
|
P* v,
|
|
|
|
|
unsigned* bucket_sizes_sum,
|
|
|
|
|
unsigned* bucket_sizes,
|
|
|
|
|
unsigned* large_bucket_thread_indices,
|
|
|
|
|
unsigned num_of_threads)
|
|
|
|
|
{
|
|
|
|
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (tid >= num_of_threads) { return; }
|
|
|
|
|
|
|
|
|
|
unsigned large_bucket_tid = large_bucket_thread_indices[tid];
|
|
|
|
|
unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
|
|
|
|
|
unsigned large_bucket_size = bucket_sizes[large_bucket_tid];
|
|
|
|
|
if (segment_ind < (large_bucket_size >> 1)) { v[tid] = v[tid] + v[tid + ((large_bucket_size + 1) >> 1)]; }
|
|
|
|
|
}
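// Worked example (assumed sizes): for a large bucket whose current size is 5, the threads
// with segment_ind 0 and 1 (segment_ind < 5 >> 1 = 2) each add the partial sum located
// (5 + 1) >> 1 = 3 slots ahead, leaving ceil(5 / 2) = 3 partial sums for the next pass;
// normalize_kernel then updates the recorded size from 5 to 3.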
|
|
|
|
|
|
|
|
|
|
template <typename P>
|
|
|
|
|
__global__ void single_stage_multi_reduction_kernel(
|
|
|
|
|
const P* v,
|
|
|
|
|
@@ -37,23 +84,27 @@ namespace msm {
|
|
|
|
|
unsigned block_size,
|
|
|
|
|
unsigned write_stride,
|
|
|
|
|
unsigned write_phase,
|
|
|
|
|
unsigned padding,
|
|
|
|
|
unsigned step,
|
|
|
|
|
unsigned num_of_threads)
|
|
|
|
|
{
|
|
|
|
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (tid >= num_of_threads) { return; }
|
|
|
|
|
|
|
|
|
|
// we need shifted tid because we don't want to be reducing into zero buckets; this allows us to skip them.
|
|
|
|
|
// for write_phase==1, the read pattern is different so we don't skip over anything.
|
|
|
|
|
const int shifted_tid = write_phase ? tid : tid + (tid + step) / step;
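// Illustrative mapping (assumed step = 3, write_phase = 0): tids 0, 1, 2, 3, 4 map to
// shifted_tids 1, 2, 3, 5, 6 - index 0 and every (step + 1)-th index after it
// (0, 4, 8, ...) is skipped, which is where the zero buckets are expected to sit.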
|
|
|
|
|
const int jump = block_size / 2;
|
|
|
|
|
const int tid_p = padding ? (tid / (2 * padding)) * padding + tid % padding : tid;
|
|
|
|
|
const int block_id = tid_p / jump;
|
|
|
|
|
const int block_tid = tid_p % jump;
|
|
|
|
|
const unsigned read_ind = block_size * block_id + block_tid;
|
|
|
|
|
const unsigned write_ind = tid;
|
|
|
|
|
const int block_id = shifted_tid / jump;
|
|
|
|
|
// here the reason for shifting is the same as for shifted_tid, but we skip over entire blocks, which happens
|
|
|
|
|
// only for write_phase=1 because of its read pattern.
|
|
|
|
|
const int shifted_block_id = write_phase ? block_id + (block_id + step) / step : block_id;
|
|
|
|
|
const int block_tid = shifted_tid % jump;
|
|
|
|
|
const unsigned read_ind = block_size * shifted_block_id + block_tid;
|
|
|
|
|
const unsigned write_ind = jump * shifted_block_id + block_tid;
|
|
|
|
|
const unsigned v_r_key =
|
|
|
|
|
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
|
|
|
|
|
: write_ind;
|
|
|
|
|
v_r[v_r_key] = padding ? (tid % (2 * padding) < padding) ? v[read_ind] + v[read_ind + jump] : P::zero()
|
|
|
|
|
: v[read_ind] + v[read_ind + jump];
|
|
|
|
|
v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
|
|
|
|
|
}
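// Worked example of the strided write (assumed write_stride = 4, write_ind = 6): with
// write_phase = 0 the result lands at ((6 / 4) * 2 + 0) * 4 + 6 % 4 = 10, and with
// write_phase = 1 at ((6 / 4) * 2 + 1) * 4 + 2 = 14, so the two phases interleave their
// outputs in alternating blocks of write_stride elements.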
|
|
|
|
|
|
|
|
|
|
// this kernel performs single scalar multiplication
|
|
|
|
|
@@ -135,8 +186,8 @@ namespace msm {
|
|
|
|
|
buckets_indices[current_index] =
|
|
|
|
|
(msm_index << (c + bm_bitsize)) | (bm << c) |
|
|
|
|
|
bucket_index; // the bucket module number and the msm number are appended at the msbs
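// Illustrative bit layout (assumed c = 16, bm_bitsize = 4): for msm_index = 1, bm = 3 and
// bucket_index = 0x1234, the packed value is (1 << 20) | (3 << 16) | 0x1234 = 0x131234 -
// bucket index in the low c bits, bucket module in the next bm_bitsize bits, and the msm
// index above that.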
|
|
|
|
|
if (scalar == S::zero() || bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped
|
|
|
|
|
point_indices[current_index] = tid % points_size; // the point index is saved for later
|
|
|
|
|
if (bucket_index == 0) buckets_indices[current_index] = 0; // will be skipped
|
|
|
|
|
point_indices[current_index] = tid % points_size; // the point index is saved for later
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -158,19 +209,6 @@ namespace msm {
|
|
|
|
|
if (tid == 0 && v[size - 1] > cutoff) { result[0] = size; }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename S>
|
|
|
|
|
__global__ void
|
|
|
|
|
find_max_size(unsigned* bucket_sizes, unsigned* single_bucket_indices, unsigned c, unsigned* largest_bucket_size)
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0;; i++) {
|
|
|
|
|
if (single_bucket_indices[i] & ((1 << c) - 1)) {
|
|
|
|
|
largest_bucket_size[0] = bucket_sizes[i];
|
|
|
|
|
largest_bucket_size[1] = i;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// this kernel adds up the points in each bucket
|
|
|
|
|
template <typename P, typename A>
|
|
|
|
|
__global__ void accumulate_buckets_kernel(
|
|
|
|
|
@@ -188,9 +226,6 @@ namespace msm {
|
|
|
|
|
// constexpr unsigned sign_mask = 0x80000000;
|
|
|
|
|
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
|
|
|
|
if (tid >= nof_buckets_to_compute) return;
|
|
|
|
|
if ((single_bucket_indices[tid] & ((1 << c) - 1)) == 0) {
|
|
|
|
|
return; // skip zero buckets
|
|
|
|
|
}
|
|
|
|
|
#ifdef SIGNED_DIG // todo - fix
|
|
|
|
|
const unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
|
|
|
|
|
const unsigned bm_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1)) >> c;
|
|
|
|
|
@@ -198,7 +233,8 @@ namespace msm {
|
|
|
|
|
msm_index * nof_buckets + bm_index * ((1 << (c - 1)) + 1) + (single_bucket_indices[tid] & ((1 << c) - 1));
|
|
|
|
|
#else
|
|
|
|
|
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
|
|
|
|
|
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
|
|
|
|
|
const unsigned single_bucket_index = (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
|
|
|
|
|
unsigned bucket_index = msm_index * nof_buckets + single_bucket_index;
|
|
|
|
|
#endif
|
|
|
|
|
const unsigned bucket_offset = bucket_offsets[tid];
|
|
|
|
|
const unsigned bucket_size = bucket_sizes[tid];
|
|
|
|
|
@@ -215,7 +251,8 @@ namespace msm {
|
|
|
|
|
#else
|
|
|
|
|
A point = points[point_ind];
|
|
|
|
|
#endif
|
|
|
|
|
bucket = i ? bucket + point : P::from_affine(point);
|
|
|
|
|
bucket =
|
|
|
|
|
i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point));
|
|
|
|
|
}
|
|
|
|
|
buckets[bucket_index] = bucket;
|
|
|
|
|
}
|
|
|
|
|
@@ -225,43 +262,40 @@ namespace msm {
|
|
|
|
|
P* __restrict__ buckets,
|
|
|
|
|
unsigned* __restrict__ bucket_offsets,
|
|
|
|
|
unsigned* __restrict__ bucket_sizes,
|
|
|
|
|
unsigned* __restrict__ single_bucket_indices,
|
|
|
|
|
const unsigned* __restrict__ point_indices,
|
|
|
|
|
unsigned* __restrict__ large_bucket_thread_indices,
|
|
|
|
|
unsigned* __restrict__ point_indices,
|
|
|
|
|
A* __restrict__ points,
|
|
|
|
|
const unsigned nof_buckets,
|
|
|
|
|
const unsigned nof_buckets_to_compute,
|
|
|
|
|
const unsigned msm_idx_shift,
|
|
|
|
|
const unsigned c,
|
|
|
|
|
const unsigned threads_per_bucket,
|
|
|
|
|
const unsigned max_run_length)
|
|
|
|
|
const int points_per_thread,
|
|
|
|
|
// log_nof_buckets_to_compute should be equal to ceil(log2(nof_buckets_to_compute))
|
|
|
|
|
const unsigned log_nof_buckets_to_compute,
|
|
|
|
|
const unsigned nof_threads)
|
|
|
|
|
{
|
|
|
|
|
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
|
|
|
|
unsigned large_bucket_index = tid / threads_per_bucket;
|
|
|
|
|
unsigned bucket_segment_index = tid % threads_per_bucket;
|
|
|
|
|
if (tid >= nof_buckets_to_compute * threads_per_bucket) { return; }
|
|
|
|
|
if ((single_bucket_indices[large_bucket_index] & ((1 << c) - 1)) == 0) { // don't need
|
|
|
|
|
return; // skip zero buckets
|
|
|
|
|
}
|
|
|
|
|
unsigned write_bucket_index = bucket_segment_index * nof_buckets_to_compute + large_bucket_index;
|
|
|
|
|
const unsigned bucket_offset = bucket_offsets[large_bucket_index] + bucket_segment_index * max_run_length;
|
|
|
|
|
const unsigned bucket_size = bucket_sizes[large_bucket_index] > bucket_segment_index * max_run_length
|
|
|
|
|
? bucket_sizes[large_bucket_index] - bucket_segment_index * max_run_length
|
|
|
|
|
: 0;
|
|
|
|
|
if (tid >= nof_threads) return;
|
|
|
|
|
int bucket_segment_index = large_bucket_thread_indices[tid] >> log_nof_buckets_to_compute;
|
|
|
|
|
large_bucket_thread_indices[tid] &= ((1 << log_nof_buckets_to_compute) - 1);
|
|
|
|
|
int bucket_ind = large_bucket_thread_indices[tid];
|
|
|
|
|
const unsigned bucket_offset = bucket_offsets[bucket_ind] + bucket_segment_index * points_per_thread;
|
|
|
|
|
const unsigned bucket_size = max(0, (int)bucket_sizes[bucket_ind] - bucket_segment_index * points_per_thread);
|
|
|
|
|
P bucket;
|
|
|
|
|
unsigned run_length = min(bucket_size, max_run_length);
|
|
|
|
|
unsigned run_length = min(bucket_size, points_per_thread);
|
|
|
|
|
for (unsigned i = 0; i < run_length;
|
|
|
|
|
i++) { // add the relevant points starting from the relevant offset up to the bucket size
|
|
|
|
|
unsigned point_ind = point_indices[bucket_offset + i];
|
|
|
|
|
A point = points[point_ind];
|
|
|
|
|
bucket = i ? bucket + point : P::from_affine(point); // init empty buckets
|
|
|
|
|
bucket =
|
|
|
|
|
i ? (point == A::zero() ? bucket : bucket + point) : (point == A::zero() ? P::zero() : P::from_affine(point));
|
|
|
|
|
}
|
|
|
|
|
buckets[write_bucket_index] = run_length ? bucket : P::zero();
|
|
|
|
|
buckets[tid] = run_length ? bucket : P::zero();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename P>
|
|
|
|
|
__global__ void distribute_large_buckets_kernel(
|
|
|
|
|
const P* large_buckets,
|
|
|
|
|
P* buckets,
|
|
|
|
|
const unsigned* sorted_bucket_sizes_sum,
|
|
|
|
|
const unsigned* single_bucket_indices,
|
|
|
|
|
const unsigned size,
|
|
|
|
|
const unsigned nof_buckets,
|
|
|
|
|
@@ -272,7 +306,8 @@ namespace msm {
|
|
|
|
|
|
|
|
|
|
unsigned msm_index = single_bucket_indices[tid] >> msm_idx_shift;
|
|
|
|
|
unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid] & ((1 << msm_idx_shift) - 1));
|
|
|
|
|
buckets[bucket_index] = large_buckets[tid];
|
|
|
|
|
unsigned large_bucket_index = sorted_bucket_sizes_sum[tid] + tid;
|
|
|
|
|
buckets[bucket_index] = large_buckets[large_bucket_index];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// this kernel sums the entire bucket module
|
|
|
|
|
@@ -370,7 +405,6 @@ namespace msm {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
S* d_scalars;
|
|
|
|
|
A* d_points;
|
|
|
|
|
if (!are_scalars_on_device) {
|
|
|
|
|
// copy scalars to gpu
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&d_scalars, sizeof(S) * nof_scalars, stream));
|
|
|
|
|
@@ -378,15 +412,7 @@ namespace msm {
|
|
|
|
|
} else {
|
|
|
|
|
d_scalars = scalars;
|
|
|
|
|
}
|
|
|
|
|
cudaStream_t stream_points;
|
|
|
|
|
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points));
|
|
|
|
|
if (!are_points_on_device) {
|
|
|
|
|
// copy points to gpu
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points));
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
|
|
|
|
|
} else {
|
|
|
|
|
d_points = points;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (are_scalars_montgomery_form) {
|
|
|
|
|
if (are_scalars_on_device) {
|
|
|
|
|
S* d_mont_scalars;
|
|
|
|
|
@@ -396,6 +422,99 @@ namespace msm {
|
|
|
|
|
} else
|
|
|
|
|
CHK_IF_RETURN(mont::FromMontgomery(d_scalars, nof_scalars, stream, d_scalars));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned nof_bms_per_msm = (bitsize + c - 1) / c;
|
|
|
|
|
unsigned* bucket_indices;
|
|
|
|
|
unsigned* point_indices;
|
|
|
|
|
unsigned* sorted_bucket_indices;
|
|
|
|
|
unsigned* sorted_point_indices;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sorted_point_indices, sizeof(unsigned) * nof_scalars * nof_bms_per_msm, stream));
|
|
|
|
|
|
|
|
|
|
unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm));
|
|
|
|
|
// split scalars into digits
|
|
|
|
|
unsigned NUM_THREADS = 1 << 10;
|
|
|
|
|
unsigned NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
bucket_indices, point_indices, d_scalars, nof_scalars, nof_points, single_msm_size, nof_bms_per_msm, bm_bitsize,
|
|
|
|
|
c);
|
|
|
|
|
|
|
|
|
|
// sort indices - the indices are sorted from smallest to largest in order to group together the points that
|
|
|
|
|
// belong to each bucket
|
|
|
|
|
unsigned* sort_indices_temp_storage{};
|
|
|
|
|
size_t sort_indices_temp_storage_bytes;
|
|
|
|
|
// The second to last parameter is the default value supplied explicitly to allow passing the stream
|
|
|
|
|
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
|
|
|
|
|
// more info
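// CUB's two-call convention: this first call runs with a null temp-storage pointer and only
// writes the required byte count into sort_indices_temp_storage_bytes; after the
// cudaMallocAsync below, the identical second call performs the actual sort.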
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
|
|
|
|
|
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
|
|
|
|
|
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream));
|
|
|
|
|
// The second to last parameter is the default value supplied explicitly to allow passing the stream
|
|
|
|
|
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
|
|
|
|
|
// more info
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
|
|
|
|
|
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices, sorted_bucket_indices,
|
|
|
|
|
point_indices, sorted_point_indices, nof_scalars * nof_bms_per_msm, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(point_indices, stream));
|
|
|
|
|
|
|
|
|
|
// compute number of bucket modules and number of buckets in each module
|
|
|
|
|
unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size;
|
|
|
|
|
#ifdef SIGNED_DIG
|
|
|
|
|
const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits
|
|
|
|
|
#else
|
|
|
|
|
// minus nof_bms_per_msm because the zero bucket is not included in each bucket module
|
|
|
|
|
const unsigned nof_buckets = (nof_bms_per_msm << c) - nof_bms_per_msm;
|
|
|
|
|
#endif
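// Illustrative sizes (assumed c = 16, bitsize = 255): nof_bms_per_msm = (255 + 15) / 16 = 16,
// so nof_buckets = (16 << 16) - 16 = 1048560 per MSM, i.e. 2^c - 1 non-zero buckets for each
// of the 16 bucket modules.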
|
|
|
|
|
const unsigned total_nof_buckets = nof_buckets * batch_size;
|
|
|
|
|
|
|
|
|
|
// find bucket_sizes
|
|
|
|
|
unsigned* single_bucket_indices;
|
|
|
|
|
unsigned* bucket_sizes;
|
|
|
|
|
unsigned* nof_buckets_to_compute;
|
|
|
|
|
// +1 here and in other places because there is still a zero index corresponding to the zero bucket at this point
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * (total_nof_buckets + 1), stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * (total_nof_buckets + 1), stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream));
|
|
|
|
|
unsigned* encode_temp_storage{};
|
|
|
|
|
size_t encode_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
|
|
|
|
|
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
|
|
|
|
|
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
|
|
|
|
|
encode_temp_storage, encode_temp_storage_bytes, sorted_bucket_indices, single_bucket_indices, bucket_sizes,
|
|
|
|
|
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_indices, stream));
|
|
|
|
|
|
|
|
|
|
// get offsets - where does each new bucket begin
|
|
|
|
|
unsigned* bucket_offsets;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * (total_nof_buckets + 1), stream));
|
|
|
|
|
unsigned* offsets_temp_storage{};
|
|
|
|
|
size_t offsets_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
|
|
|
|
|
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
|
|
|
|
|
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets + 1, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream));
|
|
|
|
|
|
|
|
|
|
A* d_points;
|
|
|
|
|
cudaStream_t stream_points;
|
|
|
|
|
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaStreamCreate(&stream_points));
|
|
|
|
|
if (!are_points_on_device) {
|
|
|
|
|
// copy points to gpu
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&d_points, sizeof(A) * nof_points, stream_points));
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(d_points, points, sizeof(A) * nof_points, cudaMemcpyHostToDevice, stream_points));
|
|
|
|
|
} else {
|
|
|
|
|
d_points = points;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (are_points_montgomery_form) {
|
|
|
|
|
if (are_points_on_device) {
|
|
|
|
|
A* d_mont_points;
|
|
|
|
|
@@ -412,111 +531,26 @@ namespace msm {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
P* buckets;
|
|
|
|
|
// compute number of bucket modules and number of buckets in each module
|
|
|
|
|
unsigned nof_bms_per_msm = (bitsize + c - 1) / c;
|
|
|
|
|
unsigned bm_bitsize = (unsigned)ceil(log2(nof_bms_per_msm));
|
|
|
|
|
unsigned nof_bms_in_batch = nof_bms_per_msm * batch_size;
|
|
|
|
|
#ifdef SIGNED_DIG
|
|
|
|
|
const unsigned nof_buckets = nof_bms_per_msm * ((1 << (c - 1)) + 1); // signed digits
|
|
|
|
|
#else
|
|
|
|
|
const unsigned nof_buckets = nof_bms_per_msm << c;
|
|
|
|
|
#endif
|
|
|
|
|
const unsigned total_nof_buckets = nof_buckets * batch_size;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * total_nof_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&buckets, sizeof(P) * (total_nof_buckets + nof_bms_in_batch), stream));
|
|
|
|
|
|
|
|
|
|
// launch the bucket initialization kernel with maximum threads
|
|
|
|
|
unsigned NUM_THREADS = 1 << 10;
|
|
|
|
|
unsigned NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets);
|
|
|
|
|
|
|
|
|
|
unsigned* bucket_indices;
|
|
|
|
|
unsigned* point_indices;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&point_indices, sizeof(unsigned) * nof_scalars * (nof_bms_per_msm + 1), stream));
|
|
|
|
|
|
|
|
|
|
// split scalars into digits
|
|
|
|
|
NUM_THREADS = 1 << 10;
|
|
|
|
|
NUM_BLOCKS = (nof_scalars + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
split_scalars_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
bucket_indices + nof_scalars, point_indices + nof_scalars, d_scalars, nof_scalars, nof_points, single_msm_size,
|
|
|
|
|
nof_bms_per_msm, bm_bitsize, c); //+nof_scalars - leaving the first bm free for the out of place sort later
|
|
|
|
|
NUM_BLOCKS = (total_nof_buckets + nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
initialize_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, total_nof_buckets + nof_bms_in_batch);
|
|
|
|
|
|
|
|
|
|
// sort indices - the indices are sorted from smallest to largest in order to group together the points that
|
|
|
|
|
// belong to each bucket
|
|
|
|
|
unsigned* sort_indices_temp_storage{};
|
|
|
|
|
size_t sort_indices_temp_storage_bytes;
|
|
|
|
|
// The second to last parameter is the default value supplied explicitly to allow passing the stream
|
|
|
|
|
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
|
|
|
|
|
// more info
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
|
|
|
|
|
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + nof_scalars, bucket_indices,
|
|
|
|
|
point_indices + nof_scalars, point_indices, nof_scalars, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sort_indices_temp_storage, sort_indices_temp_storage_bytes, stream));
|
|
|
|
|
for (unsigned i = 0; i < nof_bms_per_msm; i++) {
|
|
|
|
|
unsigned offset_out = i * nof_scalars;
|
|
|
|
|
unsigned offset_in = offset_out + nof_scalars;
|
|
|
|
|
// The second to last parameter is the default value supplied explicitly to allow passing the stream
|
|
|
|
|
// See https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html#a65e82152de448c6373ed9563aaf8af7e for
|
|
|
|
|
// more info
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairs(
|
|
|
|
|
sort_indices_temp_storage, sort_indices_temp_storage_bytes, bucket_indices + offset_in,
|
|
|
|
|
bucket_indices + offset_out, point_indices + offset_in, point_indices + offset_out, nof_scalars, 0,
|
|
|
|
|
sizeof(unsigned) * 8, stream));
|
|
|
|
|
}
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sort_indices_temp_storage, stream));
|
|
|
|
|
|
|
|
|
|
// find bucket_sizes
|
|
|
|
|
unsigned* single_bucket_indices;
|
|
|
|
|
unsigned* bucket_sizes;
|
|
|
|
|
unsigned* nof_buckets_to_compute;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&single_bucket_indices, sizeof(unsigned) * total_nof_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_sizes, sizeof(unsigned) * total_nof_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&nof_buckets_to_compute, sizeof(unsigned), stream));
|
|
|
|
|
unsigned* encode_temp_storage{};
|
|
|
|
|
size_t encode_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
|
|
|
|
|
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
|
|
|
|
|
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&encode_temp_storage, encode_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRunLengthEncode::Encode(
|
|
|
|
|
encode_temp_storage, encode_temp_storage_bytes, bucket_indices, single_bucket_indices, bucket_sizes,
|
|
|
|
|
nof_buckets_to_compute, nof_bms_per_msm * nof_scalars, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(encode_temp_storage, stream));
|
|
|
|
|
|
|
|
|
|
// get offsets - where does each new bucket begin
|
|
|
|
|
unsigned* bucket_offsets;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&bucket_offsets, sizeof(unsigned) * total_nof_buckets, stream));
|
|
|
|
|
unsigned* offsets_temp_storage{};
|
|
|
|
|
size_t offsets_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
|
|
|
|
|
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&offsets_temp_storage, offsets_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::ExclusiveSum(
|
|
|
|
|
offsets_temp_storage, offsets_temp_storage_bytes, bucket_sizes, bucket_offsets, total_nof_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(offsets_temp_storage, stream));
|
|
|
|
|
// removing zero bucket, if it exists
|
|
|
|
|
unsigned smallest_bucket_index;
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(
|
|
|
|
|
&smallest_bucket_index, single_bucket_indices, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
|
|
|
|
|
// maybe the zero bucket is empty after all? in that case zero_bucket_offset is set to 0
|
|
|
|
|
unsigned zero_bucket_offset = (smallest_bucket_index == 0) ? 1 : 0;
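// single_bucket_indices comes out of the run-length encode in ascending order, so if a zero
// bucket exists it is always the first entry; zero_bucket_offset = 1 then shifts all
// subsequent reads of bucket_sizes / bucket_offsets / single_bucket_indices past it.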
|
|
|
|
|
|
|
|
|
|
// sort by bucket sizes
|
|
|
|
|
unsigned h_nof_buckets_to_compute;
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(
|
|
|
|
|
&h_nof_buckets_to_compute, nof_buckets_to_compute, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
|
|
|
|
|
|
|
|
|
|
// if there are no buckets to compute, just return the zero point
|
|
|
|
|
if (h_nof_buckets_to_compute == 0) {
|
|
|
|
|
if (!are_results_on_device) {
|
|
|
|
|
for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) {
|
|
|
|
|
final_result[batch_element] = P::zero();
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
P* h_final_result = (P*)malloc(sizeof(P) * batch_size);
|
|
|
|
|
for (unsigned batch_element = 0; batch_element < batch_size; ++batch_element) {
|
|
|
|
|
h_final_result[batch_element] = P::zero();
|
|
|
|
|
}
|
|
|
|
|
CHK_IF_RETURN(
|
|
|
|
|
cudaMemcpyAsync(final_result, h_final_result, sizeof(P) * batch_size, cudaMemcpyHostToDevice, stream));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return CHK_LAST();
|
|
|
|
|
}
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream));
|
|
|
|
|
h_nof_buckets_to_compute -= zero_bucket_offset;
|
|
|
|
|
|
|
|
|
|
unsigned* sorted_bucket_sizes;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sorted_bucket_sizes, sizeof(unsigned) * h_nof_buckets_to_compute, stream));
|
|
|
|
|
@@ -525,13 +559,16 @@ namespace msm {
|
|
|
|
|
unsigned* sort_offsets_temp_storage{};
|
|
|
|
|
size_t sort_offsets_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
|
|
|
|
|
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
|
|
|
|
|
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
|
|
|
|
|
sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0,
|
|
|
|
|
sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
|
|
|
|
|
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes, bucket_offsets,
|
|
|
|
|
sorted_bucket_offsets, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
sort_offsets_temp_storage, sort_offsets_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
|
|
|
|
|
sorted_bucket_sizes, bucket_offsets + zero_bucket_offset, sorted_bucket_offsets, h_nof_buckets_to_compute, 0,
|
|
|
|
|
sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sort_offsets_temp_storage, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream));
|
|
|
|
|
|
|
|
|
|
unsigned* sorted_single_bucket_indices;
|
|
|
|
|
CHK_IF_RETURN(
|
|
|
|
|
@@ -539,92 +576,128 @@ namespace msm {
|
|
|
|
|
unsigned* sort_single_temp_storage{};
|
|
|
|
|
size_t sort_single_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
|
|
|
|
|
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes,
|
|
|
|
|
single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8,
|
|
|
|
|
stream));
|
|
|
|
|
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
|
|
|
|
|
sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices,
|
|
|
|
|
h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&sort_single_temp_storage, sort_single_temp_storage_bytes, stream));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceRadixSort::SortPairsDescending(
|
|
|
|
|
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes, sorted_bucket_sizes,
|
|
|
|
|
single_bucket_indices, sorted_single_bucket_indices, h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8,
|
|
|
|
|
stream));
|
|
|
|
|
sort_single_temp_storage, sort_single_temp_storage_bytes, bucket_sizes + zero_bucket_offset,
|
|
|
|
|
sorted_bucket_sizes, single_bucket_indices + zero_bucket_offset, sorted_single_bucket_indices,
|
|
|
|
|
h_nof_buckets_to_compute, 0, sizeof(unsigned) * 8, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sort_single_temp_storage, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream));
|
|
|
|
|
|
|
|
|
|
// find large buckets
|
|
|
|
|
unsigned avarage_size = single_msm_size / (1 << c);
|
|
|
|
|
unsigned bucket_th = large_bucket_factor * avarage_size;
|
|
|
|
|
unsigned average_bucket_size = single_msm_size / (1 << c);
|
|
|
|
|
// how large a bucket must be to qualify as a "large bucket"
|
|
|
|
|
unsigned bucket_th = large_bucket_factor * average_bucket_size;
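// Illustrative threshold (assumed values): for single_msm_size = 1 << 20 and c = 16,
// average_bucket_size = 2^20 / 2^16 = 16; with large_bucket_factor = 10 this gives
// bucket_th = 160, so only buckets holding more than 160 points take the large-bucket
// path below.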
|
|
|
|
|
unsigned* nof_large_buckets;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&nof_large_buckets, sizeof(unsigned), stream));
|
|
|
|
|
CHK_IF_RETURN(cudaMemset(nof_large_buckets, 0, sizeof(unsigned)));
|
|
|
|
|
|
|
|
|
|
unsigned TOTAL_THREADS = 129000; // todo - device dependent
|
|
|
|
|
unsigned TOTAL_THREADS = 129000; // TODO: device dependent
|
|
|
|
|
unsigned cutoff_run_length = max(2, h_nof_buckets_to_compute / TOTAL_THREADS);
|
|
|
|
|
unsigned cutoff_nof_runs = (h_nof_buckets_to_compute + cutoff_run_length - 1) / cutoff_run_length;
|
|
|
|
|
NUM_THREADS = min(1 << 5, cutoff_nof_runs);
|
|
|
|
|
NUM_THREADS = 1 << 5;
|
|
|
|
|
NUM_BLOCKS = (cutoff_nof_runs + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
find_cutoff_kernel<S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets);
|
|
|
|
|
|
|
|
|
|
if (h_nof_buckets_to_compute > 0 && bucket_th > 0)
|
|
|
|
|
find_cutoff_kernel<S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
sorted_bucket_sizes, h_nof_buckets_to_compute, bucket_th, cutoff_run_length, nof_large_buckets);
|
|
|
|
|
unsigned h_nof_large_buckets;
|
|
|
|
|
CHK_IF_RETURN(
|
|
|
|
|
cudaMemcpyAsync(&h_nof_large_buckets, nof_large_buckets, sizeof(unsigned), cudaMemcpyDeviceToHost, stream));
|
|
|
|
|
|
|
|
|
|
unsigned* max_res;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&max_res, sizeof(unsigned) * 2, stream));
|
|
|
|
|
find_max_size<S><<<1, 1, 0, stream>>>(sorted_bucket_sizes, sorted_single_bucket_indices, c, max_res);
|
|
|
|
|
|
|
|
|
|
unsigned h_max_res[2];
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(h_max_res, max_res, sizeof(unsigned) * 2, cudaMemcpyDeviceToHost, stream));
|
|
|
|
|
unsigned h_largest_bucket_size = h_max_res[0];
|
|
|
|
|
unsigned h_nof_zero_large_buckets = h_max_res[1];
|
|
|
|
|
unsigned large_buckets_to_compute =
|
|
|
|
|
h_nof_large_buckets > h_nof_zero_large_buckets ? h_nof_large_buckets - h_nof_zero_large_buckets : 0;
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream));
|
|
|
|
|
|
|
|
|
|
if (!are_points_on_device || are_points_montgomery_form) {
|
|
|
|
|
// by this point, the points must already be uploaded and converted out of Montgomery form
|
|
|
|
|
CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_points_uploaded));
|
|
|
|
|
CHK_IF_RETURN(cudaEventDestroy(event_points_uploaded));
|
|
|
|
|
CHK_IF_RETURN(cudaStreamDestroy(stream_points));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cudaStream_t stream_large_buckets;
|
|
|
|
|
cudaEvent_t event_large_buckets_accumulated;
|
|
|
|
|
P* large_buckets;
|
|
|
|
|
if (large_buckets_to_compute > 0 && bucket_th > 0) {
|
|
|
|
|
// this is where handling of large buckets happens (if there are any)
|
|
|
|
|
if (h_nof_large_buckets > 0 && bucket_th > 0) {
|
|
|
|
|
CHK_IF_RETURN(cudaStreamCreate(&stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_large_buckets_accumulated, cudaEventDisableTiming));
|
|
|
|
|
|
|
|
|
|
unsigned threads_per_bucket =
|
|
|
|
|
1 << (unsigned)ceil(log2((h_largest_bucket_size + bucket_th - 1) / bucket_th)); // global param
|
|
|
|
|
unsigned max_bucket_size_run_length = (h_largest_bucket_size + threads_per_bucket - 1) / threads_per_bucket;
|
|
|
|
|
unsigned total_large_buckets_size = large_buckets_to_compute * threads_per_bucket;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * total_large_buckets_size, stream));
|
|
|
|
|
unsigned* sorted_bucket_sizes_sum;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(
|
|
|
|
|
&sorted_bucket_sizes_sum, sizeof(unsigned) * (h_nof_large_buckets + 1), stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cudaMemsetAsync(sorted_bucket_sizes_sum, 0, sizeof(unsigned), stream_large_buckets));
|
|
|
|
|
unsigned* large_bucket_temp_storage{};
|
|
|
|
|
size_t large_bucket_temp_storage_bytes = 0;
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::InclusiveSum(
|
|
|
|
|
large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1,
|
|
|
|
|
h_nof_large_buckets, stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(
|
|
|
|
|
cudaMallocAsync(&large_bucket_temp_storage, large_bucket_temp_storage_bytes, stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cub::DeviceScan::InclusiveSum(
|
|
|
|
|
large_bucket_temp_storage, large_bucket_temp_storage_bytes, sorted_bucket_sizes, sorted_bucket_sizes_sum + 1,
|
|
|
|
|
h_nof_large_buckets, stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(large_bucket_temp_storage, stream_large_buckets));
|
|
|
|
|
unsigned h_nof_pts_in_large_buckets;
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(
|
|
|
|
|
&h_nof_pts_in_large_buckets, sorted_bucket_sizes_sum + h_nof_large_buckets, sizeof(unsigned),
|
|
|
|
|
cudaMemcpyDeviceToHost, stream_large_buckets));
|
|
|
|
|
unsigned h_largest_bucket;
|
|
|
|
|
CHK_IF_RETURN(cudaMemcpyAsync(
|
|
|
|
|
&h_largest_bucket, sorted_bucket_sizes, sizeof(unsigned), cudaMemcpyDeviceToHost, stream_large_buckets));
|
|
|
|
|
|
|
|
|
|
NUM_THREADS = min(1 << 8, total_large_buckets_size);
|
|
|
|
|
NUM_BLOCKS = (total_large_buckets_size + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
// the number of threads for large buckets has an extra h_nof_large_buckets term to account for bucket sizes
|
|
|
|
|
// that are not evenly divisible by average_bucket_size. There are similar corrections elsewhere when accessing large
|
|
|
|
|
// buckets
|
|
|
|
|
unsigned large_buckets_nof_threads =
|
|
|
|
|
(h_nof_pts_in_large_buckets + average_bucket_size - 1) / average_bucket_size + h_nof_large_buckets;
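// Worked example (assumed values): with h_nof_pts_in_large_buckets = 1000,
// average_bucket_size = 16 and h_nof_large_buckets = 3, this yields
// (1000 + 15) / 16 + 3 = 63 + 3 = 66 threads, the +3 covering the rounding-up of each
// bucket's point count to a whole number of per-thread segments.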
|
|
|
|
|
unsigned log_nof_large_buckets = (unsigned)ceil(log2(h_nof_large_buckets));
|
|
|
|
|
unsigned* large_bucket_indices;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&large_bucket_indices, sizeof(unsigned) * large_buckets_nof_threads, stream));
|
|
|
|
|
NUM_THREADS = min(1 << 8, h_nof_large_buckets);
|
|
|
|
|
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
initialize_large_bucket_indices<P><<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets, log_nof_large_buckets,
|
|
|
|
|
large_bucket_indices);
|
|
|
|
|
|
|
|
|
|
P* large_buckets;
|
|
|
|
|
CHK_IF_RETURN(cudaMallocAsync(&large_buckets, sizeof(P) * large_buckets_nof_threads, stream_large_buckets));
|
|
|
|
|
|
|
|
|
|
NUM_THREADS = min(1 << 8, large_buckets_nof_threads);
|
|
|
|
|
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
accumulate_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
large_buckets, sorted_bucket_offsets + h_nof_zero_large_buckets,
|
|
|
|
|
sorted_bucket_sizes + h_nof_zero_large_buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets,
|
|
|
|
|
point_indices, d_points, nof_buckets, large_buckets_to_compute, c + bm_bitsize, c, threads_per_bucket,
|
|
|
|
|
max_bucket_size_run_length);
|
|
|
|
|
large_buckets, sorted_bucket_offsets, sorted_bucket_sizes, large_bucket_indices, sorted_point_indices,
|
|
|
|
|
d_points, h_nof_large_buckets, c, average_bucket_size, log_nof_large_buckets, large_buckets_nof_threads);
|
|
|
|
|
|
|
|
|
|
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
|
|
|
|
|
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
// normalization is needed to update bucket sizes and offsets after the reduction that has already taken place
|
|
|
|
|
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
sorted_bucket_sizes_sum, average_bucket_size, h_nof_large_buckets);
|
|
|
|
|
// reduce
|
|
|
|
|
for (int s = total_large_buckets_size >> 1; s > large_buckets_to_compute - 1; s >>= 1) {
|
|
|
|
|
NUM_THREADS = min(MAX_TH, s);
|
|
|
|
|
NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
large_buckets, large_buckets, s * 2, 0, 0, 0, s);
|
|
|
|
|
for (int s = h_largest_bucket; s > 1; s = ((s + 1) >> 1)) {
|
|
|
|
|
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
|
|
|
|
|
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
normalize_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
sorted_bucket_sizes, s == h_largest_bucket ? average_bucket_size : 2, h_nof_large_buckets);
|
|
|
|
|
NUM_THREADS = min(MAX_TH, large_buckets_nof_threads);
|
|
|
|
|
NUM_BLOCKS = (large_buckets_nof_threads + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
sum_reduction_variable_size_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
large_buckets, sorted_bucket_sizes_sum, sorted_bucket_sizes, large_bucket_indices,
|
|
|
|
|
large_buckets_nof_threads);
|
|
|
|
|
}
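// Illustrative schedule (assumed h_largest_bucket = 20): the loop visits s = 20, 10, 5, 3, 2,
// since s = (s + 1) >> 1 halves the remaining partial sums per bucket with rounding up;
// normalize_kernel keeps sorted_bucket_sizes consistent by dividing by average_bucket_size
// on the first pass and by 2 on every later pass.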
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(large_bucket_indices, stream_large_buckets));
|
|
|
|
|
|
|
|
|
|
// distribute
|
|
|
|
|
NUM_THREADS = min(MAX_TH, large_buckets_to_compute);
|
|
|
|
|
NUM_BLOCKS = (large_buckets_to_compute + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
NUM_THREADS = min(MAX_TH, h_nof_large_buckets);
|
|
|
|
|
NUM_BLOCKS = (h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
distribute_large_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_large_buckets>>>(
|
|
|
|
|
large_buckets, buckets, sorted_single_bucket_indices + h_nof_zero_large_buckets, large_buckets_to_compute,
|
|
|
|
|
nof_buckets, c + bm_bitsize);
|
|
|
|
|
large_buckets, buckets, sorted_bucket_sizes_sum, sorted_single_bucket_indices, h_nof_large_buckets,
|
|
|
|
|
nof_buckets + nof_bms_per_msm, c + bm_bitsize);
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes_sum, stream_large_buckets));
|
|
|
|
|
|
|
|
|
|
CHK_IF_RETURN(cudaEventRecord(event_large_buckets_accumulated, stream_large_buckets));
|
|
|
|
|
CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets));
|
|
|
|
|
} else {
|
|
|
|
|
h_nof_large_buckets = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// launch the accumulation kernel with maximum threads
|
|
|
|
|
@@ -633,13 +706,18 @@ namespace msm {
|
|
|
|
|
NUM_BLOCKS = (h_nof_buckets_to_compute - h_nof_large_buckets + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
accumulate_buckets_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
buckets, sorted_bucket_offsets + h_nof_large_buckets, sorted_bucket_sizes + h_nof_large_buckets,
|
|
|
|
|
sorted_single_bucket_indices + h_nof_large_buckets, point_indices, d_points, nof_buckets,
|
|
|
|
|
h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
|
|
|
|
|
sorted_single_bucket_indices + h_nof_large_buckets, sorted_point_indices, d_points,
|
|
|
|
|
nof_buckets + nof_bms_per_msm, h_nof_buckets_to_compute - h_nof_large_buckets, c + bm_bitsize, c);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (large_buckets_to_compute > 0 && bucket_th > 0)
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_point_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream));
|
|
|
|
|
if (h_nof_large_buckets > 0 && bucket_th > 0) {
|
|
|
|
|
// all the large buckets need to be accumulated before the final summation
|
|
|
|
|
CHK_IF_RETURN(cudaStreamWaitEvent(stream, event_large_buckets_accumulated));
|
|
|
|
|
CHK_IF_RETURN(cudaStreamDestroy(stream_large_buckets));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifdef SSM_SUM
|
|
|
|
|
// sum each bucket
|
|
|
|
|
@@ -676,7 +754,7 @@ namespace msm {
|
|
|
|
|
unsigned source_bits_count = c;
|
|
|
|
|
// bool odd_source_c = source_bits_count % 2;
|
|
|
|
|
unsigned source_windows_count = nof_bms_per_msm;
|
|
|
|
|
unsigned source_buckets_count = nof_buckets;
|
|
|
|
|
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
|
|
|
|
|
unsigned target_windows_count = 0;
|
|
|
|
|
P* source_buckets = buckets;
|
|
|
|
|
buckets = nullptr;
|
|
|
|
|
@@ -698,20 +776,19 @@ namespace msm {
|
|
|
|
|
for (unsigned j = 0; j < target_bits_count; j++) {
|
|
|
|
|
const bool is_first_iter = (j == 0);
|
|
|
|
|
const bool is_last_iter = (j == target_bits_count - 1);
|
|
|
|
|
unsigned nof_threads = (source_buckets_count >> (1 + j)) * batch_size;
|
|
|
|
|
unsigned nof_threads =
|
|
|
|
|
(((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
|
|
|
|
|
NUM_THREADS = min(MAX_TH, nof_threads);
|
|
|
|
|
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
|
|
|
|
|
1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
|
|
|
|
|
0 /*=padding*/, nof_threads);
|
|
|
|
|
(1 << target_bits_count) - 1, nof_threads);
|
|
|
|
|
|
|
|
|
|
NUM_THREADS = min(MAX_TH, nof_threads);
|
|
|
|
|
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
|
|
|
|
|
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
|
|
|
|
|
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
|
|
|
|
|
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
|
|
|
|
|
0 /*=padding*/, nof_threads);
|
|
|
|
|
(1 << target_bits_count) - 1, nof_threads);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (target_bits_count == 1) {
|
|
|
|
|
@@ -761,24 +838,10 @@ namespace msm {
|
|
|
|
|
cudaMemcpyAsync(final_result, d_final_result, sizeof(P) * batch_size, cudaMemcpyDeviceToHost, stream));
|
|
|
|
|
|
|
|
|
|
// free memory
|
|
|
|
|
if (!are_scalars_on_device) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream));
|
|
|
|
|
if (!are_points_on_device) CHK_IF_RETURN(cudaFreeAsync(d_points, stream));
|
|
|
|
|
if (!are_scalars_on_device || are_scalars_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_scalars, stream));
|
|
|
|
|
if (!are_points_on_device || are_points_montgomery_form) CHK_IF_RETURN(cudaFreeAsync(d_points, stream));
|
|
|
|
|
if (!are_results_on_device) CHK_IF_RETURN(cudaFreeAsync(d_final_result, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(buckets, stream));
|
|
|
|
|
#ifndef PHASE1_TEST
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(point_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_sizes, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(nof_buckets_to_compute, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(bucket_offsets, stream));
|
|
|
|
|
#endif
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_sizes, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_bucket_offsets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(sorted_single_bucket_indices, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(nof_large_buckets, stream));
|
|
|
|
|
CHK_IF_RETURN(cudaFreeAsync(max_res, stream));
|
|
|
|
|
if (large_buckets_to_compute > 0 && bucket_th > 0) CHK_IF_RETURN(cudaFreeAsync(large_buckets, stream));
|
|
|
|
|
|
|
|
|
|
if (!is_async) CHK_IF_RETURN(cudaStreamSynchronize(stream));