MSM - supporting all window sizes (#534)

This PR enables using MSM with any value of c.

Note: the default c isn't necessarily optimal; the user is expected to
choose the c and precomputation factor that give the best results for
the relevant case.
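For illustration, a minimal sketch (mirroring the updated tests/msm_test.cu in this PR; the specific values of `c` and `precompute_factor` are placeholders, not recommendations) of setting both knobs and running a precomputed MSM:

```cpp
// Sketch: pick a window size and precomputation factor explicitly, precompute
// the shifted points, then run MSM with the same config. `config`, the device
// buffers and the test_* type aliases are set up as in tests/msm_test.cu.
config.c = 15;                 // any window size is supported now
config.precompute_factor = 4;  // fewer windows on-line, more static memory
config.points_size = msm_size;

test_affine* precomp_points_d; // device buffer for msm_size * precompute_factor points
cudaMalloc(&precomp_points_d, sizeof(test_affine) * msm_size * config.precompute_factor);
msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);
msm::msm<test_scalar, test_affine, test_projective>(scalars, precomp_points_d, msm_size, config, res_d);
```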

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
HadarIngonyama
2024-06-17 15:57:24 +03:00
committed by GitHub
parent af9ec76506
commit 8936d9c800
46 changed files with 688 additions and 282 deletions

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(
extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
bls12_377::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::affine_t* output_bases);
extern "C" cudaError_t bls12_377_msm_cuda(

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(
extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
bls12_381::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::affine_t* output_bases);
extern "C" cudaError_t bls12_381_msm_cuda(

View File

@@ -19,11 +19,8 @@
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
bn254::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::g2_affine_t* output_bases);
extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -31,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(
extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
bn254::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::affine_t* output_bases);
extern "C" cudaError_t bn254_msm_cuda(

View File

@@ -18,11 +18,8 @@
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::g2_affine_t* output_bases);
extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(
extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
bw6_761::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::affine_t* output_bases);
extern "C" cudaError_t bw6_761_msm_cuda(

View File

@@ -17,11 +17,8 @@
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
grumpkin::affine_t* output_bases);
extern "C" cudaError_t grumpkin_msm_cuda(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
${CURVE}::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_msm_cuda(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
${CURVE}::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::g2_affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_g2_msm_cuda(

View File

@@ -43,7 +43,7 @@ namespace msm {
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
* [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
* there needs to be equal to the one used here. Larger values decrease the
* number of computations to make, on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
@@ -52,7 +52,7 @@ namespace msm {
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
* [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical.
* [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
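As a back-of-the-envelope guide to the tradeoff described above (standard bucket-method accounting, not a measured claim): for b-bit scalars and window size c,

```math
\#\text{windows} = \left\lceil \tfrac{b}{c} \right\rceil, \qquad
\#\text{buckets} \approx \left\lceil \tfrac{b}{c} \right\rceil \cdot 2^{c},
```

so a larger c means fewer windows to accumulate but exponentially more bucket memory, which is why the best c depends on the MSM size.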
@@ -127,6 +127,26 @@ namespace msm {
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
/**
* A function that precomputes MSM points by extending them with their shifted copies.
* e.g.:
* Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
* Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
* 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
* @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
* the size of 1 MSM.
* @param config [MSMConfig](@ref MSMConfig) used in this MSM.
* @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended
* points.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points);
/**
* A function that precomputes MSM bases by extending them with their shifted copies.
* e.g.:
@@ -148,7 +168,7 @@ namespace msm {
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_bases(
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,

View File

@@ -1,4 +1,8 @@
build_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
test_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu
work/test_msm
nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
work/test_msm

View File

@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)(
affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points)
{
return precompute_msm_points<affine_t, projective_t>(points, msm_size, config, output_points);
}
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):

View File

@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_points_cuda)(
g2_affine_t* points, int msm_size, MSMConfig& config, g2_affine_t* output_points)
{
return precompute_msm_points<g2_affine_t, g2_projective_t>(points, msm_size, config, output_points);
}
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -26,7 +37,6 @@ namespace msm {
return precompute_msm_bases<g2_affine_t, g2_projective_t>(
bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
}
/**
* Extern "C" version of [msm](@ref msm) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):

View File

@@ -22,7 +22,6 @@ namespace msm {
#define MAX_TH 256
// #define SIGNED_DIG //WIP
// #define SSM_SUM //WIP
template <typename A, typename P>
@@ -74,10 +73,10 @@ namespace msm {
unsigned* bucket_sizes_sum,
unsigned* bucket_sizes,
unsigned* large_bucket_thread_indices,
unsigned num_of_threads)
unsigned nof_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
if (tid >= nof_threads) { return; }
unsigned large_bucket_tid = large_bucket_thread_indices[tid];
unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
@@ -91,12 +90,13 @@ namespace msm {
P* v_r,
unsigned block_size,
unsigned write_stride,
unsigned buckets_per_bm,
unsigned write_phase,
unsigned step,
unsigned num_of_threads)
unsigned nof_threads)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= num_of_threads) { return; }
if (tid >= nof_threads) return;
// we need a shifted tid because we don't want to reduce into zero buckets; this allows skipping them.
// for write_phase==1, the read pattern is different so we don't skip over anything.
@@ -110,7 +110,7 @@ namespace msm {
const unsigned read_ind = block_size * shifted_block_id + block_tid;
const unsigned write_ind = jump * shifted_block_id + block_tid;
const unsigned v_r_key =
write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
write_stride ? ((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride + write_ind % buckets_per_bm
: write_ind;
v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
}
@@ -325,32 +325,50 @@ namespace msm {
}
template <typename P>
__global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums)
__global__ void last_pass_kernel(
const P* final_buckets,
P* final_sums,
unsigned nof_sums_per_batch,
unsigned batch_size,
unsigned nof_bms_per_batch,
unsigned orig_c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= num_sums) return;
final_sums[tid] = final_buckets[2 * tid + 1];
if (tid >= nof_sums_per_batch * batch_size) return;
unsigned batch_index = tid / nof_sums_per_batch;
unsigned batch_tid = tid % nof_sums_per_batch;
unsigned bm_index = batch_tid / orig_c;
unsigned bm_tid = batch_tid % orig_c;
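// walk down the window-splitting tree built by the reduction passes: each level
// splits a c-bit window into a low ceil(c/2)-bit part (even child) and the
// remaining high bits (odd child), recovering which 1-bit leaf window holds
// bit bm_tid of the original window bm_index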
for (unsigned c = orig_c; c > 1;) {
c = (c + 1) >> 1;
bm_index <<= 1;
if (bm_tid >= c) {
bm_index++;
bm_tid -= c;
}
}
final_sums[tid] = final_buckets[2 * (batch_index * nof_bms_per_batch + bm_index) + 1];
}
// this kernel computes the final result using the double and add algorithm
// it is done by a single thread
template <typename P, typename S>
__global__ void final_accumulation_kernel(
const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned nof_empty_bms, unsigned c)
const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
{
unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nof_msms) return;
P final_result = P::zero();
// Note: in some cases bm accumulation is implemented such that some bms are known to be empty, so they are
// skipped.
for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
for (unsigned j = 0; j < c; j++) // double
for (unsigned i = nof_results; i > 1; i--) {
final_result = final_result + final_sums[i - 1 + tid * nof_results]; // add
for (unsigned j = 0; j < c; j++) // double
{
final_result = final_result + final_result;
}
}
final_results[tid] = final_result + final_sums[tid * nof_bms];
final_results[tid] = final_result + final_sums[tid * nof_results];
}
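Reading the loop directly: with per-window sums S_0, ..., S_{n-1} (least significant first) and window size c, the kernel evaluates

```math
\text{result} = \sum_{w=0}^{n-1} 2^{cw} S_w
= 2^{c}\Bigl(\cdots 2^{c}\bigl(2^{c} S_{n-1} + S_{n-2}\bigr)\cdots + S_1\Bigr) + S_0
```

in Horner form, costing c(n-1) doublings and n additions per MSM result.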
// this function computes msm using the bucket method
@@ -717,7 +735,8 @@ namespace msm {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream));
// --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window ---
unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumulation this may be >0
unsigned nof_final_results_per_msm =
nof_bms_per_msm; // for big-triangle accumulation this is the number of bucket modules
P* final_results;
if (is_big_triangle || c == 1) {
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
@@ -726,62 +745,90 @@ namespace msm {
NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms_in_batch, c);
} else {
// the recursive reduction algorithm works with 2 types of reduction that can run on parallel streams
cudaStream_t stream_reduction;
cudaEvent_t event_finished_reduction;
CHK_IF_RETURN(cudaStreamCreate(&stream_reduction));
CHK_IF_RETURN(cudaEventCreateWithFlags(&event_finished_reduction, cudaEventDisableTiming));
unsigned source_bits_count = c;
unsigned source_windows_count = nof_bms_per_msm;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
unsigned target_windows_count = 0;
unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; // nof buckets per msm including zero buckets
unsigned target_windows_count;
P* source_buckets = buckets;
buckets = nullptr;
P* target_buckets;
P* temp_buckets1;
P* temp_buckets2;
for (unsigned i = 0;; i++) {
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
target_windows_count = source_windows_count << 1; // nof bms*2 = 32
const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
target_windows_count = source_windows_count << 1; // twice the number of bms
const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream));
CHK_IF_RETURN(cudaMallocAsync(
&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets
&temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size,
stream)); // for type1 reduction (strided, bottom window - evens)
CHK_IF_RETURN(cudaMallocAsync(
&temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
CHK_IF_RETURN(cudaMallocAsync(
&temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
&temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size,
stream)); // for type2 reduction (serial, top window - odds)
initialize_buckets_kernel<<<(target_buckets_count * batch_size + 255) / 256, 256>>>(
target_buckets, target_buckets_count * batch_size); // initialization is needed for the odd c case
if (source_bits_count > 0) {
for (unsigned j = 0; j < target_bits_count; j++) {
const bool is_first_iter = (j == 0);
const bool is_last_iter = (j == target_bits_count - 1);
unsigned nof_threads =
(((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
for (unsigned j = 0; j < target_bits_count; j++) {
const bool is_first_iter = (j == 0);
const bool is_second_iter = (j == 1);
const bool is_last_iter = (j == target_bits_count - 1);
const bool is_odd_c = source_bits_count & 1;
unsigned nof_threads =
(((source_windows_count << target_bits_count) - source_windows_count) << (target_bits_count - 1 - j)) *
batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by nof threads
// per section
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
if (!is_odd_c || !is_first_iter) { // skip if c is odd and it's the first iteration
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
(1 << target_bits_count) - 1, nof_threads);
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
is_first_iter || (is_second_iter && is_odd_c) ? source_buckets : temp_buckets1,
is_last_iter ? target_buckets : temp_buckets1, 1 << (source_bits_count - j + (is_odd_c ? 1 : 0)),
is_last_iter ? 1 << target_bits_count : 0, 1 << target_bits_count, 0 /*=write_phase*/,
(1 << target_bits_count) - 1, nof_threads);
}
nof_threads = (((source_windows_count << (source_bits_count - target_bits_count)) - source_windows_count)
<< (target_bits_count - 1 - j)) *
batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by
// nof threads per section
NUM_THREADS = max(1, min(MAX_TH, nof_threads));
NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_reduction>>>(
is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0,
1 << (target_bits_count - (is_odd_c ? 1 : 0)), 1 /*=write_phase*/,
(1 << (target_bits_count - (is_odd_c ? 1 : 0))) - 1, nof_threads);
}
CHK_IF_RETURN(cudaEventRecord(event_finished_reduction, stream_reduction));
CHK_IF_RETURN(
cudaStreamWaitEvent(stream, event_finished_reduction)); // sync streams after every write to target_buckets
if (target_bits_count == 1) {
// Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed
// to be empty when target_windows_count > bitsize. For example, consider bitsize=253 and c=2: the reduction
// ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253 bits.
// Precomputation and odd c can cause additional empty windows.
nof_final_results_per_msm = min(c * nof_bms_per_msm, bitsize);
nof_bms_per_msm = target_windows_count;
nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0;
nof_bms_in_batch = nof_bms_per_msm * batch_size;
unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * total_nof_final_results, stream));
CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
NUM_THREADS = 32;
NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms_in_batch);
NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
target_buckets, final_results, nof_final_results_per_msm, batch_size, nof_bms_per_msm, c);
c = 1;
CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream));
CHK_IF_RETURN(cudaFreeAsync(temp_buckets1, stream));
CHK_IF_RETURN(cudaFreeAsync(temp_buckets2, stream));
CHK_IF_RETURN(cudaStreamDestroy(stream_reduction));
break;
}
CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
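To make the loop's shape concrete, a standalone host-side trace (a sketch under assumed inputs, not library code) of how each pass halves the window bit-count and doubles the bucket-module count:

```cpp
#include <cstdio>

// Trace of the recursive reduction schedule driven by the loop above:
// every pass halves the source window bit-count (rounded up) and doubles
// the number of bucket modules, stopping once 1-bit windows remain.
int main() {
  unsigned bits = 15, windows = 17; // assumption: 254-bit scalars, c = 15
  while (bits > 1) {
    unsigned target_bits = (bits + 1) >> 1; // half the bits, rounded up
    windows <<= 1;                          // twice the number of bms
    std::printf("%2u-bit -> %2u-bit windows, %3u bms\n", bits, target_bits, windows);
    bits = target_bits;
  }
  return 0; // prints 15->8 (34 bms), 8->4 (68), 4->2 (136), 2->1 (272)
}
```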
@@ -803,8 +850,8 @@ namespace msm {
NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
// launch the double and add kernel, a single thread per batch element
final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm,
nof_empty_bms_per_batch, c);
final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size,
nof_final_results_per_msm, c);
CHK_IF_RETURN(cudaFreeAsync(final_results, stream));
if (!are_results_on_device)
@@ -829,13 +876,7 @@ namespace msm {
const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize;
cudaStream_t& stream = config.ctx.stream;
unsigned c = config.batch_size > 1 ? ((config.c == 0) ? get_optimal_c(msm_size) : config.c) : 16;
// reduce c to closest power of two (from below) if not using big_triangle reduction logic
// TODO: support arbitrary values of c
if (!config.is_big_triangle) {
while ((c & (c - 1)) != 0)
c &= (c - 1);
}
unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;
return CHK_STICKY(bucket_method_msm(
bitsize, c, scalars, points, config.batch_size, msm_size,
@@ -846,7 +887,33 @@ namespace msm {
}
template <typename A, typename P>
cudaError_t precompute_msm_bases(
cudaError_t precompute_msm_points(A* points, int msm_size, MSMConfig& config, A* output_points)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;
CHK_IF_RETURN(cudaMemcpyAsync(
output_points, points, sizeof(A) * config.points_size,
config.are_points_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream));
unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1;
unsigned shift = c * ((total_nof_bms - 1) / config.precompute_factor + 1);
unsigned NUM_THREADS = 1 << 8;
unsigned NUM_BLOCKS = (config.points_size + NUM_THREADS - 1) / NUM_THREADS;
for (int i = 1; i < config.precompute_factor; i++) {
left_shift_kernel<A, P><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
&output_points[(i - 1) * config.points_size], shift, config.points_size,
&output_points[i * config.points_size]);
}
return CHK_LAST();
}
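A quick numeric check of the shift formula above (the inputs are assumptions for illustration; 254 is SCALAR_FF_NBITS for bn254):

```cpp
#include <cstdio>

// Worked example of the per-copy shift computed by precompute_msm_points.
int main() {
  const unsigned scalar_nbits = 254;    // P::SCALAR_FF_NBITS (bn254)
  const unsigned c = 16;                // window size
  const unsigned precompute_factor = 2; // config.precompute_factor
  unsigned total_nof_bms = (scalar_nbits - 1) / c + 1;                // = 16 windows
  unsigned shift = c * ((total_nof_bms - 1) / precompute_factor + 1); // = 16 * 8 = 128
  std::printf("windows=%u shift=%u\n", total_nof_bms, shift);
  return 0; // each precomputed copy holds 2^128 * P_i relative to the previous one
}
```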
template <typename A, typename P>
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,

View File

@@ -1,3 +1,9 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"
#include "msm.cu"
#include <chrono>
@@ -9,7 +15,7 @@
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"
using namespace bn254;
// using namespace bn254;
class Dummy_Scalar
{
@@ -111,20 +117,34 @@ public:
// switch between dummy and real:
typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;
// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;
int main()
int main(int argc, char** argv)
{
int batch_size = 1;
cudaEvent_t start, stop;
float msm_time;
int msm_log_size = (argc > 1) ? atoi(argv[1]) : 17;
int msm_size = 1 << msm_log_size;
int batch_size = (argc > 2) ? atoi(argv[2]) : 4;
// unsigned msm_size = 1<<21;
int msm_size = 12180757;
int N = batch_size * msm_size;
int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1;
int user_c = (argc > 4) ? atoi(argv[4]) : 15;
printf(
"running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n", CURVE_ID, msm_log_size, batch_size,
precomp_factor, user_c);
test_scalar* scalars = new test_scalar[N];
test_affine* points = new test_affine[N];
@@ -136,7 +156,8 @@ int main()
// projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
// test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
test_projective large_res[2];
test_projective res[batch_size];
test_projective ref[batch_size];
// test_projective batched_large_res[batch_size];
// fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
// fake_point batched_large_res[256];
@@ -149,13 +170,17 @@ int main()
test_scalar* scalars_d;
test_affine* points_d;
test_projective* large_res_d;
test_affine* precomp_points_d;
test_projective* res_d;
test_projective* ref_d;
cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
cudaMalloc(&large_res_d, sizeof(test_projective));
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
cudaMalloc(&scalars_d, sizeof(test_scalar) * N);
cudaMalloc(&points_d, sizeof(test_affine) * N);
cudaMalloc(&precomp_points_d, sizeof(test_affine) * N * precomp_factor);
cudaMalloc(&res_d, sizeof(test_projective) * batch_size);
cudaMalloc(&ref_d, sizeof(test_projective) * batch_size);
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice);
std::cout << "finished copying" << std::endl;
@@ -170,65 +195,93 @@ int main()
0, // mempool
};
msm::MSMConfig config = {
ctx, // DeviceContext
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
ctx, // DeviceContext
N, // points_size
precomp_factor, // precompute_factor
user_c, // c
0, // bitsize
10, // large_bucket_factor
batch_size, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
true, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
// false, // segments_reduction
};
auto begin1 = std::chrono::high_resolution_clock::now();
msm::msm<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
cudaEvent_t msm_end_event;
cudaEventCreate(&msm_end_event);
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
cudaEventCreate(&start);
cudaEventCreate(&stop);
if (precomp_factor > 1)
msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);
// warm up
msm::msm<test_scalar, test_affine, test_projective>(
scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
cudaDeviceSynchronize();
// auto begin1 = std::chrono::high_resolution_clock::now();
cudaEventRecord(start, stream);
msm::msm<test_scalar, test_affine, test_projective>(
scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
cudaEventRecord(stop, stream);
cudaStreamSynchronize(stream);
cudaEventElapsedTime(&msm_time, start, stop);
// cudaEvent_t msm_end_event;
// cudaEventCreate(&msm_end_event);
// auto end1 = std::chrono::high_resolution_clock::now();
// auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("msm time : %.3f ms.\n", msm_time);
// reference
config.c = 16;
config.precompute_factor = 1;
config.is_big_triangle = true;
config.are_results_on_device = false;
cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1])
<< std::endl;
auto begin = std::chrono::high_resolution_clock::now();
msm::msm<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
config.batch_size = 1;
config.points_size = msm_size;
// config.segments_reduction = false;
for (int i = 0; i < batch_size; i++) {
msm::msm<test_scalar, test_affine, test_projective>(
scalars + i * msm_size, points_d + i * msm_size, msm_size, config, ref_d + i);
}
// config.are_results_on_device = false;
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
// auto begin = std::chrono::high_resolution_clock::now();
// msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
// test_reduce_triangle(scalars);
// test_reduce_rectangle(scalars);
// test_reduce_single(scalars);
// test_reduce_var(scalars);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
// printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
// std::cout<<"final results batched large"<<std::endl;
// bool success = true;
// for (unsigned i = 0; i < batch_size; i++)
// {
// std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
// if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
// std::cout<<"good"<<std::endl;
// }
// else{
// std::cout<<"miss"<<std::endl;
// std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
// success = false;
// }
// }
// if (success){
// std::cout<<"success!"<<std::endl;
// }
bool success = true;
for (unsigned i = 0; i < batch_size; i++) {
std::cout << test_projective::to_affine(res[i]) << std::endl;
if (test_projective::to_affine(res[i]) == test_projective::to_affine(ref[i])) {
std::cout << "good" << std::endl;
} else {
std::cout << "miss" << std::endl;
std::cout << test_projective::to_affine(ref[i]) << std::endl;
success = false;
}
}
if (success) { std::cout << "success!" << std::endl; }
// std::cout<<batched_large_res[0]<<std::endl;
// std::cout<<batched_large_res[1]<<std::endl;
@@ -239,4 +292,4 @@ int main()
// std::cout<<pr<<std::endl;
return 0;
}
}
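For reference, the reworked test binary takes its parameters from argv: e.g. `work/test_msm 17 4 2 15` runs a batch of 4 MSMs of size 2^17 with precompute factor 2 and c = 15 (defaults: 17, 4, 1, 15).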

View File

@@ -116,13 +116,13 @@ func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfi
return scalars.AsUnsafePointer(), points.AsUnsafePointer(), results.AsUnsafePointer(), size, unsafe.Pointer(cfg)
}
func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
func PrecomputePointsCheck(points HostOrDeviceSlice, cfg *MSMConfig, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
outputBasesLength, pointsLength := outputBases.Len(), points.Len()
if outputBasesLength != pointsLength*int(precomputeFactor) {
if outputBasesLength != pointsLength*int(cfg.PrecomputeFactor) {
errorString := fmt.Sprintf(
"Precompute factor is probably incorrect: expected %d but got %d",
outputBasesLength/pointsLength,
precomputeFactor,
cfg.PrecomputeFactor,
)
panic(errorString)
}
@@ -131,5 +131,8 @@ func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outp
points.(DeviceSlice).CheckDevice()
}
cfg.pointsSize = int32(pointsLength)
cfg.arePointsOnDevice = points.IsOnDevice()
return points.AsUnsafePointer(), outputBases.AsUnsafePointer()
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_377_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_377_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_377_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bls12_377_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_377_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_377_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_377_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bls12_377_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBls12_377.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBls12_377.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_381_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bls12_381_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bls12_381_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bls12_381_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bls12_381_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bls12_381_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bls12_381_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bls12_381_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBls12_381.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBls12_381.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bn254_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bn254_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bn254_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bn254_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bn254_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bn254_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bn254_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bn254_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBn254.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBn254.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bw6_761_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
cudaError_t bw6_761_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
cudaError_t bw6_761_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
return err
}
// Deprecated: G2PrecomputeBases exists for backward compatibility.
// It may cause issues if precomputed points are used in an MSM with a different `c` value, and it will be removed in a future version.
// G2PrecomputePoints should be used instead.
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
err := (cr.CudaError)(__ret)
return err
}
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.g2_affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
__ret := C.bw6_761_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}
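For illustration, a minimal sketch of the new precompute-then-MSM flow against these G2 bindings (point/scalar setup and error handling are elided assumptions, not part of this PR):

cfg := g2.G2GetDefaultMSMConfig()
cfg.PrecomputeFactor = 8 // set once; the same cfg must also be passed to G2Msm

// Room for PrecomputeFactor extended copies of every base point.
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(cfg.PrecomputeFactor), points[0].Size())

// The second argument is the size of a single MSM; for a non-batched call it equals points.Len().
e = g2.G2PrecomputePoints(points, points.Len(), &cfg, precomputeOut)

var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(p.Size(), p.Size()) // one projective result
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
_ = e // each call returns cr.CudaError; check against cr.CudaSuccess as in the tests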

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t bw6_761_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t bw6_761_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t bw6_761_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.bw6_761_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -170,9 +170,11 @@ func TestMSMG2Batch(t *testing.T) {
}
}
func TestPrecomputeBaseG2(t *testing.T) {
func TestPrecomputePointsG2(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p g2.G2Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[g2.G2Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleBw6_761.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleBw6_761.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t grumpkin_msm_cuda(const scalar_t* scalars,const affine_t* points, int count, MSMConfig* config, projective_t* out);
cudaError_t grumpkin_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
cudaError_t grumpkin_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
#ifdef __cplusplus
}

View File

@@ -28,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
return err
}
// Deprecated: PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// PrecomputePoints should be used instead.
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
err := (cr.CudaError)(__ret)
return err
}
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.affine_t)(outputBasesPointer)
__ret := C.grumpkin_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}
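To make the migration from the deprecated API concrete, a hedged before/after sketch (msmSize, points, and precomputeOut are assumed to be set up as in the tests below; `0` in the old call requested the default `c`):

cfg := msm.GetDefaultMSMConfig()

// Before (deprecated): the precompute factor and c were explicit arguments.
e := msm.PrecomputeBases(points, 8, 0, &cfg.Ctx, precomputeOut)

// After: both travel inside the config, which must be the same config later passed to Msm.
cfg.PrecomputeFactor = 8
e = msm.PrecomputePoints(points, msmSize, &cfg, precomputeOut)
_ = e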

View File

@@ -64,9 +64,11 @@ func TestMSMBatch(t *testing.T) {
}
}
func TestPrecomputeBase(t *testing.T) {
func TestPrecomputePoints(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -76,20 +78,18 @@ func TestPrecomputeBase(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p icicleGrumpkin.Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = msm.Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[icicleGrumpkin.Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -28,8 +28,13 @@ func {{.CurvePrefix}}Msm(scalars core.HostOrDeviceSlice, points core.HostOrDevic
return err
}
// Deprecated: {{.CurvePrefix}}PrecomputeBases exists for backward compatibility.
// It may cause issues if an MSM is later run with a `c` value different from the one used for precomputation, and it will be removed in a future version.
// {{.CurvePrefix}}PrecomputePoints should be used instead.
func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
cfg := {{.CurvePrefix}}GetDefaultMSMConfig()
cfg.PrecomputeFactor = precomputeFactor
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cPointsLen := (C.int)(points.Len())
@@ -43,3 +48,16 @@ func {{.CurvePrefix}}PrecomputeBases(points core.HostOrDeviceSlice, precomputeFa
err := (cr.CudaError)(__ret)
return err
}
func {{.CurvePrefix}}PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
cPoints := (*C.{{toCName .CurvePrefix}}affine_t)(pointsPointer)
cMsmSize := (C.int)(msmSize)
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
cOutputBases := (*C.{{toCName .CurvePrefix}}affine_t)(outputBasesPointer)
__ret := C.{{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
err := (cr.CudaError)(__ret)
return err
}

View File

@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_msm_cuda(const scalar_t* scalars,const {{toCName .CurvePrefix}}affine_t* points, int count, MSMConfig* config, {{toCName .CurvePrefix}}projective_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_bases_cuda({{toCName .CurvePrefix}}affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, {{toCName .CurvePrefix}}affine_t* out);
cudaError_t {{.Curve}}{{toCNameBackwards .CurvePrefix}}_precompute_msm_points_cuda({{toCName .CurvePrefix}}affine_t* points, int msm_size, MSMConfig* config, {{toCName .CurvePrefix}}affine_t* out);
#ifdef __cplusplus
}

View File

@@ -251,9 +251,11 @@ func TestMSM{{.CurvePrefix}}Batch(t *testing.T) {
}
}
func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {
func TestPrecomputePoints{{.CurvePrefix}}(t *testing.T) {
cfg := {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}GetDefaultMSMConfig()
const precomputeFactor = 8
cfg.PrecomputeFactor = precomputeFactor
for _, power := range []int{10, 16} {
for _, batchSize := range []int{1, 3, 16} {
size := 1 << power
@@ -263,20 +265,18 @@ func TestPrecomputeBase{{.CurvePrefix}}(t *testing.T) {
var precomputeOut core.DeviceSlice
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}PrecomputePoints(points, size, &cfg, precomputeOut)
assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
var p {{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective
var out core.DeviceSlice
_, e = out.Malloc(batchSize*p.Size(), p.Size())
assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
cfg.PrecomputeFactor = precomputeFactor
assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
e = {{if eq .CurvePrefix "G2"}}g2{{else}}msm{{end}}.{{.CurvePrefix}}Msm(scalars, precomputeOut, &cfg, out)
assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
outHost := make(core.HostSlice[{{if ne .CurvePrefix "G2"}}icicle{{capitalize .Curve}}{{else}}g2{{end}}.{{.CurvePrefix}}Projective], batchSize)
outHost.CopyFromDevice(&out)
out.Free()

View File

@@ -105,6 +105,13 @@ pub trait MSM<C: Curve> {
ctx: &DeviceContext,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;
fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>;
}
/// Computes the multi-scalar multiplication, or MSM: `s1*P1 + s2*P2 + ... + sn*Pn`, or a batch of several MSMs.
@@ -188,7 +195,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `bases` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
/// * `points` - Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `precompute_factor` - The number of total precomputed points for each base (including the base itself).
///
@@ -201,6 +208,7 @@ pub fn msm<C: Curve + MSM<C>>(
/// * `output_bases` - Device-allocated buffer of size `bases_size` * `precompute_factor` for the extended bases.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
#[deprecated(since = "2.5.0", note = "Please use `precompute_points` instead")]
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
precompute_factor: i32,
@@ -220,6 +228,55 @@ pub fn precompute_bases<C: Curve + MSM<C>>(
C::precompute_bases_unchecked(points, precompute_factor, _c, ctx, output_bases)
}
/// A function that precomputes MSM bases by extending them with their shifted copies.
/// e.g.:
/// Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
/// Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
/// 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
///
/// * `points` - Base points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
///
/// * `msm_size` - The size of a single MSM to compute.
///
/// * `cfg` - config used to specify extra arguments of the MSM. NOTE: set the `precompute_factor`
/// (and, if a non-default window size is wanted, the `c` value) in this config before calling; the same config must then be passed to `msm`.
///
/// * `output_bases` - Device-allocated buffer of size `points.len()` * `precompute_factor` for the extended points.
///
/// Returns `Ok(())` if no errors occurred or a `CudaError` otherwise.
pub fn precompute_points<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()> {
let precompute_factor = cfg.precompute_factor;
assert_eq!(
output_bases.len(),
points.len() * (precompute_factor as usize),
"Precompute factor is probably incorrect: expected {} but got {}",
output_bases.len() / points.len(),
precompute_factor
);
assert!(output_bases.is_on_device());
let ctx_device_id = cfg
.ctx
.device_id;
if let Some(device_id) = points.device_id() {
assert_eq!(
device_id, ctx_device_id,
"Device ids in points and context are different"
);
}
check_device(ctx_device_id);
let mut local_cfg = cfg.clone();
local_cfg.points_size = points.len() as i32;
local_cfg.are_points_on_device = points.is_on_device();
C::precompute_points_unchecked(points, msm_size, &local_cfg, output_bases)
}
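As a worked toy instance of the extension scheme described in the doc comment above (illustrative numbers only; the choice of the shift $l$ is internal to the implementation):

$$\text{precompute\_factor} = 3,\quad (P_0, P_1) \;\longmapsto\; (P_0,\, P_1,\; 2^{l}P_0,\, 2^{l}P_1,\; 2^{2l}P_0,\, 2^{2l}P_1)$$

Splitting a scalar as $s = a + 2^{l}b + 2^{2l}d$ then gives $sP_i = aP_i + b\,(2^{l}P_i) + d\,(2^{2l}P_i)$, so the high windows reuse the same low-window bucket accumulation against the precomputed copies.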
#[macro_export]
macro_rules! impl_msm {
(
@@ -250,6 +307,14 @@ macro_rules! impl_msm {
ctx: &DeviceContext,
output_bases: *mut Affine<$curve>,
) -> CudaError;
#[link_name = concat!($curve_prefix, "_precompute_msm_points_cuda")]
pub(crate) fn precompute_points_cuda(
points: *const Affine<$curve>,
msm_size: i32,
cfg: &MSMConfig,
output_bases: *mut Affine<$curve>,
) -> CudaError;
}
}
@@ -292,6 +357,23 @@ macro_rules! impl_msm {
.wrap()
}
}
fn precompute_points_unchecked(
points: &(impl HostOrDeviceSlice<Affine<$curve>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<$curve>>,
) -> IcicleResult<()> {
unsafe {
$curve_prefix_indent::precompute_points_cuda(
points.as_ptr(),
msm_size,
cfg,
output_bases.as_mut_ptr(),
)
.wrap()
}
}
}
};
}
@@ -363,7 +445,7 @@ macro_rules! impl_msm_bench {
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
use icicle_core::msm::precompute_bases;
use icicle_core::msm::precompute_points;
use icicle_core::msm::tests::generate_random_affine_points_with_zeroes;
use icicle_cuda_runtime::stream::CudaStream;
@@ -396,11 +478,11 @@ macro_rules! impl_msm_bench {
let points = generate_random_affine_points_with_zeroes(test_size, 10);
for precompute_factor in [1, 4, 8] {
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();
@@ -419,7 +501,6 @@ macro_rules! impl_msm_bench {
let scalars_h = HostSlice::from_slice(&scalars);
let mut msm_results = DeviceVec::<Projective<C>>::cuda_malloc(batch_size).unwrap();
cfg.precompute_factor = precompute_factor as i32;
let bench_descr = format!(
" {} x {} with precomp = {:?}",

View File

@@ -1,5 +1,5 @@
use crate::curve::{Affine, Curve, Projective};
use crate::msm::{msm, precompute_bases, MSMConfig, MSM};
use crate::msm::{msm, precompute_points, MSMConfig, MSM};
use crate::traits::{FieldImpl, GenerateRandom};
use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
@@ -120,16 +120,16 @@ where
cfg.is_async = true;
cfg.large_bucket_factor = 5;
cfg.c = 4;
let precompute_factor = 8;
warmup(&stream).unwrap();
for test_size in test_sizes {
let precompute_factor = 8;
let points = generate_random_affine_points_with_zeroes(test_size, 10);
let mut precomputed_points_d = DeviceVec::cuda_malloc(precompute_factor * test_size).unwrap();
precompute_bases(
cfg.precompute_factor = precompute_factor as i32;
precompute_points(
HostSlice::from_slice(&points),
precompute_factor as i32,
0,
&cfg.ctx,
test_size as i32,
&cfg,
&mut precomputed_points_d,
)
.unwrap();