no device sync

works without the sort - temp fix
fixing streams - WIP
2026-01-14 09:58:02 -05:00 · 2023-07-05 11:07:07 +03:00 · 2023-07-05 10:55:59 +03:00 · 2023-07-05 09:50:04 +03:00 · 2023-07-02 16:13:51 +03:00 · 2023-07-02 14:49:19 +03:00
10 changed files with 1146 additions and 94 deletions
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -3,7 +3,7 @@
 #pragma once

 template <typename S, typename P, typename A>
-void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream);
+void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream);

 template <typename S, typename P, typename A>
 void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream);
@@ -12,7 +12,7 @@ template <typename S, typename P, typename A>
 void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream);

 template <typename S, typename P, typename A>
-void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream);
+void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream);

 template <typename S, typename P, typename A>
 void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream);
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -5,15 +5,19 @@
 #include "../../utils/cuda_utils.cuh"
 #include "../../primitives/projective.cuh"
 #include "../../primitives/field.cuh"
-#include "../../curves/bls12_381/curve_config.cuh"
+// #include "../../curves/bls12_377/curve_config.cuh"
+#include "../../curves/bn254/curve_config.cuh"

-using namespace BLS12_381;
+// using namespace BLS12_377;
+using namespace BN254;

 class Dummy_Scalar {
  public:
    static constexpr unsigned NBITS = 32;

    unsigned x;
+    // unsigned p = 10;
+    unsigned p = 1<<30;

    friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) {
      os << scalar.x;
@@ -25,7 +29,7 @@ class Dummy_Scalar {
    }

    friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) {   
-      return {p1.x+p2.x};
+      return {(p1.x+p2.x)%p1.p};
    }

    friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) {
@@ -36,10 +40,11 @@ class Dummy_Scalar {
      return (p1.x == p2);
    }

-    // static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { 
-    //   return {Dummy_Scalar::neg(point.x)};
-    // }
+    static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { 
+      return {scalar.p-scalar.x};
+    }
    static HOST_INLINE Dummy_Scalar rand_host() {
+      // return {(unsigned)rand()%10};
      return {(unsigned)rand()};
    }
 };
@@ -53,6 +58,10 @@ class Dummy_Projective {
      return {0};
    }

+    static HOST_DEVICE_INLINE Dummy_Projective one() {
+      return {1};
+    }
+
    static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) {
      return {point.x};
    }
@@ -61,9 +70,9 @@ class Dummy_Projective {
      return {point.x};
    }

-    // static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { 
-    //   return {Dummy_Scalar::neg(point.x)};
-    // }
+    static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { 
+      return {Dummy_Scalar::neg(point.x)};
+    }

    friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) {   
      return {p1.x+p2.x};
@@ -119,62 +128,68 @@ typedef affine_t test_affine;

 int main()
 {
-  unsigned batch_size = 4;
-  unsigned msm_size = 1<<15;
+  unsigned batch_size = 1;
+  unsigned msm_size = 1<<24;
  unsigned N = batch_size*msm_size;

  test_scalar *scalars = new test_scalar[N];
  test_affine *points = new test_affine[N];
  
  for (unsigned i=0;i<N;i++){
-    scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
+    // scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
    points[i] = (i%msm_size < 10)? test_projective::to_affine(test_projective::rand_host()): points[i-10];
-    // scalars[i] = test_scalar::rand_host();
+    scalars[i] = test_scalar::rand_host();
    // points[i] = test_projective::to_affine(test_projective::rand_host());
  }
  std::cout<<"finished generating"<<std::endl;

  // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
  // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
-  test_projective large_res[batch_size];
-  test_projective batched_large_res[batch_size];
+  test_projective large_res[batch_size*2];
+  // test_projective batched_large_res[batch_size];
  // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
  // fake_point batched_large_res[256];


  // short_msm<scalar_t, projective_t, affine_t>(scalars, points, N, short_res);
-  for (unsigned i=0;i<batch_size;i++){
-    large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
+  // for (unsigned i=0;i<batch_size;i++){
+    // large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
    // std::cout<<"final result large"<<std::endl;
    // std::cout<<test_projective::to_affine(*large_res)<<std::endl;
-  }
+  // }
  auto begin = std::chrono::high_resolution_clock::now();
-  batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
-  // large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false);
+  // batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
+  large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false, true,0);
+  // std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
+  large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res+1, false, false,0);
+  // test_reduce_triangle(scalars);
+  // test_reduce_rectangle(scalars);
+  // test_reduce_single(scalars);
  auto end = std::chrono::high_resolution_clock::now();
  auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
  std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
+  std::cout<<test_projective::to_affine(large_res[1])<<std::endl;

  // reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);

-  std::cout<<"final results batched large"<<std::endl;
-  bool success = true;
-  for (unsigned i = 0; i < batch_size; i++)
-  {
-    std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
-    if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
-      std::cout<<"good"<<std::endl;
-    }
-    else{
-      std::cout<<"miss"<<std::endl;
-      std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
-      success = false;
-    }
-  }
-  if (success){
-    std::cout<<"success!"<<std::endl;
-  }
+  // std::cout<<"final results batched large"<<std::endl;
+  // bool success = true;
+  // for (unsigned i = 0; i < batch_size; i++)
+  // {
+  //   std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
+  //   if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
+  //     std::cout<<"good"<<std::endl;
+  //   }
+  //   else{
+  //     std::cout<<"miss"<<std::endl;
+  //     std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
+  //     success = false;
+  //   }
+  // }
+  // if (success){
+  //   std::cout<<"success!"<<std::endl;
+  // }
  
  // std::cout<<batched_large_res[0]<<std::endl;
  // std::cout<<batched_large_res[1]<<std::endl;
--- a/icicle/curves/bls12_377/msm.cu
+++ b/icicle/curves/bls12_377/msm.cu
@@ -12,7 +12,7 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[
 {
    try
    {
-        large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, stream);
+        large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(scalars, points, count, out, false, false, stream);
        return CUDA_SUCCESS;
    }
    catch (const std::runtime_error &ex)
@@ -53,7 +53,7 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377:
 {
     try
     {
-         large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true, stream);
+         large_msm<BLS12_377::scalar_t, BLS12_377::projective_t, BLS12_377::affine_t>(d_scalars, d_points, count, d_out, true, false, stream);
         cudaStreamSynchronize(stream);
         return 0;
     }
--- a/icicle/curves/bls12_381/msm.cu
+++ b/icicle/curves/bls12_381/msm.cu
@@ -12,7 +12,7 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[
 {
    try
    {
-        large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, stream);
+        large_msm<BLS12_381::scalar_t, BLS12_381::projective_t, BLS12_381::affine_t>(scalars, points, count, out, false, false, stream);
        return CUDA_SUCCESS;
    }
    catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381:
 {
     try
     {
-         large_msm(d_scalars, d_points, count, d_out, true, stream);
+         large_msm(d_scalars, d_points, count, d_out, true, false, stream);
         cudaStreamSynchronize(stream);
         return 0;
     }
--- a/icicle/curves/bls12_381/supported_operations.cu
+++ b/icicle/curves/bls12_381/supported_operations.cu
@@ -2,4 +2,4 @@
 #include "lde.cu"
 #include "msm.cu"
 #include "ve_mod_mult.cu"
-#include "poseidon.cu"
+#include "poseidon.cu"
--- a/icicle/curves/bn254/msm.cu
+++ b/icicle/curves/bn254/msm.cu
@@ -12,7 +12,7 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[],
 {
    try
    {
-        large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, stream);
+        large_msm<BN254::scalar_t, BN254::projective_t, BN254::affine_t>(scalars, points, count, out, false, false, stream);
        return CUDA_SUCCESS;
    }
    catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po
 {
     try
     {
-         large_msm(d_scalars, d_points, count, d_out, true, stream);
+         large_msm(d_scalars, d_points, count, d_out, true, false, stream);
         cudaStreamSynchronize(stream);
         return 0;
     }
--- a/icicle/curves/bn254/projective.cu
+++ b/icicle/curves/bn254/projective.cu
@@ -16,4 +16,4 @@ extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective
  !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) && 
  !((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero()));
 }
-#endif
+#endif
--- a/icicle/curves/curve_template/msm.cu
+++ b/icicle/curves/curve_template/msm.cu
@@ -11,7 +11,7 @@ int msm_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *out, ${CURVE_NAME_U}
 {
    try
    {
-        large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, stream);
+        large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, false, stream);
        return CUDA_SUCCESS;
    }
    catch (const std::runtime_error &ex)
@@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* out
 {
     try
     {
-         large_msm(d_scalars, d_points, count, d_out, true, stream);
+         large_msm(d_scalars, d_points, count, d_out, true, false, stream);
         cudaStreamSynchronize(stream);
         return 0;
     }
--- a/icicle/curves/curve_template/projective.cu
+++ b/icicle/curves/curve_template/projective.cu
@@ -16,4 +16,4 @@ extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1,
  !((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) && 
  !((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero()));
 }
-#endif
+#endif
Author	SHA1	Message	Date
hadaringonyama	b39b529463	no device sync	2023-07-05 11:07:07 +03:00
hadaringonyama	86bee3af42	works without the sort - temp fix	2023-07-05 10:55:59 +03:00
hadaringonyama	dbd5cd4cbb	fixing streams - WIP	2023-07-05 09:50:04 +03:00
hadaringonyama	655f014dc2	merge with dev passes all tests	2023-07-02 16:13:51 +03:00
hadaringonyama	18c7cad89c	Merge remote-tracking branch 'origin/dev' into msm-performance	2023-07-02 14:49:19 +03:00
hadaringonyama	1e9f628235	c=16 works	2023-07-02 12:07:46 +03:00
hadaringonyama	2e45ed1bd4	reduction - simple implementation, c=8 works	2023-06-29 19:12:53 +03:00
hadaringonyama	e828d1da2a	c=8 works	2023-06-28 16:23:13 +03:00
hadaringonyama	233927668c	triangle block mix fix	2023-06-28 12:31:38 +03:00
guy-ingo	1866df60f1	new kernels implementation, no correctness	2023-06-26 18:25:38 +03:00
guy-ingo	ccc8892a52	rectangle kernel works for powers of 2	2023-06-26 17:56:02 +03:00
guy-ingo	67e4ee2864	triangle kernel works as expected	2023-06-26 16:51:42 +03:00
guy-ingo	6c5fe47e55	big triangle replacment works	2023-06-25 15:28:56 +03:00
hadaringonyama	ed9de3d1e9	signed digits working	2023-06-20 14:48:13 +03:00
hadaringonyama	d01e0dbfb1	signed + top bm - WIP	2023-06-18 18:10:11 +03:00
hadaringonyama	6aa6fe0c1c	adding sort by bucket size	2023-06-18 11:27:07 +03:00
hadaringonyama	a64df640de	temp	2023-05-30 09:37:15 +03:00
hadaringonyama	1b2b9f2826	code cleaning	2023-05-29 09:22:10 +03:00
hadaringonyama	0a36a545bf	remove short msm from extern call	2023-05-28 16:24:04 +03:00
hadaringonyama	407273dee3	bugs fixed	2023-05-28 16:13:47 +03:00
hadaringonyama	f55bd30e13	bugs fixed	2023-05-28 15:53:02 +03:00
HadarIngonyama	071c24ce5a	supporting new curves (#74 ) * Fix for local machines GoogleTest and CMake (#70) GoogleTest fix, updated readme * Supporting Additional Curves (#72) * init commit - changes for supporting new curves * refactor + additional curve (bls12-377 works, bn254 - not yet) * general refactor + curves script + fixing bn245 * revert unnecessary changes + refactor new curve script * add README and fix limbs_p=limbs_q case in python script --------- Co-authored-by: Vitalii Hnatyk <vhnatyk@gmail.com> Co-authored-by: guy-ingo <106763145+guy-ingo@users.noreply.github.com>	2023-05-15 15:51:48 +03:00
DmytroTym	08c34a5183	Fix for local machines GoogleTest and CMake (#70 ) (#73 ) GoogleTest fix, updated readme Co-authored-by: Vitalii Hnatyk <vhnatyk@gmail.com>	2023-05-15 15:23:06 +03:00