Mirror of https://github.com/pseXperiments/icicle.git, synced 2026-01-13 01:17:57 -05:00.

Compare commits — 1 commit: 1ae9c560b5

@@ -34,24 +34,6 @@ The Core is split into logical modules that can be compiled into static librarie
 | --- | :---: |
 | Keccak | 256, 512 |
-
-## Enhanced MSM Support
-
-Since v2.1.0, ICICLE introduces enhanced support for various window sizes in Multi-Scalar Multiplication (MSM). This feature allows for optimized performance by adjusting the window size based on the application's needs.
-
-### Example: Enhanced MSM Support in Rust
-
-```rust
-extern crate icicle;
-
-fn main() {
-    let msm = icicle::Msm::new();
-    let result = msm.compute_with_window_size(8);
-    println!("{:?}", result);
-}
-```
-
-This example demonstrates how to initialize the MSM module and perform a computation with a specified window size. The **compute_with_window_size** function allows for optimized performance by adjusting the window size based on the application's needs.
-
 
 ## Compilation strategies
 
 Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields, you compile ICICLE once per curve or field that is needed. It's that simple. Currently, the following choices are supported:

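The example diffs later in this commit apply exactly this strategy: ICICLE is built once per curve, both static libraries are linked into one binary, and the two APIs stay separate because each build lives in its own namespace. A minimal sketch of that pattern, mirroring the `api/bn254.h` / `api/bls12_377.h` headers and the `rand_host_many` calls used in the example below (the alias names are illustrative):

```cpp
#include <iostream>

// One ICICLE build per curve; each exposes its API in its own namespace,
// so both static libraries can be linked into the same binary.
#include "api/bn254.h"
#include "api/bls12_377.h"

typedef bn254::scalar_t scalar_bn254;
typedef bls12_377::scalar_t scalar_bls12377;

int main()
{
  scalar_bn254 a[1];
  scalar_bls12377 b[1];
  scalar_bn254::rand_host_many(a, 1);     // sample a bn254 scalar on the host
  scalar_bls12377::rand_host_many(b, 1);  // sample a bls12-377 scalar on the host
  std::cout << "one scalar sampled per curve" << std::endl;
  return 0;
}
```
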
@@ -134,25 +134,3 @@ Replace `/path/to/shared/libs` with the actual path where the shared libraries a
 | Polynomials | ✅ |
 | NTT | ✅ |
 | Extension Field | ✅ |
-
-## Poseidon API
-
-Since v2.2.0, ICICLE introduces the Poseidon hash function in the Golang bindings. This update allows developers to utilize Poseidon for efficient cryptographic hashing in their Go applications.
-
-package main
-
-import (
-	"fmt"
-	"github.com/ingonyama-zk/icicle"
-)
-
-func main() {
-	hash := icicle.NewPoseidonHash()
-	result := hash.Hash([]byte("example data"))
-	fmt.Println(result)
-}
-
-### Explanation:
-
-This example shows how to create a new Poseidon hash instance and use it to hash a byte array. The NewPoseidonHash function initializes the hash, and the Hash method computes the hash of the input data.
-

@@ -51,24 +51,6 @@ The Polynomial class encapsulates a polynomial, providing a variety of operation
 - **Manipulation**: Features like slicing polynomials, adding or subtracting monomials in place, and computing polynomial degrees.
 - **Memory Access**: Access internal states or obtain device-memory views of polynomials.
-
-## Polynomial API Improvements
-
-Since v2.3.0, ICICLE includes various fixes and performance enhancements for the Polynomial API, making it more robust and efficient for polynomial operations.
-
-### Example: Polynomial API Improvements in C++
-```cpp
-#include <icicle/polynomial.h>
-
-void improved_polynomial() {
-  icicle::Polynomial p;
-  p.coefficients = {4, 5, 6}; // p(x) = 6x^2 + 5x + 4
-  p.print();
-}
-```
-
-### Explanation
-This example illustrates how to define and print a polynomial using the improved Polynomial API. The coefficients are set, and the polynomial is printed to the console.
-
 ## Usage
 
 This section outlines how to use the Polynomial API in C++. Bindings for Rust and Go are detailed under the Bindings sections.

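The bullets above describe the Polynomial operations only in prose. As a hedged sketch of how they might be exercised in C++, building on the `icicle::Polynomial` usage shown in the removed example (the `degree` and `evaluate` members here are illustrative assumptions, not confirmed ICICLE signatures):

```cpp
#include <iostream>
#include <icicle/polynomial.h>

// Illustrative sketch only: `coefficients` and `print` follow the doc
// example above; `degree()` and `evaluate()` are assumed names for the
// "evaluation" and "manipulation" features the bullets describe.
void polynomial_sketch()
{
  icicle::Polynomial p;
  p.coefficients = {4, 5, 6};         // p(x) = 6x^2 + 5x + 4
  p.print();
  std::cout << p.degree() << "\n";    // expected: 2
  std::cout << p.evaluate(2) << "\n"; // expected: 6*4 + 5*2 + 4 = 38
}
```
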
@@ -19,5 +19,5 @@ add_executable(
   example.cu
 )
 target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bls12_377.a)
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

@@ -10,6 +10,9 @@ mkdir -p build/icicle
 cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
 cmake --build build/icicle
 
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377 -DG2=ON
+cmake --build build/icicle
+
 # Configure and build the example application
 cmake -S . -B build/example
 cmake --build build/example

@@ -3,9 +3,23 @@
 #include <iomanip>
 
 #include "api/bn254.h"
-using namespace bn254;
+#include "api/bls12_377.h"
 
-int main(int argc, char* argv[])
+// using namespace bn254;
+typedef bn254::scalar_t scalar_bn254;
+typedef bn254::affine_t affine_bn254;
+typedef bn254::g2_affine_t g2_affine_bn254;
+typedef bn254::projective_t projective_bn254;
+typedef bn254::g2_projective_t g2_projective_bn254;
+
+typedef bls12_377::scalar_t scalar_bls12377;
+typedef bls12_377::affine_t affine_bls12377;
+typedef bls12_377::g2_affine_t g2_affine_bls12377;
+typedef bls12_377::projective_t projective_bls12377;
+typedef bls12_377::g2_projective_t g2_projective_bls12377;
+
+
+int msm_bn254(int argc, char* argv[])
 {
   std::cout << "Icicle example: Multi-Scalar Multiplication (MSM)" << std::endl;
   std::cout << "Example parameters" << std::endl;

@@ -18,11 +32,11 @@ int main(int argc, char* argv[])
   std::cout << "Part I: use G1 points" << std::endl;
 
   std::cout << "Generating random inputs on-host" << std::endl;
-  scalar_t* scalars = new scalar_t[N];
-  affine_t* points = new affine_t[N];
-  projective_t result;
-  scalar_t::rand_host_many(scalars, N);
-  projective_t::rand_host_many_affine(points, N);
+  scalar_bn254* scalars = new scalar_bn254[N];
+  affine_bn254* points = new affine_bn254[N];
+  projective_bn254 result;
+  scalar_bn254::rand_host_many(scalars, N);
+  projective_bn254::rand_host_many_affine(points, N);
 
   std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
   device_context::DeviceContext ctx = device_context::get_default_device_context();

@@ -48,17 +62,17 @@ int main(int argc, char* argv[])
   cudaStream_t stream = config.ctx.stream;
   // Execute the MSM kernel
   bn254_msm_cuda(scalars, points, msm_size, config, &result);
-  std::cout << projective_t::to_affine(result) << std::endl;
+  std::cout << projective_bn254::to_affine(result) << std::endl;
 
   std::cout << "Copying inputs on-device" << std::endl;
-  scalar_t* scalars_d;
-  affine_t* points_d;
-  projective_t* result_d;
-  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
-  cudaMalloc(&points_d, sizeof(affine_t) * N);
-  cudaMalloc(&result_d, sizeof(projective_t));
-  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
+  scalar_bn254* scalars_d;
+  affine_bn254* points_d;
+  projective_bn254* result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
+  cudaMalloc(&points_d, sizeof(affine_bn254) * N);
+  cudaMalloc(&result_d, sizeof(projective_bn254));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(points_d, points, sizeof(affine_bn254) * N, cudaMemcpyHostToDevice);
 
   std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
   config.are_results_on_device = true;

@@ -70,9 +84,9 @@ int main(int argc, char* argv[])
   bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
 
   // Copy the result back to the host
-  cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&result, result_d, sizeof(projective_bn254), cudaMemcpyDeviceToHost);
   // Print the result
-  std::cout << projective_t::to_affine(result) << std::endl;
+  std::cout << projective_bn254::to_affine(result) << std::endl;
   // Free the device memory
   cudaFree(scalars_d);
   cudaFree(points_d);

@@ -84,25 +98,25 @@ int main(int argc, char* argv[])
 
   std::cout << "Generating random inputs on-host" << std::endl;
   // use the same scalars
-  g2_affine_t* g2_points = new g2_affine_t[N];
-  g2_projective_t::rand_host_many_affine(g2_points, N);
+  g2_affine_bn254* g2_points = new g2_affine_bn254[N];
+  g2_projective_bn254::rand_host_many_affine(g2_points, N);
 
   std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
   config.are_results_on_device = false;
   config.are_scalars_on_device = false;
   config.are_points_on_device = false;
-  g2_projective_t g2_result;
+  g2_projective_bn254 g2_result;
   bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
-  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+  std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
 
   std::cout << "Copying inputs on-device" << std::endl;
-  g2_affine_t* g2_points_d;
-  g2_projective_t* g2_result_d;
-  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
-  cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
-  cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
-  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
+  g2_affine_bn254* g2_points_d;
+  g2_projective_bn254* g2_result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
+  cudaMalloc(&g2_points_d, sizeof(g2_affine_bn254) * N);
+  cudaMalloc(&g2_result_d, sizeof(g2_projective_bn254));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bn254) * N, cudaMemcpyHostToDevice);
 
   std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
   config.are_results_on_device = true;

@@ -111,14 +125,140 @@ int main(int argc, char* argv[])
 
   std::cout << "Running MSM kernel with on-device inputs" << std::endl;
   bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
-  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
-  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bn254), cudaMemcpyDeviceToHost);
+  std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
 
   cudaFree(scalars_d);
   cudaFree(g2_points_d);
   cudaFree(g2_result_d);
   delete[] g2_points;
   delete[] scalars;
   cudaStreamDestroy(stream);
   return 0;
 }
+
+int msm_bls12_377(int argc, char* argv[])
+{
+  std::cout << "Icicle example: Multi-Scalar Multiplication (MSM)" << std::endl;
+  std::cout << "Example parameters" << std::endl;
+  int batch_size = 1;
+  std::cout << "Batch size: " << batch_size << std::endl;
+  unsigned msm_size = 1048576;
+  std::cout << "MSM size: " << msm_size << std::endl;
+  int N = batch_size * msm_size;
+
+  std::cout << "Part I: use G1 points" << std::endl;
+
+  std::cout << "Generating random inputs on-host" << std::endl;
+  scalar_bls12377* scalars = new scalar_bls12377[N];
+  affine_bls12377* points = new affine_bls12377[N];
+  projective_bls12377 result;
+  scalar_bls12377::rand_host_many(scalars, N);
+  projective_bls12377::rand_host_many_affine(points, N);
+
+  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  msm::MSMConfig config = {
+    ctx,   // ctx
+    0,     // points_size
+    1,     // precompute_factor
+    0,     // c
+    0,     // bitsize
+    10,    // large_bucket_factor
+    1,     // batch_size
+    false, // are_scalars_on_device
+    false, // are_scalars_montgomery_form
+    false, // are_points_on_device
+    false, // are_points_montgomery_form
+    false, // are_results_on_device
+    false, // is_big_triangle
+    false, // is_async
+  };
+  config.batch_size = batch_size;
+
+  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
+  cudaStream_t stream = config.ctx.stream;
+  cudaStreamCreate(&stream);
+  // Execute the MSM kernel
+  bls12_377_msm_cuda(scalars, points, msm_size, config, &result);
+  std::cout << projective_bls12377::to_affine(result) << std::endl;
+
+  std::cout << "Copying inputs on-device" << std::endl;
+  scalar_bls12377* scalars_d_bls;
+  affine_bls12377* points_d_bls;
+  projective_bls12377* result_d_bls;
+  cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
+  cudaMalloc(&points_d_bls, sizeof(affine_bls12377) * N);
+  cudaMalloc(&result_d_bls, sizeof(projective_bls12377));
+  cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(points_d_bls, points, sizeof(affine_bls12377) * N, cudaMemcpyHostToDevice);
+
+  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
+  config.are_results_on_device = true;
+  config.are_scalars_on_device = true;
+  config.are_points_on_device = true;
+
+  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+  // Execute the MSM kernel
+  bls12_377_msm_cuda(scalars_d_bls, points_d_bls, msm_size, config, result_d_bls);
+
+  // Copy the result back to the host
+  cudaMemcpy(&result, result_d_bls, sizeof(projective_bls12377), cudaMemcpyDeviceToHost);
+  // Print the result
+  std::cout << projective_bls12377::to_affine(result) << std::endl;
+  // Free the device memory
+  cudaFree(scalars_d_bls);
+  cudaFree(points_d_bls);
+  cudaFree(result_d_bls);
+  // Free the host memory, keep scalars for G2 example
+  delete[] points;
+
+  std::cout << "Part II: use G2 points" << std::endl;
+
+  std::cout << "Generating random inputs on-host" << std::endl;
+  // use the same scalars
+  g2_affine_bls12377* g2_points = new g2_affine_bls12377[N];
+  g2_projective_bls12377::rand_host_many_affine(g2_points, N);
+
+  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
+  config.are_results_on_device = false;
+  config.are_scalars_on_device = false;
+  config.are_points_on_device = false;
+  g2_projective_bls12377 g2_result;
+  bls12_377_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
+  std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
+
+  std::cout << "Copying inputs on-device" << std::endl;
+  g2_affine_bls12377* g2_points_d;
+  g2_projective_bls12377* g2_result_d;
+  cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
+  cudaMalloc(&g2_points_d, sizeof(g2_affine_bls12377) * N);
+  cudaMalloc(&g2_result_d, sizeof(g2_projective_bls12377));
+  cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bls12377) * N, cudaMemcpyHostToDevice);
+
+  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
+  config.are_results_on_device = true;
+  config.are_scalars_on_device = true;
+  config.are_points_on_device = true;
+
+  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+  bls12_377_g2_msm_cuda(scalars_d_bls, g2_points_d, msm_size, config, g2_result_d);
+  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bls12377), cudaMemcpyDeviceToHost);
+  std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
+
+  cudaFree(scalars_d_bls);
+  cudaFree(g2_points_d);
+  cudaFree(g2_result_d);
+  delete[] g2_points;
+  delete[] scalars;
+  return 0;
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Starting BN254 MSM" << std::endl;
+  msm_bn254(argc, argv);
+  std::cout << "Starting BLS12-377 MSM" << std::endl;
+  msm_bls12_377(argc, argv);
+  return 0;
+}

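The two drivers above are line-for-line identical up to the curve types and which `*_msm_cuda` symbol they call. Under the assumption that every ICICLE curve API exposes the same member names (which the diff itself relies on), they could be folded into one template; a sketch, not part of the commit:

```cpp
// Hypothetical refactor: one driver templated over a curve's scalar (S),
// affine (A) and projective (P) types, plus its MSM entry point.
template <typename S, typename A, typename P, typename MsmFn>
void run_msm_host(MsmFn msm_cuda, unsigned msm_size, msm::MSMConfig& config)
{
  int N = config.batch_size * msm_size;
  S* scalars = new S[N];
  A* points = new A[N];
  P result;
  S::rand_host_many(scalars, N);
  P::rand_host_many_affine(points, N);
  msm_cuda(scalars, points, msm_size, config, &result); // on-host inputs
  std::cout << P::to_affine(result) << std::endl;
  delete[] points;
  delete[] scalars;
}

// e.g.: run_msm_host<scalar_bn254, affine_bn254, projective_bn254>(
//           bn254_msm_cuda, msm_size, config);
```
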
@@ -18,7 +18,7 @@ add_executable(
   example.cu
 )
 target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bls12_377.a)
 find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

@@ -7,9 +7,13 @@ mkdir -p build/example
 mkdir -p build/icicle
 
 # Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377
+cmake --build build/icicle
+rm build/icicle/CMakeCache.txt
 cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
 cmake --build build/icicle
 
+
 # Configure and build the example application
 cmake -S . -B build/example
 cmake --build build/example

@@ -4,14 +4,17 @@
 #include <nvml.h>
 
 #include "api/bn254.h"
+#include "api/bls12_377.h"
 #include "vec_ops/vec_ops.cuh"
 
 using namespace vec_ops;
-using namespace bn254;
+// using namespace bn254;
+typedef bn254::scalar_t T;
 
-typedef scalar_t T;
+typedef bls12_377::scalar_t T_bls;
 
-int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
+int vector_mult_bn254(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
 {
   vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
   config.is_a_on_device = true;

@@ -25,10 +28,24 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont
   return 0;
 }
 
+int vector_mult_bls12377(T_bls* vec_b, T_bls* vec_a, T_bls* vec_result, size_t n_elments, device_context::DeviceContext ctx)
+{
+  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
+  config.is_a_on_device = true;
+  config.is_b_on_device = true;
+  config.is_result_on_device = true;
+  cudaError_t err = bls12_377_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  return 0;
+}
+
 int main(int argc, char** argv)
 {
   const unsigned vector_size = 1 << 15;
-  const unsigned repetitions = 1 << 15;
+  const unsigned repetitions = 1;
 
   cudaError_t err;
   nvmlInit();

@@ -61,41 +78,53 @@ int main(int argc, char** argv)
   // host data
   T* host_in1 = (T*)malloc(vector_size * sizeof(T));
   T* host_in2 = (T*)malloc(vector_size * sizeof(T));
+  T_bls* host_in1_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
+  T_bls* host_in2_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
   std::cout << "Initializing vectors with random data" << std::endl;
   T::rand_host_many(host_in1, vector_size);
   T::rand_host_many(host_in2, vector_size);
+  T_bls::rand_host_many(host_in1_bls12377, vector_size);
+  T_bls::rand_host_many(host_in2_bls12377, vector_size);
   // device data
   device_context::DeviceContext ctx = device_context::get_default_device_context();
-  T* device_in1;
-  T* device_in2;
-  T* device_out;
+  T* device_in1_bn254;
+  T* device_in2_bn254;
+  T* device_out_bn254;
+  T_bls* device_in1_bls12377;
+  T_bls* device_in2_bls12377;
+  T_bls* device_out_bls12377;
 
-  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_in1_bn254, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_in1_bls12377, vector_size * sizeof(T_bls));
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
 
-  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_in2_bn254, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_in2_bls12377, vector_size * sizeof(T_bls));
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
 
-  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_out_bn254, vector_size * sizeof(T));
+  err = cudaMalloc((void**)&device_out_bls12377, vector_size * sizeof(T_bls));
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
 
   // copy from host to device
-  err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in1_bn254, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in1_bls12377, host_in1_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
   if (err != cudaSuccess) {
     std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
 
-  err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in2_bn254, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in2_bls12377, host_in2_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
   if (err != cudaSuccess) {
     std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
     return 0;

|
||||
std::cout << "Starting warm-up" << std::endl;
|
||||
// Warm-up loop
|
||||
for (int i = 0; i < repetitions; i++) {
|
||||
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
|
||||
std::cout << "bn254 mult" << std::endl;
|
||||
vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
|
||||
std::cout << "bls12-377 mult" << std::endl;
|
||||
vector_mult_bls12377(device_in1_bls12377, device_in2_bls12377, device_out_bls12377, vector_size, ctx);
|
||||
}
|
||||
|
||||
std::cout << "Starting benchmarking" << std::endl;
|
||||
@@ -122,7 +154,7 @@ int main(int argc, char** argv)
   auto start_time = std::chrono::high_resolution_clock::now();
   // Benchmark loop
   for (int i = 0; i < repetitions; i++) {
-    vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
+    vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
   }
   auto end_time = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);

@@ -146,7 +178,7 @@ int main(int argc, char** argv)
   // Optional: validate multiplication
   T* host_out = (T*)malloc(vector_size * sizeof(T));
 
-  cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+  cudaMemcpy(host_out, device_out_bn254, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
 
   // validate multiplication here...
 

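The hunk above leaves `// validate multiplication here...` as a stub. A minimal host-side check could replace it, assuming the scalar type `T` supports `operator*` and `operator==` on the host (an assumption to verify against the ICICLE headers):

```cpp
// Sketch: recompute the product on the host and compare element-wise.
// Assumes T (bn254::scalar_t) provides host-side operator* and operator==.
bool ok = true;
for (unsigned i = 0; i < vector_size; i++) {
  if (!(host_out[i] == host_in1[i] * host_in2[i])) {
    std::cerr << "Mismatch at index " << i << std::endl;
    ok = false;
    break;
  }
}
std::cout << (ok ? "Validation passed" : "Validation failed") << std::endl;
```
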
@@ -154,9 +186,9 @@ int main(int argc, char** argv)
   free(host_in1);
   free(host_in2);
   free(host_out);
-  cudaFree(device_in1);
-  cudaFree(device_in2);
-  cudaFree(device_out);
+  cudaFree(device_in1_bn254);
+  cudaFree(device_in2_bn254);
+  cudaFree(device_out_bn254);
   nvmlShutdown();
   return 0;
 }