added G2 to MSM example

Merge remote-tracking branch 'origin/develop/dima/g2_and_bw_scalar_field' into update-examples-for-new-API
test-threads in CI and msm check corrected
2026-01-14 01:47:59 -05:00 · 2024-01-23 17:11:21 -05:00 · 2024-01-23 12:05:09 -05:00 · 2024-01-23 16:06:24 +02:00 · 2024-01-23 14:38:47 +02:00 · 2024-01-23 14:12:25 +02:00
59 changed files with 883 additions and 528 deletions
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -50,8 +50,8 @@ jobs:
      working-directory: ./wrappers/rust
      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      # Running tests from the root workspace will run all workspace members' tests by default
-      #TODO: remove test-threads once thread safety is finalized
-      run: cargo test --release --verbose -- --test-threads=1
+      # We need to limit the number of threads to avoid running out of memory on weaker machines
+      run: cargo test --release --verbose --features=g2 -- --test-threads=2

  test-cpp-linux:
    name: Test C++ on Linux
--- a/README.md
+++ b/README.md
@@ -1,11 +1,10 @@
 # ICICLE
- **<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**

-                  
+**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
+
 <p align="center">
  <img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
 </p>
-
 <p align="center">
  <a href="https://discord.gg/EVVXTdt6DF">
    <img src="https://img.shields.io/discord/1063033227788423299?logo=discord" alt="Chat with us on Discord">
@@ -13,6 +12,7 @@
  <a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
  </a>
+  <img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
 </p>

 ## Background
--- a/examples/c++/Poseidon-hash/CMakeLists.txt
+++ b/examples/c++/Poseidon-hash/CMakeLists.txt
@@ -14,7 +14,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 # change the path to your Icicle location
-include_directories("/icicle/icicle")
+include_directories("../../.." "/icicle" "/opt/icicle")
 add_executable(
  example
  example.cu
--- a/examples/c++/Poseidon-hash/README.md
+++ b/examples/c++/Poseidon-hash/README.md
@@ -41,6 +41,12 @@ poseidon.hash_blocks(inBlocks, nBlocks, outHashes, hashType, stream);

 - **HashType:** In this example we use `Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree`.

+## Running the example
+
+- `cd` to your example directory
+- compile with `./compile.sh`
+- run with `./run.sh`
+
 ## What's in the example

 1. Define the size of the example: the height of the full binary Merkle tree. 
--- a/examples/c++/Poseidon-hash/compile.sh
+++ b/examples/c++/Poseidon-hash/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
--- a/examples/c++/Poseidon-hash/example.cu
+++ b/examples/c++/Poseidon-hash/example.cu
@@ -3,148 +3,143 @@
 #include <iostream>

 // select the curve
-#include "curves/bls12_381/curve_config.cuh"
-// expose Poseidon classes
-#include "curves/bls12_381/poseidon.cu"
+#define CURVE_ID 1
+// include Poseidon template
+#include "icicle/appUtils/poseidon/poseidon.cu"
+using namespace curve_config;

-// location of a tree node in the array for a given level and offset
-inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }

-// We assume the tree has leaves already set, compute all other levels
-void build_tree(
-  const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t>& poseidon, cudaStream_t stream)
-{
-  for (uint32_t level = tree_height - 1; level > 0; level--) {
-    const uint32_t next_level = level - 1;
-    const uint32_t next_level_width = 1 << next_level;
-    poseidon.hash_blocks(
-      &tree[tree_index(level, 0)], next_level_width, &tree[tree_index(next_level, 0)],
-      Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
-  }
-}
-
-// search leaves for a given hash, return offset
-uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height)
-{
-  const uint32_t tree_width = (1 << (tree_height - 1));
-  for (uint32_t i = 0; i < tree_width; i++) {
-    const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
-    if (leaf == query) {
-      return i; // found the hash
-    }
-  }
-  return tree_height; // hash not found
-}
-
-void generate_proof(
-  uint32_t position,
-  BLS12_381::scalar_t* tree,
-  const uint32_t tree_height,
-  uint32_t* proof_lr,
-  BLS12_381::scalar_t* proof_hash)
-{
-  uint32_t level_index = position;
-  for (uint32_t level = tree_height - 1; level > 0; level--) {
-    uint32_t lr;
-    uint32_t neighbour_index;
-    lr = level_index % 2;
-    if (lr == 0) {
-      // left
-      neighbour_index = level_index + 1;
-    } else {
-      // right
-      neighbour_index = level_index - 1;
-    }
-    proof_lr[level] = lr;
-    proof_hash[level] = tree[tree_index(level, neighbour_index)];
-    level_index /= 2;
-  }
-  // the proof must match this:
-  proof_hash[0] = tree[tree_index(0, 0)];
-}
-
-uint32_t validate_proof(
-  const BLS12_381::scalar_t hash,
-  const uint32_t tree_height,
-  const uint32_t* proof_lr,
-  const BLS12_381::scalar_t* proof_hash,
-  Poseidon<BLS12_381::scalar_t>& poseidon,
-  cudaStream_t stream)
-{
-  BLS12_381::scalar_t hashes_in[2], hash_out[1], level_hash;
-  level_hash = hash;
-  for (uint32_t level = tree_height - 1; level > 0; level--) {
-    if (proof_lr[level] == 0) {
-      hashes_in[0] = level_hash;
-      hashes_in[1] = proof_hash[level];
-    } else {
-      hashes_in[0] = proof_hash[level];
-      hashes_in[1] = level_hash;
-    }
-    // next level hash
-    poseidon.hash_blocks(hashes_in, 1, hash_out, Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
-    level_hash = hash_out[0];
-  }
-  return proof_hash[0] == level_hash;
-}
-
-int main(int argc, char* argv[])
-{
-  std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
-  const uint32_t tree_height = 21;
-  std::cout << "Tree height: " << tree_height << std::endl;
-  const uint32_t tree_arity = 2;
-  const uint32_t leaf_level = tree_height - 1;
-  const uint32_t tree_width = 1 << leaf_level;
-  std::cout << "Tree width: " << tree_width << std::endl;
-  const uint32_t tree_size = (1 << tree_height) - 1;
-  std::cout << "Tree size: " << tree_size << std::endl;
-  BLS12_381::scalar_t* tree = static_cast<BLS12_381::scalar_t*>(malloc(tree_size * sizeof(BLS12_381::scalar_t)));
-
-  std::cout << "2. Hashing blocks in parallel" << std::endl;
-  const uint32_t data_arity = 4;
-  std::cout << "Block size (arity): " << data_arity << std::endl;
-  std::cout << "Initializing blocks..." << std::endl;
-  BLS12_381::scalar_t d = BLS12_381::scalar_t::zero();
-  BLS12_381::scalar_t* data =
-    static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
-  for (uint32_t i = 0; i < tree_width * data_arity; i++) {
-    data[i] = d;
-    d = d + BLS12_381::scalar_t::one();
-  }
-
-  std::cout << "Hashing blocks into tree leaves..." << std::endl;
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-  Poseidon<BLS12_381::scalar_t> data_poseidon(data_arity, stream);
-  data_poseidon.hash_blocks(
-    data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
-
-  std::cout << "3. Building Merkle tree" << std::endl;
-  Poseidon<BLS12_381::scalar_t> tree_poseidon(tree_arity, stream);
-  build_tree(tree_height, tree, tree_poseidon, stream);
-
-  std::cout << "4. Generate membership proof" << std::endl;
-  uint32_t position = tree_width - 1;
-  std::cout << "Using the hash for block: " << position << std::endl;
-  BLS12_381::scalar_t query = tree[tree_index(leaf_level, position)];
-  uint32_t query_position = query_membership(query, tree, tree_height);
-  // allocate arrays for the proof
-  uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
-  BLS12_381::scalar_t* proof_hash =
-    static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
-  generate_proof(query_position, tree, tree_height, proof_lr, proof_hash);
-
-  std::cout << "5. Validate the hash membership" << std::endl;
-  uint32_t validated;
-  const BLS12_381::scalar_t hash = tree[tree_index(leaf_level, query_position)];
-  validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
-  std::cout << "Validated: " << validated << std::endl;
-
-  std::cout << "6. Tamper the hash" << std::endl;
-  const BLS12_381::scalar_t tampered_hash = hash + BLS12_381::scalar_t::one();
-  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
-  std::cout << "7. Invalidate tamper hash membership" << std::endl;
-  std::cout << "Validated: " << validated << std::endl;
+int main(int argc, char* argv[]) {
  return 0;
 }
+
+
+// // location of a tree node in the array for a given level and offset
+
+// inline uint32_t tree_index(uint32_t level, uint32_t offset) {
+//     return (1 << level) - 1 + offset;
+// }
+
+// // We assume the tree has leaves already set, compute all other levels
+// void build_tree(const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, cudaStream_t stream) {
+//   for (uint32_t level = tree_height-1; level>0 ; level-- ) {
+//     const uint32_t next_level = level -1;
+//     const uint32_t next_level_width = 1 << next_level;
+//     poseidon.hash_blocks(&tree[tree_index(level,0)], next_level_width, &tree[tree_index(next_level,0)], Poseidon<scalar_t>::HashType::MerkleTree, stream);
+//   }  
+// }
+
+// // search leaves for a given hash, return offset
+// uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_height) {
+//   const uint32_t tree_width = (1 << (tree_height-1));
+//   for (uint32_t i=0; i<tree_width; i++) {
+//     const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
+//     if (leaf == query ) {
+//       return i; // found the hash
+//     }    
+//   }
+//   return tree_height; // hash not found 
+// }
+
+
+
+// void generate_proof(uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash ) {
+//   uint32_t level_index = position;
+//   for(uint32_t level = tree_height - 1; level > 0; level--) {
+//     uint32_t lr;
+//     uint32_t neighbour_index;
+//     lr = level_index % 2;
+//     if (lr == 0) {
+//       // left
+//       neighbour_index = level_index + 1;
+//     } else {
+//       // right
+//       neighbour_index = level_index - 1;
+//     }
+//     proof_lr[level] = lr;
+//     proof_hash[level] = tree[tree_index(level,neighbour_index)];
+//     level_index /= 2;
+//   }
+//   // the proof must match this:
+//   proof_hash[0] = tree[tree_index(0,0)];
+// }
+
+
+// uint32_t validate_proof(const scalar_t hash, const uint32_t tree_height, const uint32_t* proof_lr, const scalar_t* proof_hash, Poseidon<scalar_t> &poseidon, cudaStream_t stream) {
+//   scalar_t hashes_in[2], hash_out[1], level_hash;
+//   level_hash = hash;
+//   for(uint32_t level = tree_height - 1; level > 0; level --) {
+//     if(proof_lr[level]==0) {
+//       hashes_in[0] = level_hash;
+//       hashes_in[1] = proof_hash[level];
+//     } else {
+//       hashes_in[0] = proof_hash[level];
+//       hashes_in[1] = level_hash;
+//     }
+//     // next level hash
+//     poseidon.hash_blocks(hashes_in, 1, hash_out, Poseidon<scalar_t>::HashType::MerkleTree, stream);
+//     level_hash = hash_out[0];
+//   }
+//   return proof_hash[0] == level_hash;
+// }
+
+// int main(int argc, char* argv[])
+// {
+
+//   std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
+//   const uint32_t tree_height = 21;
+//   std::cout << "Tree height: " << tree_height << std::endl;
+//   const uint32_t tree_arity = 2;
+//   const uint32_t leaf_level = tree_height-1;
+//   const uint32_t tree_width = 1 << leaf_level;
+//   std::cout << "Tree width: " << tree_width << std::endl;
+//   const uint32_t tree_size = (1 << tree_height) - 1;
+//   std::cout << "Tree size: " << tree_size << std::endl;
+//   scalar_t* tree = static_cast<scalar_t*>(malloc(tree_size * sizeof(scalar_t)));
+
+//   std::cout << "2. Hashing blocks in parallel" << std::endl;
+//   const uint32_t data_arity = 4;
+//   std::cout << "Block size (arity): " << data_arity << std::endl;
+//   std::cout << "Initializing blocks..." << std::endl;
+//   scalar_t d = scalar_t::zero();
+//   scalar_t* data = static_cast<scalar_t*>(malloc(tree_width * data_arity * sizeof(scalar_t)));
+//   for (uint32_t i = 0; i < tree_width * data_arity; i++) {
+//     data[i] = d;
+//     d = d + scalar_t::one();
+//   }
+
+
+//   std::cout << "Hashing blocks into tree leaves..." << std::endl;
+//   cudaStream_t stream;
+//   cudaStreamCreate(&stream);
+//   Poseidon<scalar_t> data_poseidon(data_arity, stream);
+//   data_poseidon.hash_blocks(data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<scalar_t>::HashType::MerkleTree, stream);
+
+//   std::cout << "3. Building Merkle tree" << std::endl;
+//   Poseidon<scalar_t> tree_poseidon(tree_arity, stream);
+//   build_tree(tree_height, tree, tree_poseidon, stream);
+
+//   std::cout << "4. Generate membership proof" << std::endl;
+//   uint32_t position = tree_width-1;
+//   std::cout << "Using the hash for block: " << position << std::endl;
+//   scalar_t query = tree[tree_index(leaf_level, position)];
+//   uint32_t query_position = query_membership(query, tree, tree_height);
+//   // allocate arrays for the proof
+//   uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
+//   scalar_t* proof_hash = static_cast<scalar_t*>(malloc(tree_height * sizeof(scalar_t)));
+//   generate_proof(query_position, tree, tree_height, proof_lr, proof_hash );
+
+//   std::cout << "5. Validate the hash membership" << std::endl;
+//   uint32_t validated;
+//   const scalar_t hash = tree[tree_index(leaf_level, query_position)];
+//   validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);  
+//   std::cout << "Validated: " << validated << std::endl;
+
+//   std::cout << "6. Tamper the hash" << std::endl;
+//   const scalar_t tampered_hash = hash + scalar_t::one();
+//   validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);  
+//   std::cout << "7. Invalidate tamper hash membership" << std::endl;
+//   std::cout << "Validated: " << validated << std::endl;
+
+//   return 0;
+// }
--- a/examples/c++/msm/CMakeLists.txt
+++ b/examples/c++/msm/CMakeLists.txt
@@ -14,7 +14,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 # change the path to your Icicle location
-include_directories("../../.." "/icicle" "/opt/icicle")
+include_directories("../../.." "../../../icicle" "/icicle" "/icicle" "/opt/icicle")
 add_executable(
  example
  example.cu
--- a/examples/c++/msm/README.md
+++ b/examples/c++/msm/README.md
@@ -45,8 +45,8 @@ The configuration is passed to the kernel as a structure of type `msm::MSMConfig
 ## What's in the example

 1. Define the parameters of MSM
-2. Generate random inputs on-host 
+2. Generate random inputs on-host
 3. Configure and execute MSM using on-host data
 4. Copy inputs on-device
 5. Configure and execute MSM using on-device data
-
+6. Repeat the above steps for G2 points
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -2,6 +2,7 @@
 #include <iostream>
 #include <iomanip>

+#define G2_DEFINED
 // include MSM template
 #define CURVE_ID 1
 #include "icicle/appUtils/msm/msm.cu"
@@ -16,6 +17,8 @@ int main(int argc, char* argv[])
  unsigned msm_size = 1048576;
  std::cout << "MSM size: " << msm_size << std::endl;
  int N = batch_size * msm_size;
+
+  std::cout << "Part I: use G1 points" << std::endl;
  
  std::cout << "Generating random inputs on-host" << std::endl;
  scalar_t* scalars = new scalar_t[N];
@@ -25,10 +28,27 @@ int main(int argc, char* argv[])
  projective_t::RandHostManyAffine(points, N);

  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
-  auto config = msm::DefaultMSMConfig();
+  // auto config = msm::DefaultMSMConfig();
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  msm::MSMConfig config = {
+      ctx,   // ctx
+      0,     // points_size
+      1,     // precompute_factor
+      0,     // c
+      0,     // bitsize
+      10,    // large_bucket_factor
+      1,     // batch_size
+      false, // are_scalars_on_device
+      false, // are_scalars_montgomery_form
+      false, // are_points_on_device
+      false, // are_points_montgomery_form
+      false, // are_results_on_device
+      false, // is_big_triangle
+      false, // is_async
+    };
  config.batch_size = batch_size;
  
-  std::cout << "Running MSM kernel" << std::endl;
+  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
  // Create two events to time the MSM kernel
  cudaStream_t stream = config.ctx.stream;
  cudaEvent_t start, stop;
@@ -95,6 +115,66 @@ int main(int argc, char* argv[])
  cudaFree(scalars_d);
  cudaFree(points_d);
  cudaFree(result_d);
+  // Free the host memory, keep scalars for G2 example
+  delete[] points;
+
+  std::cout << "Part II: use G2 points" << std::endl;
+
+  std::cout << "Generating random inputs on-host" << std::endl;
+  // use the same scalars
+  g2_affine_t* g2_points = new g2_affine_t[N];
+  g2_projective_t::RandHostManyAffine(g2_points, N);
+
+  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
+  config.are_results_on_device = false;
+  config.are_scalars_on_device = false;
+  config.are_points_on_device = false;
+  g2_projective_t g2_result;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, stream);
+  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, g2_points, msm_size, config, &g2_result);
+  cudaEventRecord(stop, stream);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+
+  std::cout << "Copying inputs on-device" << std::endl;
+  g2_affine_t* g2_points_d;
+  g2_projective_t* g2_result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
+  cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
+  cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
+
+  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
+  config.are_results_on_device = true;
+  config.are_scalars_on_device = true;
+  config.are_points_on_device = true;
+
+  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, stream);
+  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars_d, g2_points_d, msm_size, config, g2_result_d);
+  cudaEventRecord(stop, stream);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+
+  cudaFree(scalars_d);
+  cudaFree(g2_points_d);
+  cudaFree(g2_result_d);
+  delete[] g2_points;
+  delete[] scalars;
  cudaStreamDestroy(stream);
  return 0;
 }
--- a/examples/c++/multiply/CMakeLists.txt
+++ b/examples/c++/multiply/CMakeLists.txt
@@ -14,6 +14,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 # change the path to your Icicle location
+include_directories("../../.." "/icicle")
 add_executable(
  example
  example.cu
--- a/examples/c++/multiply/README.md
+++ b/examples/c++/multiply/README.md
@@ -17,7 +17,7 @@ Define a `CURVE_ID` and include curve configuration header:
 #include "icicle/curves/curve_config.cuh"
 ```

-The values of `CURVE_ID` for different curves are in the abobe header. Multiplication is accelerated both for field scalars and point fields.
+The values of `CURVE_ID` for different curves are in the above header. Multiplication is accelerated both for field scalars and point fields.

 ```c++
 using namespace curve_config;
@@ -25,6 +25,12 @@ scalar_t a;
 point_field_t b;
 ```

+## Running the example
+
+- `cd` to your example directory
+- compile with `./compile.sh`
+- run with `./run.sh`
+
 ## What's in the example

 1. Define the parameters for the example such as vector size 
@@ -32,3 +38,4 @@ point_field_t b;
 3. Copy them on-device
 4. Execute element-wise vector multiplication on-device
 5. Copy results on-host
+
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -1,16 +1,15 @@
 #include <iostream>
 #include <iomanip>
 #include <chrono>
-#include <cuda_runtime.h>
 #include <nvml.h>

 #define CURVE_ID 1
-#include "/icicle/icicle/curves/curve_config.cuh"
+#include "icicle/curves/curve_config.cuh"
 using namespace curve_config;
-typedef scalar_t T;
-// typedef point_field_t T;

-const std::string curve = "BN254";
+// select scalar or point field
+//typedef scalar_t T;
+typedef point_field_t T;

 #define MAX_THREADS_PER_BLOCK 256

@@ -49,7 +48,6 @@ int main(int argc, char** argv)
  } else {
    std::cerr << "Failed to get GPU model name." << std::endl;
  }
-
  unsigned power_limit;
  nvmlDeviceGetPowerManagementLimit(device, &power_limit);

@@ -83,21 +81,18 @@ int main(int argc, char** argv)
  T* device_out;

  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
-
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
-
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
-
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
@@ -105,28 +100,21 @@ int main(int argc, char** argv)

  // copy from host to device
  err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
-
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
-
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }
-
+  
  std::cout << "Starting warm-up" << std::endl;
  // Warm-up loop
  for (int i = 0; i < repetitions; i++) {
    vector_mult(device_in1, device_in2, device_out, vector_size);
-    // err = lde::Mul(device_in1, device_in2, vector_size, is_on_device, is_montgomery, ctx, device_out);
-    // if (err != cudaSuccess) {
-    //   std::cerr << "Failed to call lde::Mul" << cudaGetErrorString(err) << std::endl;
-    //   return 0;
-    // }
  }

  std::cout << "Starting benchmarking" << std::endl;
@@ -142,17 +130,10 @@ int main(int argc, char** argv)
    std::cerr << "Failed to get GPU temperature." << std::endl;
  }
  auto start_time = std::chrono::high_resolution_clock::now();
-
  // Benchmark loop
  for (int i = 0; i < repetitions; i++) {
    vector_mult(device_in1, device_in2, device_out, vector_size);
-    // err = lde::Mul(device_in1, device_in2, vector_size, is_on_device, is_montgomery, ctx, device_out);
-    // if (err != cudaSuccess) {
-    //   std::cerr << "Failed to call lde::Mul" << cudaGetErrorString(err) << std::endl;
-    //   return 0;
-    // }
  }
-
  auto end_time = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
@@ -179,14 +160,13 @@ int main(int argc, char** argv)

  // validate multiplication here...

-  free(host_in1);
+  // clean up and exit
+  free(host_in1); 
  free(host_in2);
  free(host_out);
  cudaFree(device_in1);
  cudaFree(device_in2);
  cudaFree(device_out);
-
  nvmlShutdown();
-
  return 0;
 }
--- a/examples/c++/multiply/run.sh
+++ b/examples/c++/multiply/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-/icicle-example/build/example
+./build/example
--- a/examples/c++/ntt/CMakeLists.txt
+++ b/examples/c++/ntt/CMakeLists.txt
@@ -14,7 +14,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 # change the path to your Icicle location
-include_directories("/icicle")
+include_directories("../../.." "../../../icicle" "/icicle" "/icicle/icicle" "/opt/icicle/icicle")
 add_executable(
  example
  example.cu
--- a/examples/c++/ntt/README.md
+++ b/examples/c++/ntt/README.md
@@ -6,28 +6,28 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s

 ## Key-Takeaway

-`Icicle` provides several CUDA C++ template functions for [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), also known as Discrete Fourier Transform. The templates differ in terms of ease-of-use vs. speed. In this example we look a the easiest one.
+`Icicle` provides CUDA C++ template function NTT for [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), also known as Discrete Fourier Transform. 

 ## Concise Usage Explanation

-First include NTT template, next select the curve, and finally supply the curve types to the template. 
+1. Select the curve
+2. Include NTT template
+3. Configure NTT (TODO)
+4. Call NTT

 ```c++
-#include "icicle/appUtils/ntt/ntt.cuh"              // template
-#include "icicle/curves/bls12_381/curve_config.cuh" // curve
-using namespace BLS12_381;
+#define CURVE_ID 1
+#include "icicle/appUtils/ntt/ntt.cu"
+using namespace curve_config;
+
 ...
-ntt_end2end_batch_template<scalar_t, scalar_t>(scalars, batch_size, ntt_size, inverse, stream);
+ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
 ```

-In this example we use `BLS12_381` curve. The function computes TODO.
-
-
 **Parameters:**

 TODO

-
 ## What's in the example

 TODO
--- a/examples/c++/ntt/compile.sh
+++ b/examples/c++/ntt/compile.sh
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -1,61 +1,57 @@
 #include <chrono>
 #include <iostream>

-// include NTT template
-#include "icicle/appUtils/ntt/ntt.cuh"
-
 // select the curve
-#include "icicle/curves/bls12_381/curve_config.cuh"
-using namespace BLS12_381;
+#define CURVE_ID 1
+// include NTT template
+#include "icicle/appUtils/ntt/ntt.cu"
+using namespace curve_config;

 // Operate on scalars
 typedef scalar_t S;
 typedef scalar_t E;

-scalar_t smult(const unsigned n, scalar_t s)
-{
-  scalar_t r = scalar_t::zero();
+void print_elements(const unsigned n, E * elements ) {
  for (unsigned i = 0; i < n; i++) {
-    r = r + s;
+    std::cout << i << ": " << elements[i] << std::endl;   
  }
-  return r;
 }

-void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
-{
-  // Harmonics 0
-  for (unsigned i = 0; i < ntt_size; i = i + 1) {
+void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
+  // Lowest Harmonics
+  for (unsigned i = 0; i < ntt_size; i=i+1) {
    elements[i] = scalar_t::one();
  }
-  // Harmonics 1
-  for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
-    elements[i] = scalar_t::one();
-    elements[i + 1] = scalar_t::neg(scalar_t::one());
+  // print_elements(ntt_size, elements );
+  // Highest Harmonics
+  for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
+    elements[i] =  scalar_t::one();
+    elements[i+1] = scalar_t::neg(scalar_t::one());
  }
+  // print_elements(ntt_size, &elements[1*ntt_size] );
 }

 int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
 {
  int nof_errors = 0;
-  E amplitude = smult(ntt_size, scalar_t::one());
+  E amplitude = scalar_t::from((uint32_t) ntt_size);
  // std::cout << "Amplitude: " << amplitude << std::endl;
-  // Harmonics 0
+  // Lowest Harmonics
  if (elements[0] != amplitude) {
    ++nof_errors;
-    std::cout << "Error in harmonics 0: " << elements[0] << std::endl;
+    std::cout << "Error in lowest harmonics 0! " << std::endl;
+    // print_elements(ntt_size, elements );
  } else {
-    std::cout << "Validated harmonics 0" << std::endl;
+    std::cout << "Validated lowest harmonics" << std::endl;
  }
-  // Harmonics 1
-  if (elements[ntt_size + 1] != amplitude) {
+  // Highest Harmonics 
+  if (elements[1*ntt_size+ntt_size/2] != amplitude) {
    ++nof_errors;
-    std::cout << "Error in harmonics 1: " << elements[ntt_size + 1] << std::endl;
+    std::cout << "Error in highest harmonics! " << std::endl;
+    // print_elements(ntt_size, &elements[1*ntt_size] );
  } else {
-    std::cout << "Validated harmonics 1" << std::endl;
+    std::cout << "Validated highest harmonics" << std::endl;
  }
-  // for (unsigned i = 0; i < nof_ntts * ntt_size; i++) {
-  //   std::cout << elements[i] << std::endl;
-  // }
  return nof_errors;
 }

@@ -63,59 +59,41 @@ int main(int argc, char* argv[])
 {
  std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
  std::cout << "Example parameters" << std::endl;
-  const unsigned log_ntt_size = 26;
+  const unsigned log_ntt_size = 20;
  std::cout << "Log2(NTT size): " << log_ntt_size << std::endl;
  const unsigned ntt_size = 1 << log_ntt_size;
  std::cout << "NTT size: " << ntt_size << std::endl;
  const unsigned nof_ntts = 2;
  std::cout << "Number of NTTs: " << nof_ntts << std::endl;
  const unsigned batch_size = nof_ntts * ntt_size;
-
-  std::cout << "Generating input data for harmonics 0,1" << std::endl;
-  E* elements;
-  elements = (scalar_t*)malloc(sizeof(E) * batch_size);
-  initialize_input(ntt_size, nof_ntts, elements);
-
-  std::cout << "Running easy-to-use NTT" << std::endl;
+  
+  std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
+  E* input;
+  input = (scalar_t*) malloc(sizeof(E) * batch_size);
+  initialize_input(ntt_size, nof_ntts, input );
+  E* output;
+  output = (scalar_t*) malloc(sizeof(E) * batch_size);
+  
+  std::cout << "Running NTT with on-host data" << std::endl;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
-
-  bool inverse = false;
+  // Create a device context
+  auto ctx = device_context::get_default_device_context();
+  // the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
+  scalar_t rou = scalar_t{ {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1} };
+  ntt::InitDomain(rou, ctx);
+  // Create an NTTConfig instance
+  ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
+  config.batch_size = nof_ntts;
+  config.ctx.stream = stream;
  auto begin0 = std::chrono::high_resolution_clock::now();
-  ntt_end2end_batch_template<scalar_t, scalar_t>(elements, batch_size, ntt_size, inverse, stream);
+  cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
  auto end0 = std::chrono::high_resolution_clock::now();
  auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
  printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
-  validate_output(ntt_size, nof_ntts, elements);
-  cudaStreamSynchronize(stream);
-
-  std::cout << "Running not that easy-to-use but fast NTT" << std::endl;
-
-  uint32_t n_twiddles = ntt_size; // n_twiddles is set to 4096 as BLS12_381::scalar_t::omega() is of that order.
-  // represent transform matrix using twiddle factors
-  scalar_t* d_twiddles;
-  d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(log_ntt_size), stream); // Sscalar
-  scalar_t* d_elements;                                                                       // Element
-
-  cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
-  initialize_input(ntt_size, nof_ntts, elements);
-  cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
-  S* _null = nullptr;
-  auto begin1 = std::chrono::high_resolution_clock::now();
-  cudaStreamSynchronize(stream);
-  ntt_inplace_batch_template(d_elements, d_twiddles, ntt_size, nof_ntts, inverse, false, _null, stream, false);
-  cudaStreamSynchronize(stream);
-  auto end1 = std::chrono::high_resolution_clock::now();
-  auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
-  printf("Runtime: %.3e seconds\n", elapsed1.count() * 1e-9);
-
-  cudaMemcpyAsync(elements, d_elements, sizeof(E) * batch_size, cudaMemcpyDeviceToHost, stream);
-  validate_output(ntt_size, nof_ntts, elements);
-  cudaFreeAsync(d_elements, stream);
-  cudaFreeAsync(d_twiddles, stream);
-
+  validate_output(ntt_size, nof_ntts, output );
  cudaStreamDestroy(stream);
-
-  free(elements);
+  free(input);
+  free(output);
  return 0;
 }
--- a/examples/c++/ntt/run.sh
+++ b/examples/c++/ntt/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/rust/msm/Cargo.toml
+++ b/examples/rust/msm/Cargo.toml
@@ -4,10 +4,10 @@ version = "1.0.0"
 edition = "2018"

 [dependencies]
-icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
-icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
-icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
-icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0" }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0" }
+icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0" }

 ark-bn254 = { version = "0.4.0", optional = true}
 ark-bls12-377 = { version = "0.4.0", optional = true}
--- a/examples/rust/msm/src/main.rs
+++ b/examples/rust/msm/src/main.rs
@@ -12,7 +12,7 @@ use icicle_bls12_377::curve::{

 use icicle_cuda_runtime::{
    stream::CudaStream,
-    memory::DeviceSlice
+    memory::HostOrDeviceSlice
 };

 use icicle_core::{
@@ -74,40 +74,39 @@ fn main() {
        let size = 1 << log_size;
        println!("---------------------- MSM size 2^{}={} ------------------------", log_size, size);
        // Setting Bn254 points and scalars
-        let points = &upper_points[..size];
-        let scalars = &upper_scalars[..size];
+        let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
+        let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
        
        // Setting bls12377 points and scalars
-        let points_bls12377 = &upper_points_bls12377[..size];
-        let scalars_bls12377 = &upper_scalars_bls12377[..size];
+        // let points_bls12377 = &upper_points_bls12377[..size];
+        let points_bls12377 =  HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); //  &upper_points_bls12377[..size];
+        let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());

        println!("Configuring bn254 MSM...");
-        let mut msm_results: DeviceSlice<'_, G1Projective> = DeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
        let stream = CudaStream::create().unwrap();
        let mut cfg = msm::get_default_msm_config::<CurveCfg>();
        cfg.ctx.stream = &stream;
        cfg.is_async = true;
-        cfg.are_results_on_device = true;

        println!("Configuring bls12377 MSM...");
-        let mut msm_results_bls12377: DeviceSlice<'_, BLS12377G1Projective> = DeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
        let stream_bls12377 = CudaStream::create().unwrap();
        let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
        cfg_bls12377.ctx.stream = &stream_bls12377;
        cfg_bls12377.is_async = true;
-        cfg_bls12377.are_results_on_device = true;

        println!("Executing bn254 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
-        msm::msm(&scalars, &points, &cfg, &mut msm_results.as_slice()).unwrap();
+        msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
        #[cfg(feature = "profile")]
        println!("ICICLE BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());

        println!("Executing bls12377 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
-        msm::msm(&scalars_bls12377, &points_bls12377, &cfg_bls12377, &mut msm_results_bls12377.as_slice()).unwrap();
+        msm::msm(&scalars_bls12377, &points_bls12377, &cfg_bls12377, &mut msm_results_bls12377 ).unwrap();
        #[cfg(feature = "profile")]
        println!("ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());

--- a/examples/rust/ntt/Cargo.toml
+++ b/examples/rust/ntt/Cargo.toml
@@ -4,10 +4,10 @@ version = "1.0.0"
 edition = "2018"

 [dependencies]
-icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
-icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
-icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
-icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0", features = ["arkworks"] }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0", features = ["arkworks"] }
+icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.0.0", features = ["arkworks"] }

 ark-ff = { version = "0.4.0" }
 ark-poly = "0.4.0"
--- a/examples/rust/ntt/src/main.rs
+++ b/examples/rust/ntt/src/main.rs
@@ -10,7 +10,7 @@ use icicle_bls12_377::curve::{

 use icicle_cuda_runtime::{
    stream::CudaStream,
-    memory::DeviceSlice,
+    memory::HostOrDeviceSlice,
    device_context::get_default_device_context
 };

@@ -48,13 +48,13 @@ fn main() {
    println!("---------------------- NTT size 2^{}={} ------------------------", log_size, size);
    // Setting Bn254 points and scalars
    println!("Generating random inputs on host for bn254...");
-    let scalars = ScalarCfg::generate_random(size);
-    let mut ntt_results: DeviceSlice<'_, ScalarField> = DeviceSlice::cuda_malloc(size).unwrap();
+    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
+    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
    
    // Setting bls12377 points and scalars
    println!("Generating random inputs on host for bls12377...");
-    let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
-    let mut ntt_results_bls12377: DeviceSlice<'_, BLS12377ScalarField> = DeviceSlice::cuda_malloc(size).unwrap();
+    let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
+    let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
    
    println!("Setting up bn254 Domain...");
    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
@@ -66,7 +66,7 @@ fn main() {
    let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
    cfg.ctx.stream = &stream;
    cfg.is_async = true;
-    cfg.are_outputs_on_device = true;
+    // cfg.are_outputs_on_device = true;

    println!("Setting up bls12377 Domain...");
    let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
@@ -78,19 +78,19 @@ fn main() {
    let mut cfg_bls12377 = ntt::get_default_ntt_config::<BLS12377ScalarField>();
    cfg_bls12377.ctx.stream = &stream_bls12377;
    cfg_bls12377.is_async = true;
-    cfg_bls12377.are_outputs_on_device = true;
+    // cfg_bls12377.are_outputs_on_device = true;

    println!("Executing bn254 NTT on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
-    ntt::ntt(scalars.as_slice(), ntt::NTTDir::kForward, &cfg, ntt_results.as_slice()).unwrap();
+    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
    #[cfg(feature = "profile")]
    println!("ICICLE BN254 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());

    println!("Executing bls12377 NTT on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
-    ntt::ntt(scalars_bls12377.as_slice(), ntt::NTTDir::kForward, &cfg_bls12377, ntt_results_bls12377.as_slice()).unwrap();
+    ntt::ntt(&scalars_bls12377, ntt::NTTDir::kForward, &cfg_bls12377, &mut ntt_results_bls12377).unwrap();
    #[cfg(feature = "profile")]
    println!("ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());

@@ -112,10 +112,10 @@ fn main() {
        .unwrap();
    
    println!("Checking against arkworks...");
-    let mut ark_scalars: Vec<Bn254Fr> = scalars.iter().map(|scalar| scalar.to_ark()).collect();
+    let mut ark_scalars: Vec<Bn254Fr> = scalars.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
    let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
    
-    let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.iter().map(|scalar| scalar.to_ark()).collect();
+    let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
    let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
    
    #[cfg(feature = "profile")]
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -84,6 +84,10 @@ if (NOT IS_CURVE_SUPPORTED)
  message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
 endif ()

+if (G2_DEFINED STREQUAL "ON")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
+endif ()
+
 option(BUILD_TESTS "Build tests" OFF)

 if (NOT BUILD_TESTS)
--- a/icicle/appUtils/msm/msm.cu
+++ b/icicle/appUtils/msm/msm.cu
@@ -364,11 +364,10 @@ namespace msm {
      CHK_INIT_IF_RETURN();

      const unsigned nof_scalars = batch_size * single_msm_size; // assuming scalars not shared between batch elements
-      const bool is_nof_points_valid = (nof_points == single_msm_size) || (nof_points == single_msm_size * batch_size);
+      const bool is_nof_points_valid = ((single_msm_size * batch_size) % nof_points == 0);
      if (!is_nof_points_valid) {
        THROW_ICICLE_ERR(
-          IcicleError_t::InvalidArgument, "bucket_method_msm: #points must be either (1) single_msm_size if sharing "
-                                          "points or (2) single_msm_size*batch_size");
+          IcicleError_t::InvalidArgument, "bucket_method_msm: #points must be divisible by single_msm_size*batch_size");
      }

      S* d_scalars;
@@ -788,7 +787,8 @@ namespace msm {
    }
  } // namespace

-  extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)()
+  template <typename A>
+  MSMConfig DefaultMSMConfig()
  {
    device_context::DeviceContext ctx = device_context::get_default_device_context();
    MSMConfig config = {
@@ -850,6 +850,11 @@ namespace msm {
      scalars, points, msm_size, config, out);
  }

+  /**
+   * Extern "C" version of [DefaultMSMConfig](@ref DefaultMSMConfig) function.
+   */
+  extern "C" MSMConfig CONCAT_EXPAND(CURVE, DefaultMSMConfig)() { return DefaultMSMConfig<curve_config::affine_t>(); }
+
 #if defined(G2_DEFINED)

  /**
@@ -860,7 +865,7 @@ namespace msm {
   *  - `P` is the [projective representation](@ref g2_projective_t) of G2 curve points.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
-  extern "C" cudaError_t G2MSMCuda(
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2MSMCuda)(
    curve_config::scalar_t* scalars,
    curve_config::g2_affine_t* points,
    int msm_size,
@@ -871,6 +876,15 @@ namespace msm {
      scalars, points, msm_size, config, out);
  }

+  /**
+   * Extern "C" version of [DefaultMSMConfig](@ref DefaultMSMConfig) function for the G2 curve
+   * (functionally no different than the default MSM config function for G1).
+   */
+  extern "C" MSMConfig CONCAT_EXPAND(CURVE, G2DefaultMSMConfig)()
+  {
+    return DefaultMSMConfig<curve_config::g2_affine_t>();
+  }
+
 #endif

 } // namespace msm
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -83,7 +83,8 @@ namespace msm {
   * A function that returns the default value of [MSMConfig](@ref MSMConfig) for the [MSM](@ref MSM) function.
   * @return Default value of [MSMConfig](@ref MSMConfig).
   */
-  extern "C" MSMConfig DefaultMSMConfig();
+  template <typename A>
+  MSMConfig DefaultMSMConfig();

  /**
   * A function that computes MSM: \f$ MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i \f$.
--- a/icicle/appUtils/ntt/ntt.cu
+++ b/icicle/appUtils/ntt/ntt.cu
@@ -417,12 +417,18 @@ namespace ntt {
  cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
  {
    CHK_INIT_IF_RETURN();
+    if (size > Domain<S>::max_size) {
+      std::cerr
+        << "NTT size is too large for the domain. Consider generating your domain with a higher order root of unity"
+        << '\n';
+      throw -1;
+    }

    cudaStream_t& stream = config.ctx.stream;
    int batch_size = config.batch_size;
    int logn = int(log(size) / log(2));
    int input_size_bytes = size * batch_size * sizeof(E);
-    bool are_inputs_on_device = config.are_inputs_on_device; // TODO: unify name to is_
+    bool are_inputs_on_device = config.are_inputs_on_device;
    bool are_outputs_on_device = config.are_outputs_on_device;

    S* coset = nullptr;
@@ -540,7 +546,7 @@ namespace ntt {
   *  - `E` is the [scalar field](@ref scalar_t) of the curve;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
-  extern "C" cudaError_t ECNTTCuda(
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
    curve_config::projective_t* input,
    int size,
    NTTDir dir,
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -87,10 +87,12 @@ namespace ntt {
   * @return Default value of [NTTConfig](@ref NTTConfig).
   */
  template <typename S>
-  NTTConfig<S> CONCAT_EXPAND(CURVE, DefaultNTTConfig)();
+  NTTConfig<S> DefaultNTTConfig();

  /**
-   * A function that computes NTT or iNTT in-place.
+   * A function that computes NTT or iNTT in-place. It's necessary to call [InitDomain](@ref InitDomain) with an
+   * appropriate primitive root before calling this function (only one call to `InitDomain` should suffice for all
+   * NTTs).
   * @param input Input of the NTT. Length of this array needs to be \f$ size \cdot config.batch\_size \f$. Note
   * that if inputs are in Montgomery form, the outputs will be as well and vice-versa: non-Montgomery inputs produce
   * non-Montgomety outputs.
--- a/icicle/curves/curve_config.cuh
+++ b/icicle/curves/curve_config.cuh
@@ -28,17 +28,36 @@ using namespace bls12_377;
 using namespace bw6_761;
 #endif

+/**
+ * @namespace curve_config
+ * Namespace with type definitions for short Weierstrass pairing-friendly [elliptic
+ * curves](https://hyperelliptic.org/EFD/g1p/auto-shortw.html). Here, concrete types are created in accordance
+ * with the `-DCURVE` env variable passed during build.
+ */
 namespace curve_config {

 #if CURVE_ID == BW6_761
  typedef bls12_377::fq_config fp_config;
 #endif
+  /**
+   * Scalar field of the curve. Is always a prime field.
+   */
  typedef Field<fp_config> scalar_t;
+  /**
+   * Base field of G1 curve. Is always a prime field.
+   */
  typedef Field<fq_config> point_field_t;
  static constexpr point_field_t generator_x = point_field_t{g1_gen_x};
  static constexpr point_field_t generator_y = point_field_t{g1_gen_y};
  static constexpr point_field_t b = point_field_t{weierstrass_b};
+  /**
+   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html)
+   * of G1 curve consisting of three coordinates of type [point_field_t](point_field_t).
+   */
  typedef Projective<point_field_t, scalar_t, b, generator_x, generator_y> projective_t;
+  /**
+   * Affine representation of G1 curve consisting of two coordinates of type [point_field_t](point_field_t).
+   */
  typedef Affine<point_field_t> affine_t;

 #if defined(G2_DEFINED)
@@ -56,10 +75,16 @@ namespace curve_config {
  static constexpr g2_point_field_t g2_b =
    g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
 #endif
+  /**
+   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) of G2 curve.
+   */
  typedef Projective<g2_point_field_t, scalar_t, g2_b, g2_generator_x, g2_generator_y> g2_projective_t;
+  /**
+   * Affine representation of G1 curve.
+   */
  typedef Affine<g2_point_field_t> g2_affine_t;
 #endif

 } // namespace curve_config

-#endif
+#endif
--- a/icicle/primitives/extension_field.cuh
+++ b/icicle/primitives/extension_field.cuh
@@ -38,14 +38,14 @@ public:

  static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }

-  static constexpr HOST_DEVICE_INLINE ExtensionField generator_x()
+  static constexpr HOST_DEVICE_INLINE ExtensionField ToMontgomery(const ExtensionField& xs)
  {
-    return ExtensionField{FF{CONFIG::g2_gen_x_re}, FF{CONFIG::g2_gen_x_im}};
+    return ExtensionField{xs.real * FF{CONFIG::montgomery_r}, xs.imaginary * FF{CONFIG::montgomery_r}};
  }

-  static constexpr HOST_DEVICE_INLINE ExtensionField generator_y()
+  static constexpr HOST_DEVICE_INLINE ExtensionField FromMontgomery(const ExtensionField& xs)
  {
-    return ExtensionField{FF{CONFIG::g2_gen_y_re}, FF{CONFIG::g2_gen_y_im}};
+    return ExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}};
  }

  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
@@ -155,4 +155,4 @@ public:
    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
  }
-};
+};
--- a/icicle/primitives/field.cuh
+++ b/icicle/primitives/field.cuh
@@ -750,13 +750,6 @@ public:
    return rs;
  }

-  static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
-
-  static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
-  {
-    return xs * Field{CONFIG::montgomery_r_inv};
-  }
-
  /**
   * This method reduces a Wide number `xs` modulo `p` and returns the result as a Field element.
   *
--- a/icicle/primitives/projective.cu
+++ b/icicle/primitives/projective.cu
@@ -37,7 +37,7 @@ extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int
 #define g2_affine_t      curve_config::g2_affine_t
 #define g2_point_field_t curve_config::g2_point_field_t

-extern "C" bool EqG2(g2_projective_t* point1, g2_projective_t* point2)
+extern "C" bool CONCAT_EXPAND(CURVE, G2Eq)(g2_projective_t* point1, g2_projective_t* point2)
 {
  return (*point1 == *point2) &&
         !((point1->x == g2_point_field_t::zero()) && (point1->y == g2_point_field_t::zero()) &&
@@ -46,17 +46,17 @@ extern "C" bool EqG2(g2_projective_t* point1, g2_projective_t* point2)
           (point2->z == g2_point_field_t::zero()));
 }

-extern "C" void ToAffineG2(g2_projective_t* point, affine_t* point_out)
+extern "C" void CONCAT_EXPAND(CURVE, G2ToAffine)(g2_projective_t* point, g2_affine_t* point_out)
 {
-  *point_out = projective_t::to_affine(*point);
+  *point_out = g2_projective_t::to_affine(*point);
 }

-extern "C" void GenerateProjectivePointsG2(g2_projective_t* points, int size)
+extern "C" void CONCAT_EXPAND(CURVE, G2GenerateProjectivePoints)(g2_projective_t* points, int size)
 {
  g2_projective_t::RandHostMany(points, size);
 }

-extern "C" void GenerateAffinePointsG2(g2_affine_t* points, int size)
+extern "C" void CONCAT_EXPAND(CURVE, G2GenerateAffinePoints)(g2_affine_t* points, int size)
 {
  g2_projective_t::RandHostManyAffine(points, size);
 }
--- a/icicle/utils/mont.cu
+++ b/icicle/utils/mont.cu
@@ -33,4 +33,28 @@ namespace mont {
      return FromMontgomery(d_inout, n, ctx.stream, d_inout);
    }
  }
+
+#if defined(G2_DEFINED)
+
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2AffineConvertMontgomery)(
+    curve_config::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+  {
+    if (is_into) {
+      return ToMontgomery(d_inout, n, ctx.stream, d_inout);
+    } else {
+      return FromMontgomery(d_inout, n, ctx.stream, d_inout);
+    }
+  }
+
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, G2ProjectiveConvertMontgomery)(
+    curve_config::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx)
+  {
+    if (is_into) {
+      return ToMontgomery(d_inout, n, ctx.stream, d_inout);
+    } else {
+      return FromMontgomery(d_inout, n, ctx.stream, d_inout);
+    }
+  }
+
+#endif
 } // namespace mont
--- a/wrappers/rust/icicle-core/Cargo.toml
+++ b/wrappers/rust/icicle-core/Cargo.toml
@@ -17,15 +17,9 @@ ark-ec = { version = "0.4.0", optional = true, features = [ "parallel" ] }
 ark-poly = { version = "0.4.0", optional = true }
 ark-std = { version = "0.4.0", optional = true }

-# [build-dependencies]
-# cc = { version = "1.0", features = ["parallel"] }
-# cmake = "*"
-# bindgen = "*"
-# libc = "*" #TODO: move libc dependencies to build 
-
 [features]
 default = []
 arkworks = ["ark-ff", "ark-ec", "ark-poly", "ark-std"]
-# TODO: impl G2 and EC NTT
 g2 = []
+# TODO: impl EC NTT
 ec_ntt = []
--- a/wrappers/rust/icicle-core/src/curve.rs
+++ b/wrappers/rust/icicle-core/src/curve.rs
@@ -204,68 +204,83 @@ where
 macro_rules! impl_curve {
    (
        $curve_prefix:literal,
+        $curve_prefix_ident:ident,
        $curve:ident,
        $scalar_field:ident,
-        $base_field:ident
+        $base_field:ident,
+        $ark_config:ident,
+        $affine_type:ident,
+        $projective_type:ident
    ) => {
        #[derive(Debug, PartialEq, Copy, Clone)]
        pub struct $curve {}

-        pub type G1Affine = Affine<$curve>;
-        pub type G1Projective = Projective<$curve>;
+        pub type $affine_type = Affine<$curve>;
+        pub type $projective_type = Projective<$curve>;

-        extern "C" {
-            #[link_name = concat!($curve_prefix, "Eq")]
-            fn eq(point1: *const G1Projective, point2: *const G1Projective) -> bool;
-            #[link_name = concat!($curve_prefix, "ToAffine")]
-            fn proj_to_affine(point: *const G1Projective, point_out: *mut G1Affine);
-            #[link_name = concat!($curve_prefix, "GenerateProjectivePoints")]
-            fn generate_projective_points(points: *mut G1Projective, size: usize);
-            #[link_name = concat!($curve_prefix, "GenerateAffinePoints")]
-            fn generate_affine_points(points: *mut G1Affine, size: usize);
-            #[link_name = concat!($curve_prefix, "AffineConvertMontgomery")]
-            fn _convert_affine_montgomery(
-                points: *mut G1Affine,
-                size: usize,
-                is_into: bool,
-                ctx: *const DeviceContext,
-            ) -> CudaError;
-            #[link_name = concat!($curve_prefix, "ProjectiveConvertMontgomery")]
-            fn _convert_projective_montgomery(
-                points: *mut G1Projective,
-                size: usize,
-                is_into: bool,
-                ctx: *const DeviceContext,
-            ) -> CudaError;
+        mod $curve_prefix_ident {
+            use super::{$affine_type, $projective_type, CudaError, DeviceContext};
+
+            extern "C" {
+                #[link_name = concat!($curve_prefix, "Eq")]
+                pub(crate) fn eq(point1: *const $projective_type, point2: *const $projective_type) -> bool;
+                #[link_name = concat!($curve_prefix, "ToAffine")]
+                pub(crate) fn proj_to_affine(point: *const $projective_type, point_out: *mut $affine_type);
+                #[link_name = concat!($curve_prefix, "GenerateProjectivePoints")]
+                pub(crate) fn generate_projective_points(points: *mut $projective_type, size: usize);
+                #[link_name = concat!($curve_prefix, "GenerateAffinePoints")]
+                pub(crate) fn generate_affine_points(points: *mut $affine_type, size: usize);
+                #[link_name = concat!($curve_prefix, "AffineConvertMontgomery")]
+                pub(crate) fn _convert_affine_montgomery(
+                    points: *mut $affine_type,
+                    size: usize,
+                    is_into: bool,
+                    ctx: *const DeviceContext,
+                ) -> CudaError;
+                #[link_name = concat!($curve_prefix, "ProjectiveConvertMontgomery")]
+                pub(crate) fn _convert_projective_montgomery(
+                    points: *mut $projective_type,
+                    size: usize,
+                    is_into: bool,
+                    ctx: *const DeviceContext,
+                ) -> CudaError;
+            }
        }

        impl Curve for $curve {
            type BaseField = $base_field;
            type ScalarField = $scalar_field;

-            fn eq_proj(point1: *const G1Projective, point2: *const G1Projective) -> bool {
-                unsafe { eq(point1, point2) }
+            fn eq_proj(point1: *const $projective_type, point2: *const $projective_type) -> bool {
+                unsafe { $curve_prefix_ident::eq(point1, point2) }
            }

-            fn to_affine(point: *const Projective<$curve>, point_out: *mut Affine<$curve>) {
-                unsafe { proj_to_affine(point, point_out) };
+            fn to_affine(point: *const $projective_type, point_out: *mut $affine_type) {
+                unsafe { $curve_prefix_ident::proj_to_affine(point, point_out) };
            }

-            fn generate_random_projective_points(size: usize) -> Vec<G1Projective> {
-                let mut res = vec![G1Projective::zero(); size];
-                unsafe { generate_projective_points(&mut res[..] as *mut _ as *mut G1Projective, size) };
-                res
-            }
-
-            fn generate_random_affine_points(size: usize) -> Vec<G1Affine> {
-                let mut res = vec![G1Affine::zero(); size];
-                unsafe { generate_affine_points(&mut res[..] as *mut _ as *mut G1Affine, size) };
-                res
-            }
-
-            fn convert_affine_montgomery(points: &mut HostOrDeviceSlice<G1Affine>, is_into: bool) -> CudaError {
+            fn generate_random_projective_points(size: usize) -> Vec<$projective_type> {
+                let mut res = vec![$projective_type::zero(); size];
                unsafe {
-                    _convert_affine_montgomery(
+                    $curve_prefix_ident::generate_projective_points(
+                        &mut res[..] as *mut _ as *mut $projective_type,
+                        size,
+                    )
+                };
+                res
+            }
+
+            fn generate_random_affine_points(size: usize) -> Vec<$affine_type> {
+                let mut res = vec![$affine_type::zero(); size];
+                unsafe {
+                    $curve_prefix_ident::generate_affine_points(&mut res[..] as *mut _ as *mut $affine_type, size)
+                };
+                res
+            }
+
+            fn convert_affine_montgomery(points: &mut HostOrDeviceSlice<$affine_type>, is_into: bool) -> CudaError {
+                unsafe {
+                    $curve_prefix_ident::_convert_affine_montgomery(
                        points.as_mut_ptr(),
                        points.len(),
                        is_into,
@@ -274,9 +289,12 @@ macro_rules! impl_curve {
                }
            }

-            fn convert_projective_montgomery(points: &mut HostOrDeviceSlice<G1Projective>, is_into: bool) -> CudaError {
+            fn convert_projective_montgomery(
+                points: &mut HostOrDeviceSlice<$projective_type>,
+                is_into: bool,
+            ) -> CudaError {
                unsafe {
-                    _convert_projective_montgomery(
+                    $curve_prefix_ident::_convert_projective_montgomery(
                        points.as_mut_ptr(),
                        points.len(),
                        is_into,
@@ -286,7 +304,7 @@ macro_rules! impl_curve {
            }

            #[cfg(feature = "arkworks")]
-            type ArkSWConfig = ArkG1Config;
+            type ArkSWConfig = $ark_config;
        }
    };
 }
--- a/wrappers/rust/icicle-core/src/field.rs
+++ b/wrappers/rust/icicle-core/src/field.rs
@@ -2,7 +2,7 @@
 use crate::traits::ArkConvertible;
 use crate::traits::{FieldConfig, FieldImpl, MontgomeryConvertible};
 #[cfg(feature = "arkworks")]
-use ark_ff::{BigInteger, PrimeField};
+use ark_ff::{BigInteger, Field as ArkField, PrimeField};
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use std::fmt::{Debug, Display};
@@ -100,6 +100,7 @@ impl<const NUM_LIMBS: usize, F: FieldConfig> FieldImpl for Field<NUM_LIMBS, F> {
    }
 }

+#[doc(hidden)]
 pub trait MontgomeryConvertibleField<F: FieldImpl> {
    fn to_mont(values: &mut HostOrDeviceSlice<F>) -> CudaError;
    fn from_mont(values: &mut HostOrDeviceSlice<F>) -> CudaError;
@@ -123,12 +124,19 @@ impl<const NUM_LIMBS: usize, F: FieldConfig> ArkConvertible for Field<NUM_LIMBS,
    type ArkEquivalent = F::ArkField;

    fn to_ark(&self) -> Self::ArkEquivalent {
-        F::ArkField::from_le_bytes_mod_order(&self.to_bytes_le())
+        F::ArkField::from_random_bytes(&self.to_bytes_le()).unwrap()
    }

    fn from_ark(ark: Self::ArkEquivalent) -> Self {
-        let ark_bigint: <Self::ArkEquivalent as PrimeField>::BigInt = ark.into();
-        Self::from_bytes_le(&ark_bigint.to_bytes_le())
+        let ark_bytes: Vec<u8> = ark
+            .to_base_prime_field_elements()
+            .map(|x| {
+                x.into_bigint()
+                    .to_bytes_le()
+            })
+            .flatten()
+            .collect();
+        Self::from_bytes_le(&ark_bytes)
    }
 }

@@ -156,6 +164,7 @@ macro_rules! impl_field {
 macro_rules! impl_scalar_field {
    (
        $field_prefix:literal,
+        $field_prefix_ident:ident,
        $num_limbs:ident,
        $field_name:ident,
        $field_cfg:ident,
@@ -163,45 +172,52 @@ macro_rules! impl_scalar_field {
    ) => {
        impl_field!($num_limbs, $field_name, $field_cfg, $ark_equiv);

-        extern "C" {
-            #[link_name = concat!($field_prefix, "GenerateScalars")]
-            fn generate_scalars(scalars: *mut $field_name, size: usize);
+        mod $field_prefix_ident {
+            use crate::curve::{get_default_device_context, $field_name, CudaError, DeviceContext, HostOrDeviceSlice};

-            #[link_name = concat!($field_prefix, "ScalarConvertMontgomery")]
-            fn _convert_scalars_montgomery(
-                scalars: *mut $field_name,
-                size: usize,
+            extern "C" {
+                #[link_name = concat!($field_prefix, "GenerateScalars")]
+                pub(crate) fn generate_scalars(scalars: *mut $field_name, size: usize);
+
+                #[link_name = concat!($field_prefix, "ScalarConvertMontgomery")]
+                fn _convert_scalars_montgomery(
+                    scalars: *mut $field_name,
+                    size: usize,
+                    is_into: bool,
+                    ctx: *const DeviceContext,
+                ) -> CudaError;
+            }
+
+            pub(crate) fn convert_scalars_montgomery(
+                scalars: &mut HostOrDeviceSlice<$field_name>,
                is_into: bool,
-                ctx: *const DeviceContext,
-            ) -> CudaError;
+            ) -> CudaError {
+                unsafe {
+                    _convert_scalars_montgomery(
+                        scalars.as_mut_ptr(),
+                        scalars.len(),
+                        is_into,
+                        &get_default_device_context() as *const _ as *const DeviceContext,
+                    )
+                }
+            }
        }

        impl GenerateRandom<$field_name> for $field_cfg {
            fn generate_random(size: usize) -> Vec<$field_name> {
                let mut res = vec![$field_name::zero(); size];
-                unsafe { generate_scalars(&mut res[..] as *mut _ as *mut $field_name, size) };
+                unsafe { $field_prefix_ident::generate_scalars(&mut res[..] as *mut _ as *mut $field_name, size) };
                res
            }
        }

-        fn convert_scalars_montgomery(scalars: &mut HostOrDeviceSlice<$field_name>, is_into: bool) -> CudaError {
-            unsafe {
-                _convert_scalars_montgomery(
-                    scalars.as_mut_ptr(),
-                    scalars.len(),
-                    is_into,
-                    &get_default_device_context() as *const _ as *const DeviceContext,
-                )
-            }
-        }
-
        impl MontgomeryConvertibleField<$field_name> for $field_cfg {
            fn to_mont(values: &mut HostOrDeviceSlice<$field_name>) -> CudaError {
-                convert_scalars_montgomery(values, true)
+                $field_prefix_ident::convert_scalars_montgomery(values, true)
            }

            fn from_mont(values: &mut HostOrDeviceSlice<$field_name>) -> CudaError {
-                convert_scalars_montgomery(values, false)
+                $field_prefix_ident::convert_scalars_montgomery(values, false)
            }
        }
    };
--- a/wrappers/rust/icicle-core/src/msm/mod.rs
+++ b/wrappers/rust/icicle-core/src/msm/mod.rs
@@ -122,20 +122,25 @@ pub fn get_default_msm_config<C: Curve + MSM<C>>() -> MSMConfig<'static> {
 macro_rules! impl_msm {
    (
      $curve_prefix:literal,
+      $curve_prefix_indent:ident,
      $curve:ident
    ) => {
-        extern "C" {
-            #[link_name = concat!($curve_prefix, "MSMCuda")]
-            fn msm_cuda(
-                scalars: *const <$curve as Curve>::ScalarField,
-                points: *const Affine<$curve>,
-                count: i32,
-                config: &MSMConfig,
-                out: *mut Projective<$curve>,
-            ) -> CudaError;
+        mod $curve_prefix_indent {
+            use super::{$curve, Affine, CudaError, Curve, MSMConfig, Projective};

-            #[link_name = concat!($curve_prefix, "DefaultMSMConfig")]
-            fn default_msm_config() -> MSMConfig<'static>;
+            extern "C" {
+                #[link_name = concat!($curve_prefix, "MSMCuda")]
+                pub(crate) fn msm_cuda(
+                    scalars: *const <$curve as Curve>::ScalarField,
+                    points: *const Affine<$curve>,
+                    count: i32,
+                    config: &MSMConfig,
+                    out: *mut Projective<$curve>,
+                ) -> CudaError;
+
+                #[link_name = concat!($curve_prefix, "DefaultMSMConfig")]
+                pub(crate) fn default_msm_config() -> MSMConfig<'static>;
+            }
        }

        impl MSM<$curve> for $curve {
@@ -146,7 +151,7 @@ macro_rules! impl_msm {
                results: &mut HostOrDeviceSlice<Projective<$curve>>,
            ) -> IcicleResult<()> {
                unsafe {
-                    msm_cuda(
+                    $curve_prefix_indent::msm_cuda(
                        scalars.as_ptr(),
                        points.as_ptr(),
                        (scalars.len() / results.len()) as i32,
@@ -158,7 +163,7 @@ macro_rules! impl_msm {
            }

            fn get_default_msm_config() -> MSMConfig<'static> {
-                unsafe { default_msm_config() }
+                unsafe { $curve_prefix_indent::default_msm_config() }
            }
        }
    };
--- a/wrappers/rust/icicle-core/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-core/src/ntt/mod.rs
@@ -151,21 +151,26 @@ where
 macro_rules! impl_ntt {
    (
      $field_prefix:literal,
+      $field_prefix_ident:ident,
      $field:ident,
      $field_config:ident
    ) => {
-        extern "C" {
-            #[link_name = concat!($field_prefix, "NTTCuda")]
-            fn ntt_cuda(
-                input: *const $field,
-                size: i32,
-                dir: NTTDir,
-                config: &NTTConfig<$field>,
-                output: *mut $field,
-            ) -> CudaError;
+        mod $field_prefix_ident {
+            use crate::ntt::{$field, $field_config, CudaError, DeviceContext, NTTConfig, NTTDir};

-            #[link_name = concat!($field_prefix, "InitializeDomain")]
-            fn initialize_ntt_domain(primitive_root: $field, ctx: &DeviceContext) -> CudaError;
+            extern "C" {
+                #[link_name = concat!($field_prefix, "NTTCuda")]
+                pub(crate) fn ntt_cuda(
+                    input: *const $field,
+                    size: i32,
+                    dir: NTTDir,
+                    config: &NTTConfig<$field>,
+                    output: *mut $field,
+                ) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "InitializeDomain")]
+                pub(crate) fn initialize_ntt_domain(primitive_root: $field, ctx: &DeviceContext) -> CudaError;
+            }
        }

        impl NTT<$field> for $field_config {
@@ -176,7 +181,7 @@ macro_rules! impl_ntt {
                output: &mut HostOrDeviceSlice<$field>,
            ) -> IcicleResult<()> {
                unsafe {
-                    ntt_cuda(
+                    $field_prefix_ident::ntt_cuda(
                        input.as_ptr(),
                        (input.len() / (cfg.batch_size as usize)) as i32,
                        dir,
@@ -188,7 +193,7 @@ macro_rules! impl_ntt {
            }

            fn initialize_domain(primitive_root: $field, ctx: &DeviceContext) -> IcicleResult<()> {
-                unsafe { initialize_ntt_domain(primitive_root, ctx).wrap() }
+                unsafe { $field_prefix_ident::initialize_ntt_domain(primitive_root, ctx).wrap() }
            }

            fn get_default_ntt_config() -> NTTConfig<'static, $field> {
--- a/wrappers/rust/icicle-core/src/ntt/tests.rs
+++ b/wrappers/rust/icicle-core/src/ntt/tests.rs
@@ -1,4 +1,4 @@
-use ark_ff::{FftField, One, PrimeField};
+use ark_ff::{FftField, Field as ArkField, One};
 use ark_poly::{EvaluationDomain, GeneralEvaluationDomain};
 use ark_std::{ops::Neg, test_rng, UniformRand};
 use icicle_cuda_runtime::device_context::get_default_device_context;
@@ -157,7 +157,7 @@ where

 pub fn check_ntt_arbitrary_coset<F: FieldImpl + ArkConvertible>()
 where
-    F::ArkEquivalent: FftField + PrimeField,
+    F::ArkEquivalent: FftField + ArkField,
    <F as FieldImpl>::Config: NTT<F> + GenerateRandom<F>,
 {
    let mut seed = test_rng();
@@ -179,7 +179,7 @@ where
            let mut ark_scalars = scalars
                .as_slice()
                .iter()
-                .map(|v| F::ArkEquivalent::from_le_bytes_mod_order(&v.to_bytes_le()))
+                .map(|v| F::ArkEquivalent::from_random_bytes(&v.to_bytes_le()).unwrap())
                .collect::<Vec<F::ArkEquivalent>>();

            let mut config = get_default_ntt_config();
--- a/wrappers/rust/icicle-core/src/traits.rs
+++ b/wrappers/rust/icicle-core/src/traits.rs
@@ -1,6 +1,6 @@
 use crate::error::IcicleResult;
 #[cfg(feature = "arkworks")]
-use ark_ff::PrimeField;
+use ark_ff::Field as ArkField;
 use icicle_cuda_runtime::{error::CudaError, memory::HostOrDeviceSlice};
 use std::{fmt::Debug, mem::MaybeUninit};

@@ -12,7 +12,7 @@ pub trait GenerateRandom<F> {
 #[doc(hidden)]
 pub trait FieldConfig: Debug + PartialEq + Copy + Clone {
    #[cfg(feature = "arkworks")]
-    type ArkField: PrimeField;
+    type ArkField: ArkField;
 }

 pub trait FieldImpl: Debug + PartialEq + Copy + Clone + Into<Self::Repr> + From<Self::Repr> {
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml
@@ -26,4 +26,7 @@ icicle-bls12-377 = { path = ".", features = ["arkworks"] }

 [features]
 default = []
+bw6-761 = []
+bw6-761-g2 = ["bw6-761"]
+g2 = ["icicle-core/g2"]
 arkworks = ["ark-bls12-377", "icicle-core/arkworks"]
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs
@@ -4,16 +4,46 @@ fn main() {
    println!("cargo:rerun-if-env-changed=CXXFLAGS");
    println!("cargo:rerun-if-changed=../../../../icicle");

-    let out_dir = Config::new("../../../../icicle")
-                .define("BUILD_TESTS", "OFF") //TODO: feature
-                .define("CURVE", "bls12_377")
-                .define("CMAKE_BUILD_TYPE", "Release")
-                .build_target("icicle")
-                .build();
+    // Base config
+    let mut config = Config::new("../../../../icicle");
+    config
+        .define("BUILD_TESTS", "OFF")
+        .define("CURVE", "bls12_377")
+        .define("CMAKE_BUILD_TYPE", "Release");
+
+    // Optional Features
+    #[cfg(feature = "g2")]
+    config.define("G2_DEFINED", "ON");
+
+    // Build
+    let out_dir = config
+        .build_target("icicle")
+        .build();

    println!("cargo:rustc-link-search={}/build", out_dir.display());
-
    println!("cargo:rustc-link-lib=ingo_bls12_377");
+
+    if cfg!(feature = "bw6-761") {
+        // Base config
+        let mut config = Config::new("../../../../icicle");
+        config
+            .define("BUILD_TESTS", "OFF")
+            .define("CURVE", "bw6_761")
+            .define("CMAKE_BUILD_TYPE", "Release");
+
+        // Optional Features
+        #[cfg(feature = "bw6-761-g2")]
+        config.define("G2_DEFINED", "ON");
+
+        // Build
+        let out_dir = config
+            .build_target("icicle")
+            .build();
+
+        println!("cargo:rustc-link-search={}/build", out_dir.display());
+        println!("cargo:rustc-link-lib=ingo_bw6_761");
+    }
+
    println!("cargo:rustc-link-lib=stdc++");
    println!("cargo:rustc-link-lib=cudart");
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/curve.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/curve.rs
@@ -1,5 +1,7 @@
 #[cfg(feature = "arkworks")]
 use ark_bls12_377::{g1::Config as ArkG1Config, Fq, Fr};
+#[cfg(all(feature = "arkworks", feature = "g2"))]
+use ark_bls12_377::{g2::Config as ArkG2Config, Fq2};
 use icicle_core::curve::{Affine, Curve, Projective};
 use icicle_core::field::{Field, MontgomeryConvertibleField};
 use icicle_core::traits::{FieldConfig, FieldImpl, GenerateRandom};
@@ -10,21 +12,53 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;

 pub(crate) const SCALAR_LIMBS: usize = 4;
 pub(crate) const BASE_LIMBS: usize = 6;
+#[cfg(feature = "g2")]
+pub(crate) const G2_BASE_LIMBS: usize = 12;

-impl_scalar_field!("bls12_377", SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
+impl_scalar_field!("bls12_377", bls12_377_sf, SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
+#[cfg(feature = "bw6-761")]
+impl_scalar_field!("bw6_761", bw6_761_sf, BASE_LIMBS, BaseField, BaseCfg, Fq);
+#[cfg(not(feature = "bw6-761"))]
 impl_field!(BASE_LIMBS, BaseField, BaseCfg, Fq);
-impl_curve!("bls12_377", CurveCfg, ScalarField, BaseField);
+#[cfg(feature = "g2")]
+impl_field!(G2_BASE_LIMBS, G2BaseField, G2BaseCfg, Fq2);
+impl_curve!(
+    "bls12_377",
+    bls12_377,
+    CurveCfg,
+    ScalarField,
+    BaseField,
+    ArkG1Config,
+    G1Affine,
+    G1Projective
+);
+#[cfg(feature = "g2")]
+impl_curve!(
+    "bls12_377G2",
+    bls12_377_g2,
+    G2CurveCfg,
+    ScalarField,
+    G2BaseField,
+    ArkG2Config,
+    G2Affine,
+    G2Projective
+);

 #[cfg(test)]
 mod tests {
-    use super::ScalarField;
-    use super::{CurveCfg, BASE_LIMBS};
+    use super::{CurveCfg, ScalarField, BASE_LIMBS};
+    #[cfg(feature = "g2")]
+    use super::{G2CurveCfg, G2_BASE_LIMBS};
    use icicle_core::curve::Curve;
-    use icicle_core::impl_curve_tests;
-    use icicle_core::impl_field_tests;
    use icicle_core::tests::*;
    use icicle_core::traits::FieldImpl;
+    use icicle_core::{impl_curve_tests, impl_field_tests};

    impl_field_tests!(ScalarField);
    impl_curve_tests!(BASE_LIMBS, CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_curve_tests!(G2_BASE_LIMBS, G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/msm/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/msm/mod.rs
@@ -1,4 +1,6 @@
 use crate::curve::CurveCfg;
+#[cfg(feature = "g2")]
+use crate::curve::G2CurveCfg;
 use icicle_core::{
    curve::{Affine, Curve, Projective},
    error::IcicleResult,
@@ -9,14 +11,22 @@ use icicle_core::{
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_msm!("bls12_377", CurveCfg);
+impl_msm!("bls12_377", bls12_377, CurveCfg);
+#[cfg(feature = "g2")]
+impl_msm!("bls12_377G2", bls12_377_g2, G2CurveCfg);

 #[cfg(test)]
 pub(crate) mod tests {
+    use crate::curve::CurveCfg;
+    #[cfg(feature = "g2")]
+    use crate::curve::G2CurveCfg;
    use icicle_core::impl_msm_tests;
    use icicle_core::msm::tests::*;

-    use crate::curve::CurveCfg;
-
    impl_msm_tests!(CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_msm_tests!(G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/ntt/mod.rs
@@ -1,3 +1,5 @@
+#[cfg(feature = "bw6-761")]
+use crate::curve::{BaseCfg, BaseField};
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
@@ -8,7 +10,9 @@ use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_ntt!("bls12_377", ScalarField, ScalarCfg);
+impl_ntt!("bls12_377", bls12_377, ScalarField, ScalarCfg);
+#[cfg(feature = "bw6-761")]
+impl_ntt!("bw6_761", bw6_761, BaseField, BaseCfg);

 #[cfg(test)]
 pub(crate) mod tests {
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml
@@ -26,4 +26,5 @@ icicle-bls12-381 = { path = ".", features = ["arkworks"] }

 [features]
 default = []
+g2 = ["icicle-core/g2"]
 arkworks = ["ark-bls12-381", "icicle-core/arkworks"]
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs
@@ -4,12 +4,21 @@ fn main() {
    println!("cargo:rerun-if-env-changed=CXXFLAGS");
    println!("cargo:rerun-if-changed=../../../../icicle");

-    let out_dir = Config::new("../../../../icicle")
-                .define("BUILD_TESTS", "OFF") //TODO: feature
-                .define("CURVE", "bls12_381")
-                .define("CMAKE_BUILD_TYPE", "Release")
-                .build_target("icicle")
-                .build();
+    // Base config
+    let mut config = Config::new("../../../../icicle");
+    config
+        .define("BUILD_TESTS", "OFF")
+        .define("CURVE", "bls12_381")
+        .define("CMAKE_BUILD_TYPE", "Release");
+
+    // Optional Features
+    #[cfg(feature = "g2")]
+    config.define("G2_DEFINED", "ON");
+
+    // Build
+    let out_dir = config
+        .build_target("icicle")
+        .build();

    println!("cargo:rustc-link-search={}/build", out_dir.display());

--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/curve.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/curve.rs
@@ -1,5 +1,7 @@
 #[cfg(feature = "arkworks")]
 use ark_bls12_381::{g1::Config as ArkG1Config, Fq, Fr};
+#[cfg(all(feature = "arkworks", feature = "g2"))]
+use ark_bls12_381::{g2::Config as ArkG2Config, Fq2};
 use icicle_core::curve::{Affine, Curve, Projective};
 use icicle_core::field::{Field, MontgomeryConvertibleField};
 use icicle_core::traits::{FieldConfig, FieldImpl, GenerateRandom};
@@ -10,21 +12,50 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;

 pub(crate) const SCALAR_LIMBS: usize = 4;
 pub(crate) const BASE_LIMBS: usize = 6;
+#[cfg(feature = "g2")]
+pub(crate) const G2_BASE_LIMBS: usize = 12;

-impl_scalar_field!("bls12_381", SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
+impl_scalar_field!("bls12_381", bls12_381_sf, SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
 impl_field!(BASE_LIMBS, BaseField, BaseCfg, Fq);
-impl_curve!("bls12_381", CurveCfg, ScalarField, BaseField);
+#[cfg(feature = "g2")]
+impl_field!(G2_BASE_LIMBS, G2BaseField, G2BaseCfg, Fq2);
+impl_curve!(
+    "bls12_381",
+    bls12_381,
+    CurveCfg,
+    ScalarField,
+    BaseField,
+    ArkG1Config,
+    G1Affine,
+    G1Projective
+);
+#[cfg(feature = "g2")]
+impl_curve!(
+    "bls12_381G2",
+    bls12_381_g2,
+    G2CurveCfg,
+    ScalarField,
+    G2BaseField,
+    ArkG2Config,
+    G2Affine,
+    G2Projective
+);

 #[cfg(test)]
 mod tests {
-    use super::ScalarField;
-    use super::{CurveCfg, BASE_LIMBS};
+    use super::{CurveCfg, ScalarField, BASE_LIMBS};
+    #[cfg(feature = "g2")]
+    use super::{G2CurveCfg, G2_BASE_LIMBS};
    use icicle_core::curve::Curve;
-    use icicle_core::impl_curve_tests;
-    use icicle_core::impl_field_tests;
    use icicle_core::tests::*;
    use icicle_core::traits::FieldImpl;
+    use icicle_core::{impl_curve_tests, impl_field_tests};

    impl_field_tests!(ScalarField);
    impl_curve_tests!(BASE_LIMBS, CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_curve_tests!(G2_BASE_LIMBS, G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/msm/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/msm/mod.rs
@@ -1,4 +1,6 @@
 use crate::curve::CurveCfg;
+#[cfg(feature = "g2")]
+use crate::curve::G2CurveCfg;
 use icicle_core::{
    curve::{Affine, Curve, Projective},
    error::IcicleResult,
@@ -9,14 +11,22 @@ use icicle_core::{
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_msm!("bls12_381", CurveCfg);
+impl_msm!("bls12_381", bls12_381, CurveCfg);
+#[cfg(feature = "g2")]
+impl_msm!("bls12_381G2", bls12_381_g2, G2CurveCfg);

 #[cfg(test)]
 pub(crate) mod tests {
+    use crate::curve::CurveCfg;
+    #[cfg(feature = "g2")]
+    use crate::curve::G2CurveCfg;
    use icicle_core::impl_msm_tests;
    use icicle_core::msm::tests::*;

-    use crate::curve::CurveCfg;
-
    impl_msm_tests!(CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_msm_tests!(G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/ntt/mod.rs
@@ -8,7 +8,7 @@ use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_ntt!("bls12_381", ScalarField, ScalarCfg);
+impl_ntt!("bls12_381", bls12_381, ScalarField, ScalarCfg);

 #[cfg(test)]
 pub(crate) mod tests {
--- a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml
@@ -26,4 +26,5 @@ icicle-bn254 = { path = ".", features = ["arkworks"] }

 [features]
 default = []
+g2 = ["icicle-core/g2"]
 arkworks = ["ark-bn254", "icicle-core/arkworks"]
--- a/wrappers/rust/icicle-curves/icicle-bn254/build.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/build.rs
@@ -4,12 +4,21 @@ fn main() {
    println!("cargo:rerun-if-env-changed=CXXFLAGS");
    println!("cargo:rerun-if-changed=../../../../icicle");

-    let out_dir = Config::new("../../../../icicle")
-                .define("BUILD_TESTS", "OFF") //TODO: feature
-                .define("CURVE", "bn254")
-                .define("CMAKE_BUILD_TYPE", "Release")
-                .build_target("icicle")
-                .build();
+    // Base config
+    let mut config = Config::new("../../../../icicle");
+    config
+        .define("BUILD_TESTS", "OFF")
+        .define("CURVE", "bn254")
+        .define("CMAKE_BUILD_TYPE", "Release");
+
+    // Optional Features
+    #[cfg(feature = "g2")]
+    config.define("G2_DEFINED", "ON");
+
+    // Build
+    let out_dir = config
+        .build_target("icicle")
+        .build();

    println!("cargo:rustc-link-search={}/build", out_dir.display());

--- a/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs
@@ -1,5 +1,7 @@
 #[cfg(feature = "arkworks")]
 use ark_bn254::{g1::Config as ArkG1Config, Fq, Fr};
+#[cfg(all(feature = "arkworks", feature = "g2"))]
+use ark_bn254::{g2::Config as ArkG2Config, Fq2};
 use icicle_core::curve::{Affine, Curve, Projective};
 use icicle_core::field::{Field, MontgomeryConvertibleField};
 use icicle_core::traits::{FieldConfig, FieldImpl, GenerateRandom};
@@ -10,21 +12,50 @@ use icicle_cuda_runtime::memory::HostOrDeviceSlice;

 pub(crate) const SCALAR_LIMBS: usize = 4;
 pub(crate) const BASE_LIMBS: usize = 4;
+#[cfg(feature = "g2")]
+pub(crate) const G2_BASE_LIMBS: usize = 8;

-impl_scalar_field!("bn254", SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
+impl_scalar_field!("bn254", bn254_sf, SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
 impl_field!(BASE_LIMBS, BaseField, BaseCfg, Fq);
-impl_curve!("bn254", CurveCfg, ScalarField, BaseField);
+#[cfg(feature = "g2")]
+impl_field!(G2_BASE_LIMBS, G2BaseField, G2BaseCfg, Fq2);
+impl_curve!(
+    "bn254",
+    bn254,
+    CurveCfg,
+    ScalarField,
+    BaseField,
+    ArkG1Config,
+    G1Affine,
+    G1Projective
+);
+#[cfg(feature = "g2")]
+impl_curve!(
+    "bn254G2",
+    bn254_g2,
+    G2CurveCfg,
+    ScalarField,
+    G2BaseField,
+    ArkG2Config,
+    G2Affine,
+    G2Projective
+);

 #[cfg(test)]
 mod tests {
-    use super::ScalarField;
-    use super::{CurveCfg, BASE_LIMBS};
+    use super::{CurveCfg, ScalarField, BASE_LIMBS};
+    #[cfg(feature = "g2")]
+    use super::{G2CurveCfg, G2_BASE_LIMBS};
    use icicle_core::curve::Curve;
-    use icicle_core::impl_curve_tests;
-    use icicle_core::impl_field_tests;
    use icicle_core::tests::*;
    use icicle_core::traits::FieldImpl;
+    use icicle_core::{impl_curve_tests, impl_field_tests};

    impl_field_tests!(ScalarField);
    impl_curve_tests!(BASE_LIMBS, CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_curve_tests!(G2_BASE_LIMBS, G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/msm/mod.rs
@@ -1,4 +1,6 @@
 use crate::curve::CurveCfg;
+#[cfg(feature = "g2")]
+use crate::curve::G2CurveCfg;
 use icicle_core::{
    curve::{Affine, Curve, Projective},
    error::IcicleResult,
@@ -8,14 +10,22 @@ use icicle_core::{
 };
 use icicle_cuda_runtime::{error::CudaError, memory::HostOrDeviceSlice};

-impl_msm!("bn254", CurveCfg);
+impl_msm!("bn254", bn254, CurveCfg);
+#[cfg(feature = "g2")]
+impl_msm!("bn254G2", bn254_g2, G2CurveCfg);

 #[cfg(test)]
 pub(crate) mod tests {
+    use crate::curve::CurveCfg;
+    #[cfg(feature = "g2")]
+    use crate::curve::G2CurveCfg;
    use icicle_core::impl_msm_tests;
    use icicle_core::msm::tests::*;

-    use crate::curve::CurveCfg;
-
    impl_msm_tests!(CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_msm_tests!(G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs
@@ -8,7 +8,7 @@ use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_ntt!("bn254", ScalarField, ScalarCfg);
+impl_ntt!("bn254", bn254, ScalarField, ScalarCfg);

 #[cfg(test)]
 pub(crate) mod tests {
--- a/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bw6-761/Cargo.toml
@@ -3,13 +3,14 @@ name = "icicle-bw6-761"
 version = "1.0.0"
 edition = "2021"
 authors = [ "Ingonyama" ]
-description = "Rust wrapper for the CUDA implementation of BN254 pairing friendly elliptic curve by Ingonyama"
+description = "Rust wrapper for the CUDA implementation of BW6-761 pairing friendly elliptic curve by Ingonyama"
 homepage = "https://www.ingonyama.com"
 repository = "https://github.com/ingonyama-zk/icicle"

 [dependencies]
 icicle-core = { path = "../../icicle-core" }
 icicle-cuda-runtime = { path = "../../icicle-cuda-runtime" }
+icicle-bls12-377 = { path = "../../icicle-curves/icicle-bls12-377", features = ["bw6-761"] }
 ark-bw6-761 = { version = "0.4.0", optional = true }

 [build-dependencies]
@@ -26,4 +27,5 @@ icicle-bw6-761 = { path = ".", features = ["arkworks"] }

 [features]
 default = []
-arkworks = ["ark-bw6-761", "icicle-core/arkworks"]
+g2 = ["icicle-bls12-377/bw6-761-g2"]
+arkworks = ["ark-bw6-761", "icicle-core/arkworks", "icicle-bls12-377/arkworks"]
--- a/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs
+++ b/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs
@@ -1,19 +0,0 @@
-use cmake::Config;
-
-fn main() {
-    println!("cargo:rerun-if-env-changed=CXXFLAGS");
-    println!("cargo:rerun-if-changed=../../../../icicle");
-
-    let out_dir = Config::new("../../../../icicle")
-                .define("BUILD_TESTS", "OFF") //TODO: feature
-                .define("CURVE", "bw6_761")
-                .define("CMAKE_BUILD_TYPE", "Release")
-                .build_target("icicle")
-                .build();
-
-    println!("cargo:rustc-link-search={}/build", out_dir.display());
-
-    println!("cargo:rustc-link-lib=ingo_bw6_761");
-    println!("cargo:rustc-link-lib=stdc++");
-    println!("cargo:rustc-link-lib=cudart");
-}
--- a/wrappers/rust/icicle-curves/icicle-bw6-761/src/curve.rs
+++ b/wrappers/rust/icicle-curves/icicle-bw6-761/src/curve.rs
@@ -1,30 +1,57 @@
+#[cfg(all(feature = "arkworks", feature = "g2"))]
+use ark_bw6_761::g2::Config as ArkG2Config;
 #[cfg(feature = "arkworks")]
-use ark_bw6_761::{g1::Config as ArkG1Config, Fq, Fr};
+use ark_bw6_761::{g1::Config as ArkG1Config, Fq};
+use icicle_bls12_377::curve::BaseField as bls12_377BaseField;
 use icicle_core::curve::{Affine, Curve, Projective};
-use icicle_core::field::{Field, MontgomeryConvertibleField};
-use icicle_core::traits::{FieldConfig, FieldImpl, GenerateRandom};
-use icicle_core::{impl_curve, impl_field, impl_scalar_field};
+use icicle_core::field::Field;
+use icicle_core::traits::FieldConfig;
+use icicle_core::{impl_curve, impl_field};
 use icicle_cuda_runtime::device_context::{get_default_device_context, DeviceContext};
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-pub(crate) const SCALAR_LIMBS: usize = 6;
 pub(crate) const BASE_LIMBS: usize = 12;

-impl_scalar_field!("bw6_761", SCALAR_LIMBS, ScalarField, ScalarCfg, Fr);
 impl_field!(BASE_LIMBS, BaseField, BaseCfg, Fq);
-impl_curve!("bw6_761", CurveCfg, ScalarField, BaseField);
+pub type ScalarField = bls12_377BaseField;
+impl_curve!(
+    "bw6_761",
+    bw6_761,
+    CurveCfg,
+    ScalarField,
+    BaseField,
+    ArkG1Config,
+    G1Affine,
+    G1Projective
+);
+#[cfg(feature = "g2")]
+impl_curve!(
+    "bw6_761G2",
+    bw6_761_g2,
+    G2CurveCfg,
+    ScalarField,
+    BaseField,
+    ArkG2Config,
+    G2Affine,
+    G2Projective
+);

 #[cfg(test)]
 mod tests {
-    use super::ScalarField;
-    use super::{CurveCfg, BASE_LIMBS};
+    #[cfg(feature = "g2")]
+    use super::G2CurveCfg;
+    use super::{CurveCfg, ScalarField, BASE_LIMBS};
    use icicle_core::curve::Curve;
-    use icicle_core::impl_curve_tests;
-    use icicle_core::impl_field_tests;
    use icicle_core::tests::*;
    use icicle_core::traits::FieldImpl;
+    use icicle_core::{impl_curve_tests, impl_field_tests};

    impl_field_tests!(ScalarField);
    impl_curve_tests!(BASE_LIMBS, CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_curve_tests!(BASE_LIMBS, G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bw6-761/src/msm/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bw6-761/src/msm/mod.rs
@@ -1,4 +1,6 @@
 use crate::curve::CurveCfg;
+#[cfg(feature = "g2")]
+use crate::curve::G2CurveCfg;
 use icicle_core::{
    curve::{Affine, Curve, Projective},
    error::IcicleResult,
@@ -9,14 +11,22 @@ use icicle_core::{
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;

-impl_msm!("bw6_761", CurveCfg);
+impl_msm!("bw6_761", bw6_761, CurveCfg);
+#[cfg(feature = "g2")]
+impl_msm!("bw6_761G2", bw6_761_g2, G2CurveCfg);

 #[cfg(test)]
 pub(crate) mod tests {
+    use crate::curve::CurveCfg;
+    #[cfg(feature = "g2")]
+    use crate::curve::G2CurveCfg;
    use icicle_core::impl_msm_tests;
    use icicle_core::msm::tests::*;

-    use crate::curve::CurveCfg;
-
    impl_msm_tests!(CurveCfg);
+    #[cfg(feature = "g2")]
+    mod g2 {
+        use super::*;
+        impl_msm_tests!(G2CurveCfg);
+    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bw6-761/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bw6-761/src/ntt/mod.rs
@@ -1,15 +1,3 @@
-use crate::curve::{ScalarCfg, ScalarField};
-
-use icicle_core::error::IcicleResult;
-use icicle_core::impl_ntt;
-use icicle_core::ntt::{NTTConfig, NTTDir, NTT};
-use icicle_core::traits::IcicleResultWrap;
-use icicle_cuda_runtime::device_context::DeviceContext;
-use icicle_cuda_runtime::error::CudaError;
-use icicle_cuda_runtime::memory::HostOrDeviceSlice;
-
-impl_ntt!("bw6_761", ScalarField, ScalarCfg);
-
 #[cfg(test)]
 pub(crate) mod tests {
    use crate::curve::ScalarField;
Author	SHA1	Message	Date
stas	69d0779c16	added G2 to MSM example	2024-01-23 17:11:21 -05:00
stas	368a5eb72a	Merge remote-tracking branch 'origin/develop/dima/g2_and_bw_scalar_field' into update-examples-for-new-API	2024-01-23 12:05:09 -05:00
DmytroTym	c5ff8ceda7	test-threads in CI and msm check corrected	2024-01-23 16:06:24 +02:00
DmytroTym	36ace1dac8	cargo fmt	2024-01-23 14:38:47 +02:00
DmytroTym	c7cff01452	After merge fix	2024-01-23 14:12:25 +02:00
DmytroTym	b7c861586e	Merge dev into develop/dima/g2_and_bw_scalar_field	2024-01-23 12:26:21 +02:00
DmytroTym	e58b472882	enable g2 and remove test-threads=1 in CI	2024-01-22 19:03:06 +02:00
DmytroTym	690700f132	Address comments + ntt size check	2024-01-22 18:58:52 +02:00
stas	d9cdd83d44	dev merge	2024-01-21 13:37:09 -05:00
stas	0249ea9d8b	merged with dev	2024-01-21 13:37:00 -05:00
stas	a98ccf6cf7	updated paths in CMAKE configs to compile outside the container	2024-01-21 11:43:20 -05:00
LeonHibnik	8474d93461	update rust msm and ntt examples	2024-01-17 21:54:27 +02:00
stas	f9e7f49030	updated c++ NTT example	2024-01-12 17:25:36 -05:00
DmytroTym	73b4ff9968	Some C++-side comments	2024-01-12 22:29:14 +02:00
DmytroTym	566fd37296	fmt	2024-01-12 14:56:53 +02:00
stas	39acedf5cb	ntt example - memory issues	2024-01-12 07:43:50 -05:00
DmytroTym	6144e519eb	G2 MSM fixed, Rust side G2 wrappers added	2024-01-12 14:13:29 +02:00
stas	02d2b04d7e	update c++ multiply	2024-01-11 13:59:29 -05:00
DmytroTym	0376fb94c9	Merge commit '67586e01f87b76f55a988ffc18da143a261e7f5f' into develop/dima/g2_and_bw_scalar_field	2024-01-11 18:34:47 +02:00
Otsar	c1a32a9879	Update README.md (#339 ) Added badge	2024-01-11 18:34:32 +02:00
DmytroTym	320140d1e8	cmake g2	2024-01-11 18:32:21 +02:00
DmytroTym	3f3a8db5c7	G2 feature in Rust	2024-01-11 18:30:39 +02:00
DmytroTym	f2c1c45511	BW scalar field is now the same as BLS base field	2024-01-10 18:32:52 +02:00