Release v1.5.0 (#393)

# Contents of this release Examples: multi-gpu example #381 Examples: updates example compares Radix2 and MixedRadix NTTs #383 Feat: add vector operations bindings to Rust #384 Examples: update examples with new vec ops #388 Feat: Grumpkin curve implementation #379 Feat: mixed-radix NTT fast twiddles mode #382 Docs: Update README.md #385 #387 README: Update Hall of Fame section #394 Examples: add rust poseidon example #392 Feat: GoLang bindings for v1.x #386
2026-01-08 23:17:54 -05:00 · 2024-02-23 10:15:18 +02:00
parent fc6badcb35 e8cd2d7a98
commit e6035698b5
281 changed files with 23885 additions and 11326 deletions
--- a/.github/changed-files.yml
+++ b/.github/changed-files.yml
@@ -1,5 +1,7 @@
 golang:
-  - goicicle/**/*.go'
+  - wrappers/golang/**/*.go
+  - wrappers/golang/**/*.h
+  - wrappers/golang/**/*.tmpl
   - go.mod
 rust:
  - wrappers/rust
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -23,7 +23,7 @@ concurrency:

 jobs:  
  test-examples:
-    runs-on: [self-hosted, Linux, X64, icicle] # ubuntu-latest
+    runs-on: [self-hosted, Linux, X64, icicle, examples]
    steps:
    - name: Checkout
      uses: actions/checkout@v2
--- a/.github/workflows/main-build.yml
+++ b/.github/workflows/main-build.yml
@@ -80,18 +80,22 @@ jobs:
      # Building from the root workspace will build all members of the workspace by default
      run: cargo build --release --verbose

-  # TODO: Re-enable once Golang bindings for v1+ is finished
-  # build-golang-linux:
-  #   name: Build Golang on Linux
-  #   runs-on: [self-hosted, Linux, X64, icicle]
-  #   needs: check-changed-files
-  #   steps:
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Build CUDA libs
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: make all
-  #     working-directory: ./goicicle
+  build-golang-linux:
+    name: Build Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      working-directory: ./wrappers/golang
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh ${{ matrix.curve }} ON

  # TODO: Add once Golang make file supports building for Windows
  # build-golang-windows:
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -75,20 +75,25 @@ jobs:
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: ctest
  
-  # TODO: Re-enable once Golang bindings for v1+ is finished
-  # test-golang-linux:
-  #   name: Test Golang on Linux
-  #   runs-on: [self-hosted, Linux, X64, icicle]
-  #   needs: check-changed-files
-  #   steps:
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Build CUDA libs
-  #     working-directory: ./goicicle
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: make libbn254.so
-  #   - name: Run Golang Tests
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: |
-  #       export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
-  #       go test ./goicicle/curves/bn254 -count=1
+  test-golang-linux:
+    name: Test Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    # strategy:
+    #   matrix:
+    #     curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # builds all curves with g2 ON
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh all ON
+    - name: Run Golang Tests
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        go test --tags=g2 ./... -count=1 -timeout 60m
--- a/README.md
+++ b/README.md
@@ -114,6 +114,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
 - [Robik](https://github.com/robik75), for his ongoing support and mentorship
 - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
 - [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
+- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE

 ## Help & Support

@@ -142,10 +143,10 @@ See [LICENSE-MIT][LMIT] for details.
 [GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
 [ICICLE-CORE]: ./icicle/
 [ICICLE-RUST]: ./wrappers/rust/
-[ICICLE-GO]: ./goicicle/
+[ICICLE-GO]: ./wrappers/golang/
 [ICICLE-CORE-README]: ./icicle/README.md
 [ICICLE-RUST-README]: ./wrappers/rust/README.md
-[ICICLE-GO-README]: ./goicicle/README.md
+[ICICLE-GO-README]: ./wrappers/golang/README.md
 [documentation]: https://dev.ingonyama.com/icicle/overview
 [examples]: ./examples/

--- a/examples/c++/multi-gpu-poseidon/CMakeLists.txt
+++ b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
--- a/examples/c++/multi-gpu-poseidon/README.md
+++ b/examples/c++/multi-gpu-poseidon/README.md
@@ -0,0 +1,52 @@
+# Icicle example: using multiple GPUs to hash a large dataset
+
+## Best-Practices
+
+This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running that example first.
+
+## Key-Takeaway
+
+Use a `device_context::DeviceContext` variable to select which GPU to use.
+Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
+
+## Concise Usage Explanation
+
+1. Include c++ threads
+
+```c++
+#include <thread>
+```
+
+2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
+
+```c++
+void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
+```
+
+3. Initialize device contexts for different GPUs
+
+```c++
+device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+ctx0.device_id=0;
+device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+ctx1.device_id=1;
+``` 
+
+4. Finally, spawn the threads and wait for their completion
+
+```c++
+std::thread thread0(threadPoseidon, ctx0, ...);
+std::thread thread1(threadPoseidon, ctx1, ...);
+thread0.join();
+thread1.join();
+```
+
+## What's in the example
+
+This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one per column of an $11 \times 2^{30}$ matrix.
+
+1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
+2. Hash two partitions in parallel on two GPUs
+3. Hash two partitions in series on one GPU
+4. Compare execution times
+
--- a/examples/c++/multi-gpu-poseidon/compile.sh
+++ b/examples/c++/multi-gpu-poseidon/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -0,0 +1,148 @@
+#include <iostream>
+#include <thread>
+#include <chrono>
+
+#include <nvml.h>
+
+// select the curve
+#define CURVE_ID 2
+#include "appUtils/poseidon/poseidon.cu"
+#include "utils/error_handler.cuh"
+
+using namespace poseidon;
+using namespace curve_config;
+
+// Report a failed CUDA call on stderr; a successful status is silent.
+void checkCudaError(cudaError_t error) {
+    if (error == cudaSuccess) return;
+    // Reporting only: execution continues, so callers that must stop have to do so themselves.
+    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+}
+
+// these global constants go into template calls
+const int size_col = 11;
+
+// Thread entry point: selects GPU ctx.device_id for the calling thread, then runs poseidon_hash on `layers`, writing results to `column_hashes`
+void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        return; 
+    }
+    // CHK_IF_RETURN() is not usable in this thread function (presumably because it returns void - confirm against error_handler.cuh)
+    PoseidonConfig column_config = {
+        ctx,   // ctx
+        false, // are_inputs_on_device
+        false, // are_outputs_on_device
+        false, // input_is_a_state
+        false, // aligned
+        false, // loop_state
+        false, // is_async
+        };
+    cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
+    checkCudaError(err);
+}
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// Exit with a message if `ptr` is nullptr; do/while(0) makes the macro one statement, safe in unbraced if/else.
+#define CHECK_ALLOC(ptr) do { if ((ptr) == nullptr) { \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
+    exit(EXIT_FAILURE); \
+} } while (0)
+
+int main() { // times Poseidon hashing of two partitions: on two GPUs in parallel, then on one GPU in series
+    const unsigned size_row = (1<<30);
+    const unsigned nof_partitions = 64;
+    const unsigned size_partition = size_row / nof_partitions;
+    // layers is allocated only for one partition, need to reuse for different partitions
+    const uint32_t size_layers = size_col * size_partition;
+    // NVML is used only to report the installed GPUs and their memory
+    nvmlInit();
+    unsigned int deviceCount = 0; // stays 0 (no devices listed) if the NVML query fails
+    nvmlDeviceGetCount(&deviceCount);
+    std::cout << "Available GPUs: " << deviceCount << std::endl;
+
+    for (unsigned int i = 0; i < deviceCount; ++i) {
+        nvmlDevice_t device;
+        nvmlMemory_t memory;
+        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+        nvmlDeviceGetHandleByIndex(i, &device);
+        nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
+        nvmlDeviceGetMemoryInfo(device, &memory);
+        std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/"  << memory.free/1024/1024 << std::endl;
+    }
+
+    const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
+    std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
+
+    //===============================================================================
+    // Key: multiple devices are supported by device context
+    //===============================================================================
+
+    device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+    ctx0.device_id=0;
+    device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+    ctx1.device_id=1;
+    
+    std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
+    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers0);
+    scalar_t s = scalar_t::zero();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers0[i] = s;
+        s = s + scalar_t::one();
+    }
+    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers1);
+    s = scalar_t::zero() + scalar_t::one();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers1[i] = s;
+        s = s + scalar_t::one();
+    }
+
+    scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash0);
+    scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash1);
+
+    PoseidonConstants<scalar_t> column_constants0, column_constants1;
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        return 1; // non-zero exit status: the second GPU could not be selected
+    }
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
+
+    std::cout << "Parallel execution of Poseidon threads" << std::endl;
+    START_TIMER(parallel);
+    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
+
+    // Wait for the threads to finish
+    thread0.join();
+    thread1.join();
+    END_TIMER(parallel,"2 GPUs");
+    std::cout << "Output Data from Thread 0: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 1: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    std::cout << "Sequential execution of Poseidon threads" << std::endl;
+    START_TIMER(sequential);
+    std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    thread2.join();
+    std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+    thread3.join();
+    END_TIMER(sequential,"1 GPU");
+    std::cout << "Output Data from Thread 2: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 3: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    nvmlShutdown();
+    return 0;
+}
--- a/examples/c++/multi-gpu-poseidon/run.sh
+++ b/examples/c++/multi-gpu-poseidon/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -10,15 +10,15 @@

 using namespace curve_config;

-// select scalar or point field
-//typedef scalar_t T;
-typedef point_field_t T;
+typedef scalar_t T;

 int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
 {
-  const bool is_on_device = true;
-  const bool is_montgomery = false;
-  cudaError_t err =  vec_ops::Mul<T,T>(vec_a, vec_b, n_elments, is_on_device, is_montgomery, ctx, vec_result);
+  vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
+  config.is_a_on_device = true;
+  config.is_b_on_device = true;
+  config.is_result_on_device = true;
+  cudaError_t err =  vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
--- a/examples/c++/ntt/README.md
+++ b/examples/c++/ntt/README.md
@@ -16,10 +16,11 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s
 // Include NTT template
 #include "appUtils/ntt/ntt.cu"
 using namespace curve_config;
+using namespace ntt;
 // Configure NTT
-ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
+NTTConfig<S> config=DefaultNTTConfig<S>();
 // Call NTT
-ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
+NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
 ```

 ## Running the example
@@ -28,5 +29,10 @@ ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
 - compile with  `./compile.sh`
 - run with `./run.sh`

+## What's in the example

-
+1. Define the size of the example
+2. Initialize input
+3. Run Radix2 NTT
+4. Run MixedRadix NTT
+5. Validate the data output
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -7,6 +7,7 @@
 #include "appUtils/ntt/ntt.cu"
 #include "appUtils/ntt/kernel_ntt.cu"
 using namespace curve_config;
+using namespace ntt;

 // Operate on scalars
 typedef scalar_t S;
@@ -58,6 +59,11 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
  return nof_errors;
 }

+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+
 int main(int argc, char* argv[])
 {
  std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
@@ -78,24 +84,30 @@ int main(int argc, char* argv[])
  output = (E*)malloc(sizeof(E) * batch_size);

  std::cout << "Running NTT with on-host data" << std::endl;
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
  // Create a device context
  auto ctx = device_context::get_default_device_context();
-  // the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
-  S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}};
-  ntt::InitDomain(rou, ctx);
+  const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
+  InitDomain(basic_root, ctx);
  // Create an NTTConfig instance
-  ntt::NTTConfig<S> config = ntt::DefaultNTTConfig<S>();
+  NTTConfig<S> config = DefaultNTTConfig<S>();
+  config.ntt_algorithm = NttAlgorithm::MixedRadix; 
  config.batch_size = nof_ntts;
-  config.ctx.stream = stream;
-  auto begin0 = std::chrono::high_resolution_clock::now();
-  cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
-  auto end0 = std::chrono::high_resolution_clock::now();
-  auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
-  printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
+  START_TIMER(MixedRadix);
+  cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
+  END_TIMER(MixedRadix, "MixedRadix NTT");
+  
+  std::cout << "Validating output" << std::endl;
  validate_output(ntt_size, nof_ntts, output);
-  cudaStreamDestroy(stream);
+
+  config.ntt_algorithm = NttAlgorithm::Radix2; 
+  START_TIMER(Radix2);
+  err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
+  END_TIMER(Radix2, "Radix2 NTT");
+
+  std::cout << "Validating output" << std::endl;
+  validate_output(ntt_size, nof_ntts, output);
+
+  std::cout << "Cleaning-up memory" << std::endl;
  free(input);
  free(output);
  return 0;
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -56,7 +56,7 @@ int main(int argc, char** argv)
  CHK_IF_RETURN(cudaEventCreate(&stop));

  const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
-  ntt::InitDomain(basic_root, ntt_config.ctx);
+  ntt::InitDomain(basic_root, ntt_config.ctx, true /*=fast_twidddles_mode*/);

  // (1) cpu allocation
  auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
@@ -84,8 +84,16 @@ int main(int argc, char** argv)

      // (4) multiply A,B
      CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
+      vec_ops::VecOpsConfig<test_data> config {
+        ntt_config.ctx,
+        true,  // is_a_on_device
+        true,  // is_b_on_device
+        true,  // is_result_on_device
+        false, // is_montgomery
+        false  // is_async
+      };
      CHK_IF_RETURN(
-        vec_ops::Mul(GpuA, GpuB, NTT_SIZE, true /*=is_on_device*/, false /*=is_montgomery*/, ntt_config.ctx, MulGpu));
+        vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));

      // (5) INTT (in place)
      ntt_config.are_inputs_on_device = true;
--- a/examples/rust/poseidon/.devcontainer/Dockerfile
+++ b/examples/rust/poseidon/.devcontainer/Dockerfile
@@ -0,0 +1,27 @@
+# Use the specified base image
+#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    llvm \
+    clang \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Copy the content of the local directory to the working directory
+COPY . .
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
--- a/examples/rust/poseidon/.devcontainer/devcontainer.json
+++ b/examples/rust/poseidon/.devcontainer/devcontainer.json
@@ -0,0 +1,23 @@
+{
+    "name": "Icicle Examples: rust poseidon",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+                "ms-vscode.cmake-tools",
+                "ms-azuretools.vscode-docker",
+                "rust-lang.rust-analyzer",
+                "vadimcn.vscode-lldb"
+            ]
+		}
+	}
+}
--- a/examples/rust/poseidon/Cargo.toml
+++ b/examples/rust/poseidon/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "poseidon"
+version = "1.4.0"
+edition = "2018"
+
+[dependencies]
+icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
+icicle-core = { path = "../../../wrappers/rust/icicle-core" }
+icicle-bls12-381 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-381" }
+
+clap = { version = "<=4.4.12", features = ["derive"] }
+
+[features]
+profile = []
--- a/examples/rust/poseidon/src/main.rs
+++ b/examples/rust/poseidon/src/main.rs
@@ -0,0 +1,53 @@
+use icicle_bls12_381::curve::ScalarField as F;
+
+use icicle_cuda_runtime::device_context::DeviceContext;
+
+use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
+use icicle_core::traits::FieldImpl;
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+#[cfg(feature = "profile")]
+use std::time::Instant;
+
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// Size of Poseidon input to run (20 for 2^20)
+    #[arg(short, long, default_value_t = 20)]
+    size: u8,
+}
+
+fn main() { // hashes a batch of all-one BLS12-381 field elements with Poseidon via ICICLE
+    let args = Args::parse();
+    let size = args.size; // log2 of the number of hashes (see the Args::size doc comment)
+    let test_size = 1 << size; // number of hashes: 2^size
+
+    println!("Running Icicle Examples: Rust Poseidon Hash");
+    let arity = 2u32;
+    println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
+    let ctx = DeviceContext::default();
+    let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap(); // panics if loading fails
+    let config = PoseidonConfig::default();
+
+    println!("---------------------- Input size 2^{}={} ------------------------", size, test_size);
+    let inputs = vec![F::one(); test_size * arity as usize]; // test_size * arity elements, all one
+    let outputs = vec![F::zero(); test_size]; // test_size elements, zero-initialized
+    let mut input_slice = HostOrDeviceSlice::on_host(inputs);
+    let mut output_slice = HostOrDeviceSlice::on_host(outputs);
+
+    println!("Executing BLS12-381 Poseidon Hash on device...");
+    #[cfg(feature = "profile")]
+    let start = Instant::now(); // timing is compiled in only with the "profile" feature
+    poseidon_hash_many::<F>(
+        &mut input_slice,
+        &mut output_slice,
+        test_size as u32,
+        arity as u32,
+        &constants,
+        &config,
+    )
+    .unwrap(); // panics if the hash call returns an error
+    #[cfg(feature = "profile")]
+    println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
+}
--- a/go.mod
+++ b/go.mod
@@ -3,15 +3,19 @@ module github.com/ingonyama-zk/icicle
 go 1.20

 require (
-	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/kr/pretty v0.1.0 // indirect
-	github.com/pmezard/go-difflib v1.0.0 // indirect
-	gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
-	gopkg.in/yaml.v3 v3.0.1 // indirect
+	github.com/consensys/gnark-crypto v0.12.1
+	github.com/stretchr/testify v1.8.2
 )

 require (
-	github.com/consensys/bavard v0.1.13
-	github.com/stretchr/testify v1.8.3
+	github.com/bits-and-blooms/bitset v1.7.0 // indirect
+	github.com/consensys/bavard v0.1.13 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/kr/text v0.2.0 // indirect
+	github.com/mmcloughlin/addchain v0.4.0 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/rogpeppe/go-internal v1.12.0 // indirect
+	golang.org/x/sys v0.9.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 	rsc.io/tmplfunc v0.0.3 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,19 +1,37 @@
+github.com/bits-and-blooms/bitset v1.7.0 h1:YjAGVd3XmtK9ktAbX8Zg2g2PwLIMjGREZJHlV4j7NEo=
+github.com/bits-and-blooms/bitset v1.7.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
 github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ=
 github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI=
+github.com/consensys/gnark-crypto v0.12.1 h1:lHH39WuuFgVHONRl3J0LRBtuYdQTumFSDtJF7HpyG8M=
+github.com/consensys/gnark-crypto v0.12.1/go.mod h1:v2Gy7L/4ZRosZ7Ivs+9SfUDr0f5UlG+EM5t7MPHiLuY=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
-github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
+github.com/mmcloughlin/addchain v0.4.0 h1:SobOdjm2xLj1KkXN5/n0xTIWyZA2+s99UCY1iPfkHRY=
+github.com/mmcloughlin/addchain v0.4.0/go.mod h1:A86O+tHqZLMNO4w6ZZ4FlVQEadcoqkyU72HC5wJ4RlU=
+github.com/mmcloughlin/profile v0.1.1/go.mod h1:IhHD7q1ooxgwTgjxQYkACGA77oFTDdFVejUS1/tS/qU=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
-github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
+github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
+github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
+golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU=
--- a/goicicle/Makefile
+++ b/goicicle/Makefile
@@ -1,34 +0,0 @@
-CUDA_ROOT_DIR = /usr/local/cuda
-NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
-CFLAGS = -Xcompiler -fPIC -std=c++17
-LDFLAGS = -shared
-FEATURES = -DG2_DEFINED
-
-TARGET_BN254 = libbn254.so
-TARGET_BW6761 = libbw6761.so
-TARGET_BLS12_381 = libbls12_381.so
-TARGET_BLS12_377 = libbls12_377.so
-
-VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761
-
-SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu
-SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu
-SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu
-SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu
-
-all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
-
-$(TARGET_BN254): 
-	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@
-
-$(TARGET_BW6761): 
-	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761)) -o $@
-
-$(TARGET_BLS12_381):
-	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@
-
-$(TARGET_BLS12_377):
-	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@
-
-clean:
-	rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
--- a/goicicle/README.md
+++ b/goicicle/README.md
@@ -1,82 +0,0 @@
-# Golang Bindings
-
-To build the shared library:
-
-To build shared libraries for all supported curves.
-
-```
-make all
-```
-
-If you wish to build for a specific curve, for example bn254.
-
-```
-make libbn254.so
-```
-
-The current supported options are `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6_671.so`. The resulting `.so` files are the compiled shared libraries for each curve.
-
-Finally to allow your system to find the shared libraries
-
-```
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/<path_to_shared_libs>
-```
-
-## Running golang tests
-
-To run the tests for curve bn254.
-
-```
-go test ./goicicle/curves/bn254 -count=1
-```
-
-## Cleaning up
-
-If you want to remove the compiled files
-
-```
-make clean
-```
-
-This will remove all shared libraries generated from the `make` file.
-
-# How do Golang bindings work?
-
-The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
-
-1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6_671.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
-
-2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries:
-
-```go
-/*
-#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_671
-#include "icicle.h" // make sure you use the correct header file(s)
-*/
-import "C"
-
-func main() {
-    // Now you can call the C functions from the ICICLE libraries.
-    // Note that C function calls are prefixed with 'C.' in Go code.
-}
-```
-
-Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
-
-# Common issues
-
-### Cannot find shared library
-
-In some cases you may encounter the following error, despite exporting the correct `LD_LIBRARY_PATH`.
-
-```
-/usr/local/go/pkg/tool/linux_amd64/link: running gcc failed: exit status 1
-/usr/bin/ld: cannot find -lbn254: No such file or directory
-/usr/bin/ld: cannot find -lbn254: No such file or directory
-/usr/bin/ld: cannot find -lbn254: No such file or directory
-/usr/bin/ld: cannot find -lbn254: No such file or directory
-/usr/bin/ld: cannot find -lbn254: No such file or directory
-collect2: error: ld returned 1 exit status
-```
-
-This is normally fixed by exporting the path to the shared library location in the following way: `export CGO_LDFLAGS="-L/<path_to_shared_lib>/"`
--- a/goicicle/curves/bls12377/g1.go
+++ b/goicicle/curves/bls12377/g1.go
@@ -1,328 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"unsafe"
-
-	"encoding/binary"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-const SCALAR_SIZE = 8
-const BASE_SIZE = 12
-
-type G1ScalarField struct {
-	S [SCALAR_SIZE]uint32
-}
-
-type G1BaseField struct {
-	S [BASE_SIZE]uint32
-}
-
-/*
- * BaseField Constructors
- */
-
-func (f *G1BaseField) SetZero() *G1BaseField {
-	var S [BASE_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1BaseField) SetOne() *G1BaseField {
-	var S [BASE_SIZE]uint32
-
-	S[0] = 1
-
-	f.S = S
-	return f
-}
-
-func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
-	out := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_377_affine_t)(unsafe.Pointer(affine))
-
-	C.projective_from_affine_bls12_377(out, in)
-
-	return p
-}
-
-func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
-	copy(f.S[:], limbs[:])
-
-	return f
-}
-
-/*
- * BaseField methods
- */
-
-func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1BaseField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * ScalarField methods
- */
-
-func (p *G1ScalarField) Random() *G1ScalarField {
-	outC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(p))
-	C.random_scalar_bls12_377(outC)
-
-	return p
-}
-
-func (f *G1ScalarField) SetZero() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1ScalarField) SetOne() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	S[0] = 1
-	f.S = S
-
-	return f
-}
-
-func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
-	for i, v := range a.S {
-		if b.S[i] != v {
-			return false
-		}
-	}
-	return true
-}
-
-/*
- * ScalarField methods
- */
-
-func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1ScalarField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * PointBLS12_377
- */
-
-type G1ProjectivePoint struct {
-	X, Y, Z G1BaseField
-}
-
-func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
-	var yOne G1BaseField
-	yOne.SetOne()
-
-	var xZero G1BaseField
-	xZero.SetZero()
-
-	var zZero G1BaseField
-	zZero.SetZero()
-
-	f.X = xZero
-	f.Y = yOne
-	f.Z = zZero
-
-	return f
-}
-
-func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
-	// Cast *PointBLS12_377 to *C.BLS12_377_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It'S your responsibility to ensure that the types are compatible.
-	pC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BLS12_377_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it'S fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_bls12_377(pC, pCompareC))
-}
-
-func (p *G1ProjectivePoint) IsOnCurve() bool {
-	point := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
-	res := C.projective_is_on_curve_bls12_377(point)
-
-	return bool(res)
-}
-
-func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
-	outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
-	C.random_projective_bls12_377(outC)
-
-	return p
-}
-
-func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
-	return &G1PointAffine{
-		X: p.X,
-		Y: p.Y,
-	}
-}
-
-func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
-	var _x G1BaseField
-	var _y G1BaseField
-	var _z G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(x))
-	_y.FromLimbs(GetFixedLimbs(y))
-	_z.FromLimbs(GetFixedLimbs(z))
-
-	p.X = _x
-	p.Y = _y
-	p.Z = _z
-
-	return p
-}
-
-/*
- * PointAffineNoInfinityBLS12_377
- */
-
-type G1PointAffine struct {
-	X, Y G1BaseField
-}
-
-func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
-	in := (*C.BLS12_377_projective_t)(unsafe.Pointer(projective))
-	out := (*C.BLS12_377_affine_t)(unsafe.Pointer(p))
-
-	C.projective_to_affine_bls12_377(out, in)
-
-	return p
-}
-
-func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
-	var Z G1BaseField
-	Z.SetOne()
-
-	return &G1ProjectivePoint{
-		X: p.X,
-		Y: p.Y,
-		Z: Z,
-	}
-}
-
-func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
-	var _x G1BaseField
-	var _y G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(X))
-	_y.FromLimbs(GetFixedLimbs(Y))
-
-	p.X = _x
-	p.Y = _y
-
-	return p
-}
-
-/*
- * Multiplication
- */
-
-func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	pointsC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&a[0]))
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_point_bls12_377(pointsC, scalarsC, nElementsC, deviceIdC)
-}
-
-func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_scalar_bls12_377(aC, bC, nElementsC, deviceIdC)
-}
-
-// Multiply a matrix by a scalar:
-//
-//	`a` - flattenned matrix;
-//	`b` - vector to multiply `a` by;
-func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	c := make([]G1ScalarField, len(b))
-	for i := range c {
-		var p G1ScalarField
-		p.SetZero()
-
-		c[i] = p
-	}
-
-	aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
-	cC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&c[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.matrix_vec_mod_mult_bls12_377(aC, bC, cC, nElementsC, deviceIdC)
-}
-
-/*
- * Utils
- */
-
-func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
-	if len(*slice) <= BASE_SIZE {
-		limbs := [BASE_SIZE]uint32{}
-		copy(limbs[:len(*slice)], *slice)
-		return limbs
-	}
-
-	panic("slice has too many elements")
-}
--- a/goicicle/curves/bls12377/g1_test.go
+++ b/goicicle/curves/bls12377/g1_test.go
@@ -1,198 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"encoding/binary"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewFieldBLS12_377One(t *testing.T) {
-	var oneField G1BaseField
-	oneField.SetOne()
-
-	rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, oneField.S, rawOneField)
-}
-
-func TestNewFieldBLS12_377Zero(t *testing.T) {
-	var zeroField G1BaseField
-	zeroField.SetZero()
-
-	rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, zeroField.S, rawZeroField)
-}
-
-func TestFieldBLS12_377ToBytesLe(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
-	for i, v := range p.X.S {
-		binary.LittleEndian.PutUint32(expected[i*4:], v)
-	}
-
-	assert.Equal(t, p.X.ToBytesLe(), expected)
-	assert.Equal(t, len(p.X.ToBytesLe()), 32)
-}
-
-func TestNewPointBLS12_377Zero(t *testing.T) {
-	var pointZero G1ProjectivePoint
-	pointZero.SetZero()
-
-	var baseOne G1BaseField
-	baseOne.SetOne()
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, pointZero.X, zeroSanity)
-	assert.Equal(t, pointZero.Y, baseOne)
-	assert.Equal(t, pointZero.Z, zeroSanity)
-}
-
-func TestFromProjectiveToAffine(t *testing.T) {
-	var projective G1ProjectivePoint
-	var affine G1PointAffine
-
-	projective.Random()
-
-	affine.FromProjective(&projective)
-	var projective2 G1ProjectivePoint
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestBLS12_377Eq(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-	var p2 G1ProjectivePoint
-	p2.Random()
-
-	assert.Equal(t, p1.Eq(&p1), true)
-	assert.Equal(t, p1.Eq(&p2), false)
-}
-
-func TestBLS12_377StripZ(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-
-	p2ZLess := p1.StripZ()
-
-	assert.IsType(t, G1PointAffine{}, *p2ZLess)
-	assert.Equal(t, p1.X, p2ZLess.X)
-	assert.Equal(t, p1.Y, p2ZLess.Y)
-}
-
-func TestPointBLS12_377fromLimbs(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	x := p.X.Limbs()
-	y := p.Y.Limbs()
-	z := p.Z.Limbs()
-
-	xSlice := x[:]
-	ySlice := y[:]
-	zSlice := z[:]
-
-	var pFromLimbs G1ProjectivePoint
-	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
-
-	assert.Equal(t, pFromLimbs, p)
-}
-
-func TestNewPointAffineNoInfinityBLS12_377Zero(t *testing.T) {
-	var zeroP G1PointAffine
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, zeroP.X, zeroSanity)
-	assert.Equal(t, zeroP.Y, zeroSanity)
-}
-
-func TestPointAffineNoInfinityBLS12_377FromLimbs(t *testing.T) {
-	// Initialize your test values
-	x := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
-	y := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
-	xSlice := x[:]
-	ySlice := y[:]
-
-	// Execute your function
-	var result G1PointAffine
-	result.FromLimbs(&xSlice, &ySlice)
-
-	var xBase G1BaseField
-	var yBase G1BaseField
-	xBase.FromLimbs(x)
-	yBase.FromLimbs(y)
-
-	// Define your expected result
-	expected := G1PointAffine{
-		X: xBase,
-		Y: yBase,
-	}
-
-	// Test if result is as expected
-	assert.Equal(t, expected, result)
-}
-
-func TestGetFixedLimbs(t *testing.T) {
-	t.Run("case of valid input of length less than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of valid input of length 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of empty input", func(t *testing.T) {
-		slice := []uint32{}
-		expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of input length greater than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
-
-		defer func() {
-			if r := recover(); r == nil {
-				t.Errorf("the code did not panic")
-			}
-		}()
-
-		GetFixedLimbs(&slice)
-	})
-}
--- a/goicicle/curves/bls12377/g2.go
+++ b/goicicle/curves/bls12377/g2.go
@@ -1,102 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"encoding/binary"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-// G2 extension field
-
-type G2Element [6]uint64
-
-type ExtentionField struct {
-	A0, A1 G2Element
-}
-
-type G2PointAffine struct {
-	X, Y ExtentionField
-}
-
-type G2Point struct {
-	X, Y, Z ExtentionField
-}
-
-func (p *G2Point) Random() *G2Point {
-	outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
-	C.random_g2_projective_bls12_377(outC)
-
-	return p
-}
-
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_bls12_377(out, in)
-
-	return p
-}
-
-func (p *G2Point) Eq(pCompare *G2Point) bool {
-	// Cast *PointBLS12_377 to *C.BLS12_377_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It's your responsibility to ensure that the types are compatible.
-	pC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it's fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_g2_bls12_377(pC, pCompareC))
-}
-
-func (f *G2Element) ToBytesLe() []byte {
-	var bytes []byte
-	for _, val := range f {
-		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
-		binary.LittleEndian.PutUint64(buf, val)
-		bytes = append(bytes, buf...)
-	}
-	return bytes
-}
-
-func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
-	out := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective))
-
-	C.g2_projective_to_affine_bls12_377(out, in)
-
-	return p
-}
-
-func (p *G2Point) IsOnCurve() bool {
-	// Directly copy memory from the C struct to the Go struct
-	point := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
-	res := C.g2_projective_is_on_curve_bls12_377(point)
-
-	return bool(res)
-}
--- a/goicicle/curves/bls12377/g2_test.go
+++ b/goicicle/curves/bls12377/g2_test.go
@@ -1,79 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"fmt"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestG2Eqg2(t *testing.T) {
-	var point G2Point
-
-	point.Random()
-
-	assert.True(t, point.Eq(&point))
-}
-
-func TestG2FromProjectiveToAffine(t *testing.T) {
-	var projective G2Point
-	projective.Random()
-
-	var affine G2PointAffine
-	affine.FromProjective(&projective)
-
-	var projective2 G2Point
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestG2Eqg2NotEqual(t *testing.T) {
-	var point G2Point
-	point.Random()
-
-	var point2 G2Point
-	point2.Random()
-
-	assert.False(t, point.Eq(&point2))
-}
-
-func TestG2ToBytes(t *testing.T) {
-	element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
-	bytes := element.ToBytesLe()
-
-	assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1})
-}
-
-func TestG2ShouldConvertToProjective(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var pointProjective G2Point
-	pointProjective.Random()
-
-	var pointAffine G2PointAffine
-	pointAffine.FromProjective(&pointProjective)
-
-	var proj G2Point
-	proj.FromAffine(&pointAffine)
-
-	assert.True(t, proj.IsOnCurve())
-	assert.True(t, pointProjective.Eq(&proj))
-}
--- a/goicicle/curves/bls12377/include/msm.h
+++ b/goicicle/curves/bls12377/include/msm.h
@@ -1,98 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdbool.h>
-// msm.h
-
-#ifndef _BLS12_377_MSM_H
-#define _BLS12_377_MSM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BLS12_377 projective and affine structs
-typedef struct BLS12_377_projective_t BLS12_377_projective_t;
-typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
-typedef struct BLS12_377_affine_t BLS12_377_affine_t;
-typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
-typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
-typedef cudaStream_t CudaStream_t;
-
-int msm_cuda_bls12_377(
-  BLS12_377_projective_t* out, BLS12_377_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
-
-int msm_batch_cuda_bls12_377(
-  BLS12_377_projective_t* out,
-  BLS12_377_affine_t* points,
-  BLS12_377_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_scalar_t* d_scalars,
-  BLS12_377_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_scalar_t* d_scalars,
-  BLS12_377_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id);
-
-int msm_g2_cuda_bls12_377(
-  BLS12_377_g2_projective_t* out,
-  BLS12_377_g2_affine_t* points,
-  BLS12_377_scalar_t* scalars,
-  size_t count,
-  size_t device_id);
-int msm_batch_g2_cuda_bls12_377(
-  BLS12_377_g2_projective_t* out,
-  BLS12_377_g2_affine_t* points,
-  BLS12_377_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-int commit_g2_cuda_bls12_377(
-  BLS12_377_g2_projective_t* d_out,
-  BLS12_377_scalar_t* d_scalars,
-  BLS12_377_g2_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-int commit_batch_g2_cuda_bls12_377(
-  BLS12_377_g2_projective_t* d_out,
-  BLS12_377_scalar_t* d_scalars,
-  BLS12_377_g2_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id,
-  cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_377_MSM_H */
--- a/goicicle/curves/bls12377/include/ntt.h
+++ b/goicicle/curves/bls12377/include/ntt.h
@@ -1,195 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ntt.h
-
-#ifndef _BLS12_377_NTT_H
-#define _BLS12_377_NTT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BLS12_377 projective and affine structs
-typedef struct BLS12_377_projective_t BLS12_377_projective_t;
-typedef struct BLS12_377_affine_t BLS12_377_affine_t;
-typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
-
-typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
-typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
-
-int ntt_cuda_bls12_377(BLS12_377_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ntt_batch_cuda_bls12_377(
-  BLS12_377_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-int ecntt_cuda_bls12_377(BLS12_377_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bls12_377(
-  BLS12_377_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-BLS12_377_scalar_t*
-build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int interpolate_scalars_batch_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_batch_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_on_coset_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  BLS12_377_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_evaluations,
-  BLS12_377_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_377_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_batch_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_batch_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BLS12_377_scalar_t* coset_powers,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out,
-  BLS12_377_scalar_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_377_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BLS12_377_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_batch_cuda_bls12_377(
-  BLS12_377_projective_t* d_out,
-  BLS12_377_projective_t* d_coefficients,
-  BLS12_377_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_377_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int reverse_order_scalars_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bls12_377(
-  BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int reverse_order_points_cuda_bls12_377(BLS12_377_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bls12_377(
-  BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bls12_377(
-  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
-int to_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
-
-// points g1
-int to_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
-
-// points g2
-int to_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_377_NTT_H */
--- a/goicicle/curves/bls12377/include/projective.h
+++ b/goicicle/curves/bls12377/include/projective.h
@@ -1,50 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// projective.h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BLS12_377_projective_t BLS12_377_projective_t;
-typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
-typedef struct BLS12_377_affine_t BLS12_377_affine_t;
-typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
-typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
-
-bool projective_is_on_curve_bls12_377(BLS12_377_projective_t* point1);
-
-int random_scalar_bls12_377(BLS12_377_scalar_t* out);
-int random_projective_bls12_377(BLS12_377_projective_t* out);
-BLS12_377_projective_t* projective_zero_bls12_377();
-int projective_to_affine_bls12_377(BLS12_377_affine_t* out, BLS12_377_projective_t* point1);
-int projective_from_affine_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* point1);
-
-int random_g2_projective_bls12_377(BLS12_377_g2_projective_t* out);
-int g2_projective_to_affine_bls12_377(BLS12_377_g2_affine_t* out, BLS12_377_g2_projective_t* point1);
-int g2_projective_from_affine_bls12_377(BLS12_377_g2_projective_t* out, BLS12_377_g2_affine_t* point1);
-bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t* point1);
-
-bool eq_bls12_377(BLS12_377_projective_t* point1, BLS12_377_projective_t* point2);
-bool eq_g2_bls12_377(BLS12_377_g2_projective_t* point1, BLS12_377_g2_projective_t* point2);
-
-#ifdef __cplusplus
-}
-#endif
--- a/goicicle/curves/bls12377/include/ve_mod_mult.h
+++ b/goicicle/curves/bls12377/include/ve_mod_mult.h
@@ -1,49 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ve_mod_mult.h
-
-#ifndef _BLS12_377_VEC_MULT_H
-#define _BLS12_377_VEC_MULT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BLS12_377_projective_t BLS12_377_projective_t;
-typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
-
-int32_t vec_mod_mult_point_bls12_377(
-  BLS12_377_projective_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bls12_377(
-  BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bls12_377(
-  BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bls12_377(
-  BLS12_377_scalar_t* matrix_flattened,
-  BLS12_377_scalar_t* input,
-  BLS12_377_scalar_t* output,
-  size_t n_elments,
-  size_t device_id);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_377_VEC_MULT_H */
--- a/goicicle/curves/bls12377/msm.go
+++ b/goicicle/curves/bls12377/msm.go
@@ -1,209 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
-// #include "msm.h"
-import "C"
-
-func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(out))
-	ret := C.msm_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_cuda_bls12_377 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(out))
-
-	ret := C.msm_g2_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_g2_cuda_bls12_377 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G1ProjectivePoint, batchSize)
-
-	for i := 0; i < len(out); i++ {
-		var p G1ProjectivePoint
-		p.SetZero()
-
-		out[i] = p
-	}
-
-	outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G2Point, batchSize)
-
-	outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_g2_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BLS12_377_projective_t)(d_out)
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_377_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_g2_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BLS12_377_projective_t)(d_out)
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_377_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.commit_batch_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.msm_batch_g2_cuda_bls12_377(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bls12377/msm_test.go
+++ b/goicicle/curves/bls12377/msm_test.go
@@ -1,360 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"fmt"
-	"math"
-	"testing"
-	"time"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-	"github.com/stretchr/testify/assert"
-)
-
-func GeneratePoints(count int) []G1PointAffine {
-	// Declare a slice of integers
-	var points []G1PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		var pointProjective G1ProjectivePoint
-		pointProjective.Random()
-
-		var pointAffine G1PointAffine
-		pointAffine.FromProjective(&pointProjective)
-
-		points = append(points, pointAffine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func GeneratePointsProj(count int) []G1ProjectivePoint {
-	// Declare a slice of integers
-	var points []G1ProjectivePoint
-	// Use a loop to populate the slice
-	for i := 0; i < count; i++ {
-		var p G1ProjectivePoint
-		p.Random()
-
-		points = append(points, p)
-	}
-
-	return points
-}
-
-func GenerateScalars(count int, skewed bool) []G1ScalarField {
-	// Declare a slice of integers
-	var scalars []G1ScalarField
-
-	var rand G1ScalarField
-	var zero G1ScalarField
-	var one G1ScalarField
-	var randLarge G1ScalarField
-
-	zero.SetZero()
-	one.SetOne()
-	randLarge.Random()
-
-	if skewed && count > 1_200_000 {
-		for i := 0; i < count-1_200_000; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-
-		for i := 0; i < 600_000; i++ {
-			scalars = append(scalars, randLarge)
-		}
-		for i := 0; i < 400_000; i++ {
-			scalars = append(scalars, zero)
-		}
-		for i := 0; i < 200_000; i++ {
-			scalars = append(scalars, one)
-		}
-	} else {
-		for i := 0; i < count; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-	}
-
-	return scalars[:count]
-}
-
-func TestMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G1ProjectivePoint)
-		startTime := time.Now()
-		_, e := Msm(out, points, scalars, 0) // non mont
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		assert.Equal(t, e, nil, "error should be nil")
-
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func TestCommitMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1<<v - 1
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := count * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := Commit(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G1ProjectivePoint, 1)
-		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 96)
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.True(t, outHost[0].IsOnCurve())
-	}
-}
-
-func BenchmarkCommit(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := msmSize * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := msmSize * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
-
-				if e != 0 {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestBatchMSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GeneratePoints(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
-
-func BenchmarkMSM(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G1ProjectivePoint)
-				_, e := Msm(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-// G2
-func GenerateG2Points(count int) []G2PointAffine {
-	// Declare a slice of integers
-	var points []G2PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		fmt.Print() // this prevents the test from hanging. TODO: figure out why
-		var p G2Point
-		p.Random()
-		var affine G2PointAffine
-		affine.FromProjective(&p)
-
-		points = append(points, affine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func TestMsmG2BLS12_377(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G2Point)
-		_, e := MsmG2(out, points, scalars, 0)
-		assert.Equal(t, e, nil, "error should be nil")
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func BenchmarkMsmG2BLS12_377(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GenerateG2Points(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G2Point)
-				_, e := MsmG2(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestCommitG2MSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		var sizeCheckG2PointAffine G2PointAffine
-		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
-
-		var sizeCheckG2Point G2Point
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
-		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := CommitG2(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G2Point, 1)
-		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.Equal(t, len(outHost), 1)
-		result := outHost[0]
-
-		assert.True(t, result.IsOnCurve())
-	}
-}
-
-func TestBatchG2MSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GenerateG2Points(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
--- a/goicicle/curves/bls12377/ntt.go
+++ b/goicicle/curves/bls12377/ntt.go
@@ -1,222 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
-// #include "ntt.h"
-import "C"
-
-const (
-	NONE = 0
-	DIF  = 1
-	DIT  = 2
-)
-
-func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-
-	ret := C.ntt_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
-
-	return uint64(ret)
-}
-
-func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
-	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	isInverseC := C.bool(isInverse)
-	batchSizeC := C.uint32_t(batchSize)
-	deviceIdC := C.size_t(deviceId)
-
-	ret := C.ntt_batch_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
-	valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-
-	ret := C.ecntt_cuda_bls12_377(valuesC, n, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
-	valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-	batchSizeC := C.uint32_t(batchSize)
-
-	ret := C.ecntt_batch_cuda_bls12_377(valuesC, n, batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
-	domain_size := C.uint32_t(d_size)
-	logn := C.uint32_t(log_d_size)
-	is_inverse := C.bool(inverse)
-
-	dp := C.build_domain_cuda_bls12_377(domain_size, logn, is_inverse, 0, 0)
-
-	if dp == nil {
-		err = errors.New("nullptr returned from generating twiddles")
-		return unsafe.Pointer(nil), err
-	}
-
-	return unsafe.Pointer(dp), nil
-}
-
-// Reverses d_scalars in-place
-func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	lenC := C.int(len)
-	if success := C.reverse_order_scalars_cuda_bls12_377(scalarsC, lenC, 0, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
-	size_d := size * 32
-	dp, err := goicicle.CudaMalloc(size_d)
-
-	if err != nil {
-		return nil
-	}
-
-	d_out := (*C.BLS12_377_scalar_t)(dp)
-	scalarsC := (*C.BLS12_377_scalar_t)(scalars)
-	twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
-	cosetPowersC := (*C.BLS12_377_scalar_t)(cosetPowers)
-	sizeC := C.uint(size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.interpolate_scalars_on_coset_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
-	} else {
-		ret = C.interpolate_scalars_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
-	}
-	if ret != 0 {
-		fmt.Print("error interpolating")
-	}
-
-	return unsafe.Pointer(d_out)
-}
-
-func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
-	scalars_outC := (*C.BLS12_377_scalar_t)(scalars_out)
-	scalarsC := (*C.BLS12_377_scalar_t)(scalars)
-	twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
-	coset_powersC := (*C.BLS12_377_scalar_t)(coset_powers)
-	sizeC := C.uint(scalars_size)
-	twiddlesC_size := C.uint(twiddles_size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.evaluate_scalars_on_coset_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
-	} else {
-		ret = C.evaluate_scalars_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
-	}
-
-	if ret != 0 {
-		fmt.Print("error interpolating")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
-	in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.add_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error adding scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
-	in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.sub_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error subtracting scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.to_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.from_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BLS12_377_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_g2_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
--- a/goicicle/curves/bls12377/ntt_test.go
+++ b/goicicle/curves/bls12377/ntt_test.go
@@ -1,148 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-import (
-	"fmt"
-	"github.com/stretchr/testify/assert"
-	"reflect"
-	"testing"
-)
-
-func TestNttBLS12_377Batch(t *testing.T) {
-	count := 1 << 20
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	NttBatch(&nttResult, false, count, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBLS12_377CompareToGnarkDIF(t *testing.T) {
-	count := 1 << 2
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestINttBLS12_377CompareToGnarkDIT(t *testing.T) {
-	count := 1 << 3
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, true, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBLS12_377(t *testing.T) {
-	count := 1 << 3
-
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	inttResult := make([]G1ScalarField, len(nttResult))
-	copy(inttResult, nttResult)
-
-	assert.Equal(t, inttResult, nttResult)
-	Ntt(&inttResult, true, 0)
-	assert.Equal(t, inttResult, scalars)
-}
-
-func TestNttBatchBLS12_377(t *testing.T) {
-	count := 1 << 5
-	batches := 4
-
-	scalars := GenerateScalars(count*batches, false)
-
-	var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
-
-	for i := 0; i < batches; i++ {
-		start := i * count
-		end := (i + 1) * count
-		batch := make([]G1ScalarField, len(scalars[start:end]))
-		copy(batch, scalars[start:end])
-		scalarVecOfVec = append(scalarVecOfVec, batch)
-	}
-
-	nttBatchResult := make([]G1ScalarField, len(scalars))
-	copy(nttBatchResult, scalars)
-
-	NttBatch(&nttBatchResult, false, count, 0)
-
-	var nttResultVecOfVec [][]G1ScalarField
-
-	for i := 0; i < batches; i++ {
-		// Clone the slice
-		clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
-		copy(clone, scalarVecOfVec[i])
-
-		// Add it to the result vector of vectors
-		nttResultVecOfVec = append(nttResultVecOfVec, clone)
-
-		// Call the ntt_bls12_377 function
-		Ntt(&nttResultVecOfVec[i], false, 0)
-	}
-
-	assert.NotEqual(t, nttBatchResult, scalars)
-
-	// Check that the ntt of each vec of scalars is equal to the intt of the specific batch
-	for i := 0; i < batches; i++ {
-		if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
-			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
-		}
-	}
-}
-
-func BenchmarkNTT(b *testing.B) {
-	LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
-
-	for _, logNTTSize := range LOG_NTT_SIZES {
-		nttSize := 1 << logNTTSize
-		b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
-			scalars := GenerateScalars(nttSize, false)
-
-			nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-			copy(nttResult, scalars)
-			for n := 0; n < b.N; n++ {
-				Ntt(&nttResult, false, 0)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bls12377/utils.go
+++ b/goicicle/curves/bls12377/utils.go
@@ -1,38 +0,0 @@
-package bls12377
-
-import "encoding/binary"
-
-// Function to convert [8]uint32 to [4]uint64
-func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
-	var arr64 [4]uint64
-	for i := 0; i < len(arr32); i += 2 {
-		arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1])
-	}
-	return arr64
-}
-
-func ConvertUint64ArrToUint32Arr4(arr64 [4]uint64) [8]uint32 {
-	var arr32 [8]uint32
-	for i, v := range arr64 {
-		b := make([]byte, 8)
-		binary.LittleEndian.PutUint64(b, v)
-
-		arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
-		arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
-	}
-
-	return arr32
-}
-
-func ConvertUint64ArrToUint32Arr6(arr64 [6]uint64) [12]uint32 {
-	var arr32 [12]uint32
-	for i, v := range arr64 {
-		b := make([]byte, 8)
-		binary.LittleEndian.PutUint64(b, v)
-
-		arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
-		arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
-	}
-
-	return arr32
-}
--- a/goicicle/curves/bls12377/vec_mod.go
+++ b/goicicle/curves/bls12377/vec_mod.go
@@ -1,42 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12377
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
-// #include "ve_mod_mult.h"
-import "C"
-import (
-	"fmt"
-	"unsafe"
-)
-
-func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
-	scalarVec1C := (*C.BLS12_377_scalar_t)(scalarVec1)
-	scalarVec2C := (*C.BLS12_377_scalar_t)(scalarVec2)
-	sizeC := C.size_t(size)
-
-	ret := C.vec_mod_mult_device_scalar_bls12_377(scalarVec1C, scalarVec2C, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error multiplying scalar vectors")
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bls12381/g1.go
+++ b/goicicle/curves/bls12381/g1.go
@@ -1,328 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"unsafe"
-
-	"encoding/binary"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-const SCALAR_SIZE = 8
-const BASE_SIZE = 12
-
-type G1ScalarField struct {
-	S [SCALAR_SIZE]uint32
-}
-
-type G1BaseField struct {
-	S [BASE_SIZE]uint32
-}
-
-/*
- * BaseField Constructors
- */
-
-func (f *G1BaseField) SetZero() *G1BaseField {
-	var S [BASE_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1BaseField) SetOne() *G1BaseField {
-	var S [BASE_SIZE]uint32
-
-	S[0] = 1
-
-	f.S = S
-	return f
-}
-
-func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
-	out := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_381_affine_t)(unsafe.Pointer(affine))
-
-	C.projective_from_affine_bls12_381(out, in)
-
-	return p
-}
-
-func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
-	copy(f.S[:], limbs[:])
-
-	return f
-}
-
-/*
- * BaseField methods
- */
-
-func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1BaseField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * ScalarField methods
- */
-
-func (p *G1ScalarField) Random() *G1ScalarField {
-	outC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(p))
-	C.random_scalar_bls12_381(outC)
-
-	return p
-}
-
-func (f *G1ScalarField) SetZero() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1ScalarField) SetOne() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	S[0] = 1
-	f.S = S
-
-	return f
-}
-
-func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
-	for i, v := range a.S {
-		if b.S[i] != v {
-			return false
-		}
-	}
-	return true
-}
-
-/*
- * ScalarField methods
- */
-
-func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1ScalarField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * PointBLS12_381
- */
-
-type G1ProjectivePoint struct {
-	X, Y, Z G1BaseField
-}
-
-func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
-	var yOne G1BaseField
-	yOne.SetOne()
-
-	var xZero G1BaseField
-	xZero.SetZero()
-
-	var zZero G1BaseField
-	zZero.SetZero()
-
-	f.X = xZero
-	f.Y = yOne
-	f.Z = zZero
-
-	return f
-}
-
-func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
-	// Cast *PointBLS12_381 to *C.BLS12_381_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It'S your responsibility to ensure that the types are compatible.
-	pC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BLS12_381_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it'S fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_bls12_381(pC, pCompareC))
-}
-
-func (p *G1ProjectivePoint) IsOnCurve() bool {
-	point := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
-	res := C.projective_is_on_curve_bls12_381(point)
-
-	return bool(res)
-}
-
-func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
-	outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
-	C.random_projective_bls12_381(outC)
-
-	return p
-}
-
-func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
-	return &G1PointAffine{
-		X: p.X,
-		Y: p.Y,
-	}
-}
-
-func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
-	var _x G1BaseField
-	var _y G1BaseField
-	var _z G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(x))
-	_y.FromLimbs(GetFixedLimbs(y))
-	_z.FromLimbs(GetFixedLimbs(z))
-
-	p.X = _x
-	p.Y = _y
-	p.Z = _z
-
-	return p
-}
-
-/*
- * PointAffineNoInfinityBLS12_381
- */
-
-type G1PointAffine struct {
-	X, Y G1BaseField
-}
-
-func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
-	in := (*C.BLS12_381_projective_t)(unsafe.Pointer(projective))
-	out := (*C.BLS12_381_affine_t)(unsafe.Pointer(p))
-
-	C.projective_to_affine_bls12_381(out, in)
-
-	return p
-}
-
-func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
-	var Z G1BaseField
-	Z.SetOne()
-
-	return &G1ProjectivePoint{
-		X: p.X,
-		Y: p.Y,
-		Z: Z,
-	}
-}
-
-func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
-	var _x G1BaseField
-	var _y G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(X))
-	_y.FromLimbs(GetFixedLimbs(Y))
-
-	p.X = _x
-	p.Y = _y
-
-	return p
-}
-
-/*
- * Multiplication
- */
-
-func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	pointsC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&a[0]))
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_point_bls12_381(pointsC, scalarsC, nElementsC, deviceIdC)
-}
-
-func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_scalar_bls12_381(aC, bC, nElementsC, deviceIdC)
-}
-
-// Multiply a matrix by a scalar:
-//
-//	`a` - flattenned matrix;
-//	`b` - vector to multiply `a` by;
-func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	c := make([]G1ScalarField, len(b))
-	for i := range c {
-		var p G1ScalarField
-		p.SetZero()
-
-		c[i] = p
-	}
-
-	aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
-	cC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&c[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.matrix_vec_mod_mult_bls12_381(aC, bC, cC, nElementsC, deviceIdC)
-}
-
-/*
- * Utils
- */
-
-func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
-	if len(*slice) <= BASE_SIZE {
-		limbs := [BASE_SIZE]uint32{}
-		copy(limbs[:len(*slice)], *slice)
-		return limbs
-	}
-
-	panic("slice has too many elements")
-}
--- a/goicicle/curves/bls12381/g1_test.go
+++ b/goicicle/curves/bls12381/g1_test.go
@@ -1,198 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"encoding/binary"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewFieldBLS12_381One(t *testing.T) {
-	var oneField G1BaseField
-	oneField.SetOne()
-
-	rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, oneField.S, rawOneField)
-}
-
-func TestNewFieldBLS12_381Zero(t *testing.T) {
-	var zeroField G1BaseField
-	zeroField.SetZero()
-
-	rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, zeroField.S, rawZeroField)
-}
-
-func TestFieldBLS12_381ToBytesLe(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
-	for i, v := range p.X.S {
-		binary.LittleEndian.PutUint32(expected[i*4:], v)
-	}
-
-	assert.Equal(t, p.X.ToBytesLe(), expected)
-	assert.Equal(t, len(p.X.ToBytesLe()), 32)
-}
-
-func TestNewPointBLS12_381Zero(t *testing.T) {
-	var pointZero G1ProjectivePoint
-	pointZero.SetZero()
-
-	var baseOne G1BaseField
-	baseOne.SetOne()
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, pointZero.X, zeroSanity)
-	assert.Equal(t, pointZero.Y, baseOne)
-	assert.Equal(t, pointZero.Z, zeroSanity)
-}
-
-func TestFromProjectiveToAffine(t *testing.T) {
-	var projective G1ProjectivePoint
-	var affine G1PointAffine
-
-	projective.Random()
-
-	affine.FromProjective(&projective)
-	var projective2 G1ProjectivePoint
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestBLS12_381Eq(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-	var p2 G1ProjectivePoint
-	p2.Random()
-
-	assert.Equal(t, p1.Eq(&p1), true)
-	assert.Equal(t, p1.Eq(&p2), false)
-}
-
-func TestBLS12_381StripZ(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-
-	p2ZLess := p1.StripZ()
-
-	assert.IsType(t, G1PointAffine{}, *p2ZLess)
-	assert.Equal(t, p1.X, p2ZLess.X)
-	assert.Equal(t, p1.Y, p2ZLess.Y)
-}
-
-func TestPointBLS12_381fromLimbs(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	x := p.X.Limbs()
-	y := p.Y.Limbs()
-	z := p.Z.Limbs()
-
-	xSlice := x[:]
-	ySlice := y[:]
-	zSlice := z[:]
-
-	var pFromLimbs G1ProjectivePoint
-	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
-
-	assert.Equal(t, pFromLimbs, p)
-}
-
-func TestNewPointAffineNoInfinityBLS12_381Zero(t *testing.T) {
-	var zeroP G1PointAffine
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, zeroP.X, zeroSanity)
-	assert.Equal(t, zeroP.Y, zeroSanity)
-}
-
-func TestPointAffineNoInfinityBLS12_381FromLimbs(t *testing.T) {
-	// Initialize your test values
-	x := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
-	y := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
-	xSlice := x[:]
-	ySlice := y[:]
-
-	// Execute your function
-	var result G1PointAffine
-	result.FromLimbs(&xSlice, &ySlice)
-
-	var xBase G1BaseField
-	var yBase G1BaseField
-	xBase.FromLimbs(x)
-	yBase.FromLimbs(y)
-
-	// Define your expected result
-	expected := G1PointAffine{
-		X: xBase,
-		Y: yBase,
-	}
-
-	// Test if result is as expected
-	assert.Equal(t, expected, result)
-}
-
-func TestGetFixedLimbs(t *testing.T) {
-	t.Run("case of valid input of length less than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of valid input of length 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of empty input", func(t *testing.T) {
-		slice := []uint32{}
-		expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of input length greater than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
-
-		defer func() {
-			if r := recover(); r == nil {
-				t.Errorf("the code did not panic")
-			}
-		}()
-
-		GetFixedLimbs(&slice)
-	})
-}
--- a/goicicle/curves/bls12381/g2.go
+++ b/goicicle/curves/bls12381/g2.go
@@ -1,102 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"encoding/binary"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-// G2 extension field
-
-type G2Element [6]uint64
-
-type ExtentionField struct {
-	A0, A1 G2Element
-}
-
-type G2PointAffine struct {
-	X, Y ExtentionField
-}
-
-type G2Point struct {
-	X, Y, Z ExtentionField
-}
-
-func (p *G2Point) Random() *G2Point {
-	outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
-	C.random_g2_projective_bls12_381(outC)
-
-	return p
-}
-
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_bls12_381(out, in)
-
-	return p
-}
-
-func (p *G2Point) Eq(pCompare *G2Point) bool {
-	// Cast *PointBLS12_381 to *C.BLS12_381_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It's your responsibility to ensure that the types are compatible.
-	pC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it's fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_g2_bls12_381(pC, pCompareC))
-}
-
-func (f *G2Element) ToBytesLe() []byte {
-	var bytes []byte
-	for _, val := range f {
-		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
-		binary.LittleEndian.PutUint64(buf, val)
-		bytes = append(bytes, buf...)
-	}
-	return bytes
-}
-
-func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
-	out := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(p))
-	in := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(projective))
-
-	C.g2_projective_to_affine_bls12_381(out, in)
-
-	return p
-}
-
-func (p *G2Point) IsOnCurve() bool {
-	// Directly copy memory from the C struct to the Go struct
-	point := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
-	res := C.g2_projective_is_on_curve_bls12_381(point)
-
-	return bool(res)
-}
--- a/goicicle/curves/bls12381/g2_test.go
+++ b/goicicle/curves/bls12381/g2_test.go
@@ -1,79 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"fmt"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestG2Eqg2(t *testing.T) {
-	var point G2Point
-
-	point.Random()
-
-	assert.True(t, point.Eq(&point))
-}
-
-func TestG2FromProjectiveToAffine(t *testing.T) {
-	var projective G2Point
-	projective.Random()
-
-	var affine G2PointAffine
-	affine.FromProjective(&projective)
-
-	var projective2 G2Point
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestG2Eqg2NotEqual(t *testing.T) {
-	var point G2Point
-	point.Random()
-
-	var point2 G2Point
-	point2.Random()
-
-	assert.False(t, point.Eq(&point2))
-}
-
-func TestG2ToBytes(t *testing.T) {
-	element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
-	bytes := element.ToBytesLe()
-
-	assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1})
-}
-
-func TestG2ShouldConvertToProjective(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var pointProjective G2Point
-	pointProjective.Random()
-
-	var pointAffine G2PointAffine
-	pointAffine.FromProjective(&pointProjective)
-
-	var proj G2Point
-	proj.FromAffine(&pointAffine)
-
-	assert.True(t, proj.IsOnCurve())
-	assert.True(t, pointProjective.Eq(&proj))
-}
--- a/goicicle/curves/bls12381/include/msm.h
+++ b/goicicle/curves/bls12381/include/msm.h
@@ -1,98 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdbool.h>
-// msm.h
-
-#ifndef _BLS12_381_MSM_H
-#define _BLS12_381_MSM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BLS12_381 projective and affine structs
-typedef struct BLS12_381_projective_t BLS12_381_projective_t;
-typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
-typedef struct BLS12_381_affine_t BLS12_381_affine_t;
-typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
-typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
-typedef cudaStream_t CudaStream_t;
-
-int msm_cuda_bls12_381(
-  BLS12_381_projective_t* out, BLS12_381_affine_t* points, BLS12_381_scalar_t* scalars, size_t count, size_t device_id);
-
-int msm_batch_cuda_bls12_381(
-  BLS12_381_projective_t* out,
-  BLS12_381_affine_t* points,
-  BLS12_381_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_scalar_t* d_scalars,
-  BLS12_381_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_scalar_t* d_scalars,
-  BLS12_381_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id);
-
-int msm_g2_cuda_bls12_381(
-  BLS12_381_g2_projective_t* out,
-  BLS12_381_g2_affine_t* points,
-  BLS12_381_scalar_t* scalars,
-  size_t count,
-  size_t device_id);
-int msm_batch_g2_cuda_bls12_381(
-  BLS12_381_g2_projective_t* out,
-  BLS12_381_g2_affine_t* points,
-  BLS12_381_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-int commit_g2_cuda_bls12_381(
-  BLS12_381_g2_projective_t* d_out,
-  BLS12_381_scalar_t* d_scalars,
-  BLS12_381_g2_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-int commit_batch_g2_cuda_bls12_381(
-  BLS12_381_g2_projective_t* d_out,
-  BLS12_381_scalar_t* d_scalars,
-  BLS12_381_g2_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id,
-  cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_381_MSM_H */
--- a/goicicle/curves/bls12381/include/ntt.h
+++ b/goicicle/curves/bls12381/include/ntt.h
@@ -1,195 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ntt.h
-
-#ifndef _BLS12_381_NTT_H
-#define _BLS12_381_NTT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BLS12_381 projective and affine structs
-typedef struct BLS12_381_projective_t BLS12_381_projective_t;
-typedef struct BLS12_381_affine_t BLS12_381_affine_t;
-typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
-
-typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
-typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
-
-int ntt_cuda_bls12_381(BLS12_381_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ntt_batch_cuda_bls12_381(
-  BLS12_381_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-int ecntt_cuda_bls12_381(BLS12_381_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bls12_381(
-  BLS12_381_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-BLS12_381_scalar_t*
-build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int interpolate_scalars_batch_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_batch_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_on_coset_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  BLS12_381_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_evaluations,
-  BLS12_381_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_381_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_batch_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_batch_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BLS12_381_scalar_t* coset_powers,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out,
-  BLS12_381_scalar_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_381_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BLS12_381_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_batch_cuda_bls12_381(
-  BLS12_381_projective_t* d_out,
-  BLS12_381_projective_t* d_coefficients,
-  BLS12_381_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BLS12_381_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int reverse_order_scalars_cuda_bls12_381(BLS12_381_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bls12_381(
-  BLS12_381_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int reverse_order_points_cuda_bls12_381(BLS12_381_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bls12_381(
-  BLS12_381_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bls12_381(
-  BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
-int to_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);
-
-// points g1
-int to_montgomery_proj_points_cuda_bls12_381(BLS12_381_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_cuda_bls12_381(BLS12_381_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_cuda_bls12_381(BLS12_381_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_cuda_bls12_381(BLS12_381_affine_t* d_inout, unsigned n, size_t stream);
-
-// points g2
-int to_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381_g2_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381_g2_affine_t* d_inout, unsigned n, size_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_381_NTT_H */
--- a/goicicle/curves/bls12381/include/projective.h
+++ b/goicicle/curves/bls12381/include/projective.h
@@ -1,50 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// projective.h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BLS12_381_projective_t BLS12_381_projective_t;
-typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
-typedef struct BLS12_381_affine_t BLS12_381_affine_t;
-typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
-typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
-
-bool projective_is_on_curve_bls12_381(BLS12_381_projective_t* point1);
-
-int random_scalar_bls12_381(BLS12_381_scalar_t* out);
-int random_projective_bls12_381(BLS12_381_projective_t* out);
-BLS12_381_projective_t* projective_zero_bls12_381();
-int projective_to_affine_bls12_381(BLS12_381_affine_t* out, BLS12_381_projective_t* point1);
-int projective_from_affine_bls12_381(BLS12_381_projective_t* out, BLS12_381_affine_t* point1);
-
-int random_g2_projective_bls12_381(BLS12_381_g2_projective_t* out);
-int g2_projective_to_affine_bls12_381(BLS12_381_g2_affine_t* out, BLS12_381_g2_projective_t* point1);
-int g2_projective_from_affine_bls12_381(BLS12_381_g2_projective_t* out, BLS12_381_g2_affine_t* point1);
-bool g2_projective_is_on_curve_bls12_381(BLS12_381_g2_projective_t* point1);
-
-bool eq_bls12_381(BLS12_381_projective_t* point1, BLS12_381_projective_t* point2);
-bool eq_g2_bls12_381(BLS12_381_g2_projective_t* point1, BLS12_381_g2_projective_t* point2);
-
-#ifdef __cplusplus
-}
-#endif
--- a/goicicle/curves/bls12381/include/ve_mod_mult.h
+++ b/goicicle/curves/bls12381/include/ve_mod_mult.h
@@ -1,49 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ve_mod_mult.h
-
-#ifndef _BLS12_381_VEC_MULT_H
-#define _BLS12_381_VEC_MULT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BLS12_381_projective_t BLS12_381_projective_t;
-typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
-
-int32_t vec_mod_mult_point_bls12_381(
-  BLS12_381_projective_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bls12_381(
-  BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bls12_381(
-  BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bls12_381(
-  BLS12_381_scalar_t* matrix_flattened,
-  BLS12_381_scalar_t* input,
-  BLS12_381_scalar_t* output,
-  size_t n_elments,
-  size_t device_id);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BLS12_381_VEC_MULT_H */
--- a/goicicle/curves/bls12381/msm.go
+++ b/goicicle/curves/bls12381/msm.go
@@ -1,209 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
-// #include "msm.h"
-import "C"
-
-func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BLS12_381_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(out))
-	ret := C.msm_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_cuda_bls12_381 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(out))
-
-	ret := C.msm_g2_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_g2_cuda_bls12_381 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G1ProjectivePoint, batchSize)
-
-	for i := 0; i < len(out); i++ {
-		var p G1ProjectivePoint
-		p.SetZero()
-
-		out[i] = p
-	}
-
-	outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BLS12_381_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G2Point, batchSize)
-
-	outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_g2_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BLS12_381_projective_t)(d_out)
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_381_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BLS12_381_g2_projective_t)(d_out)
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_g2_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BLS12_381_projective_t)(d_out)
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_381_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.commit_batch_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BLS12_381_g2_projective_t)(d_out)
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.msm_batch_g2_cuda_bls12_381(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bls12381/msm_test.go
+++ b/goicicle/curves/bls12381/msm_test.go
@@ -1,360 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"fmt"
-	"math"
-	"testing"
-	"time"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-	"github.com/stretchr/testify/assert"
-)
-
-func GeneratePoints(count int) []G1PointAffine {
-	// Declare a slice of integers
-	var points []G1PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		var pointProjective G1ProjectivePoint
-		pointProjective.Random()
-
-		var pointAffine G1PointAffine
-		pointAffine.FromProjective(&pointProjective)
-
-		points = append(points, pointAffine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func GeneratePointsProj(count int) []G1ProjectivePoint {
-	// Declare a slice of integers
-	var points []G1ProjectivePoint
-	// Use a loop to populate the slice
-	for i := 0; i < count; i++ {
-		var p G1ProjectivePoint
-		p.Random()
-
-		points = append(points, p)
-	}
-
-	return points
-}
-
-func GenerateScalars(count int, skewed bool) []G1ScalarField {
-	// Declare a slice of integers
-	var scalars []G1ScalarField
-
-	var rand G1ScalarField
-	var zero G1ScalarField
-	var one G1ScalarField
-	var randLarge G1ScalarField
-
-	zero.SetZero()
-	one.SetOne()
-	randLarge.Random()
-
-	if skewed && count > 1_200_000 {
-		for i := 0; i < count-1_200_000; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-
-		for i := 0; i < 600_000; i++ {
-			scalars = append(scalars, randLarge)
-		}
-		for i := 0; i < 400_000; i++ {
-			scalars = append(scalars, zero)
-		}
-		for i := 0; i < 200_000; i++ {
-			scalars = append(scalars, one)
-		}
-	} else {
-		for i := 0; i < count; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-	}
-
-	return scalars[:count]
-}
-
-func TestMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G1ProjectivePoint)
-		startTime := time.Now()
-		_, e := Msm(out, points, scalars, 0) // non mont
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		assert.Equal(t, e, nil, "error should be nil")
-
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func TestCommitMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1<<v - 1
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := count * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := Commit(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G1ProjectivePoint, 1)
-		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 96)
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.True(t, outHost[0].IsOnCurve())
-	}
-}
-
-func BenchmarkCommit(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := msmSize * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := msmSize * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
-
-				if e != 0 {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestBatchMSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GeneratePoints(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBLS12_381 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
-
-func BenchmarkMSM(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G1ProjectivePoint)
-				_, e := Msm(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-// G2
-func GenerateG2Points(count int) []G2PointAffine {
-	// Declare a slice of integers
-	var points []G2PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		fmt.Print() // this prevents the test from hanging. TODO: figure out why
-		var p G2Point
-		p.Random()
-		var affine G2PointAffine
-		affine.FromProjective(&p)
-
-		points = append(points, affine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func TestMsmG2BLS12_381(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G2Point)
-		_, e := MsmG2(out, points, scalars, 0)
-		assert.Equal(t, e, nil, "error should be nil")
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func BenchmarkMsmG2BLS12_381(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GenerateG2Points(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G2Point)
-				_, e := MsmG2(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestCommitG2MSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		var sizeCheckG2PointAffine G2PointAffine
-		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
-
-		var sizeCheckG2Point G2Point
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
-		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := CommitG2(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G2Point, 1)
-		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.Equal(t, len(outHost), 1)
-		result := outHost[0]
-
-		assert.True(t, result.IsOnCurve())
-	}
-}
-
-func TestBatchG2MSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GenerateG2Points(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBLS12_381 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
--- a/goicicle/curves/bls12381/ntt.go
+++ b/goicicle/curves/bls12381/ntt.go
@@ -1,222 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
-// #include "ntt.h"
-import "C"
-
-const (
-	NONE = 0
-	DIF  = 1
-	DIT  = 2
-)
-
-func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-
-	ret := C.ntt_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
-
-	return uint64(ret)
-}
-
-func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
-	scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	isInverseC := C.bool(isInverse)
-	batchSizeC := C.uint32_t(batchSize)
-	deviceIdC := C.size_t(deviceId)
-
-	ret := C.ntt_batch_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
-	valuesC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-
-	ret := C.ecntt_cuda_bls12_381(valuesC, n, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
-	valuesC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-	batchSizeC := C.uint32_t(batchSize)
-
-	ret := C.ecntt_batch_cuda_bls12_381(valuesC, n, batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
-	domain_size := C.uint32_t(d_size)
-	logn := C.uint32_t(log_d_size)
-	is_inverse := C.bool(inverse)
-
-	dp := C.build_domain_cuda_bls12_381(domain_size, logn, is_inverse, 0, 0)
-
-	if dp == nil {
-		err = errors.New("nullptr returned from generating twiddles")
-		return unsafe.Pointer(nil), err
-	}
-
-	return unsafe.Pointer(dp), nil
-}
-
-// Reverses d_scalars in-place
-func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	lenC := C.int(len)
-	if success := C.reverse_order_scalars_cuda_bls12_381(scalarsC, lenC, 0, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
-	size_d := size * 32
-	dp, err := goicicle.CudaMalloc(size_d)
-
-	if err != nil {
-		return nil
-	}
-
-	d_out := (*C.BLS12_381_scalar_t)(dp)
-	scalarsC := (*C.BLS12_381_scalar_t)(scalars)
-	twiddlesC := (*C.BLS12_381_scalar_t)(twiddles)
-	cosetPowersC := (*C.BLS12_381_scalar_t)(cosetPowers)
-	sizeC := C.uint(size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.interpolate_scalars_on_coset_cuda_bls12_381(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
-	} else {
-		ret = C.interpolate_scalars_cuda_bls12_381(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
-	}
-	if ret != 0 {
-		fmt.Print("error interpolating")
-	}
-
-	return unsafe.Pointer(d_out)
-}
-
-func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
-	scalars_outC := (*C.BLS12_381_scalar_t)(scalars_out)
-	scalarsC := (*C.BLS12_381_scalar_t)(scalars)
-	twiddlesC := (*C.BLS12_381_scalar_t)(twiddles)
-	coset_powersC := (*C.BLS12_381_scalar_t)(coset_powers)
-	sizeC := C.uint(scalars_size)
-	twiddlesC_size := C.uint(twiddles_size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.evaluate_scalars_on_coset_cuda_bls12_381(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
-	} else {
-		ret = C.evaluate_scalars_cuda_bls12_381(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
-	}
-
-	if ret != 0 {
-		fmt.Print("error interpolating")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BLS12_381_scalar_t)(in1_d)
-	in2_dC := (*C.BLS12_381_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.add_scalars_cuda_bls12_381(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error adding scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BLS12_381_scalar_t)(in1_d)
-	in2_dC := (*C.BLS12_381_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.sub_scalars_cuda_bls12_381(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error subtracting scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.to_montgomery_scalars_cuda_bls12_381(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.from_montgomery_scalars_cuda_bls12_381(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BLS12_381_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_cuda_bls12_381(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_g2_cuda_bls12_381(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
--- a/goicicle/curves/bls12381/ntt_test.go
+++ b/goicicle/curves/bls12381/ntt_test.go
@@ -1,148 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-import (
-	"fmt"
-	"github.com/stretchr/testify/assert"
-	"reflect"
-	"testing"
-)
-
-func TestNttBLS12_381Batch(t *testing.T) {
-	count := 1 << 20
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	NttBatch(&nttResult, false, count, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBLS12_381CompareToGnarkDIF(t *testing.T) {
-	count := 1 << 2
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestINttBLS12_381CompareToGnarkDIT(t *testing.T) {
-	count := 1 << 3
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, true, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBLS12_381(t *testing.T) {
-	count := 1 << 3
-
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	inttResult := make([]G1ScalarField, len(nttResult))
-	copy(inttResult, nttResult)
-
-	assert.Equal(t, inttResult, nttResult)
-	Ntt(&inttResult, true, 0)
-	assert.Equal(t, inttResult, scalars)
-}
-
-func TestNttBatchBLS12_381(t *testing.T) {
-	count := 1 << 5
-	batches := 4
-
-	scalars := GenerateScalars(count*batches, false)
-
-	var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
-
-	for i := 0; i < batches; i++ {
-		start := i * count
-		end := (i + 1) * count
-		batch := make([]G1ScalarField, len(scalars[start:end]))
-		copy(batch, scalars[start:end])
-		scalarVecOfVec = append(scalarVecOfVec, batch)
-	}
-
-	nttBatchResult := make([]G1ScalarField, len(scalars))
-	copy(nttBatchResult, scalars)
-
-	NttBatch(&nttBatchResult, false, count, 0)
-
-	var nttResultVecOfVec [][]G1ScalarField
-
-	for i := 0; i < batches; i++ {
-		// Clone the slice
-		clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
-		copy(clone, scalarVecOfVec[i])
-
-		// Add it to the result vector of vectors
-		nttResultVecOfVec = append(nttResultVecOfVec, clone)
-
-		// Call the ntt_bls12_381 function
-		Ntt(&nttResultVecOfVec[i], false, 0)
-	}
-
-	assert.NotEqual(t, nttBatchResult, scalars)
-
-	// Check that the ntt of each vec of scalars is equal to the intt of the specific batch
-	for i := 0; i < batches; i++ {
-		if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
-			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
-		}
-	}
-}
-
-func BenchmarkNTT(b *testing.B) {
-	LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
-
-	for _, logNTTSize := range LOG_NTT_SIZES {
-		nttSize := 1 << logNTTSize
-		b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
-			scalars := GenerateScalars(nttSize, false)
-
-			nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-			copy(nttResult, scalars)
-			for n := 0; n < b.N; n++ {
-				Ntt(&nttResult, false, 0)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bls12381/utils.go
+++ b/goicicle/curves/bls12381/utils.go
@@ -1,38 +0,0 @@
-package bls12381
-
-import "encoding/binary"
-
-// Function to convert [8]uint32 to [4]uint64
-func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
-	var arr64 [4]uint64
-	for i := 0; i < len(arr32); i += 2 {
-		arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1])
-	}
-	return arr64
-}
-
-func ConvertUint64ArrToUint32Arr4(arr64 [4]uint64) [8]uint32 {
-	var arr32 [8]uint32
-	for i, v := range arr64 {
-		b := make([]byte, 8)
-		binary.LittleEndian.PutUint64(b, v)
-
-		arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
-		arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
-	}
-
-	return arr32
-}
-
-func ConvertUint64ArrToUint32Arr6(arr64 [6]uint64) [12]uint32 {
-	var arr32 [12]uint32
-	for i, v := range arr64 {
-		b := make([]byte, 8)
-		binary.LittleEndian.PutUint64(b, v)
-
-		arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
-		arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
-	}
-
-	return arr32
-}
--- a/goicicle/curves/bls12381/utils_test.go
+++ b/goicicle/curves/bls12381/utils_test.go
@@ -1,81 +0,0 @@
-package bls12381
-
-import (
-	"testing"
-)
-
-func TestConvertUint32ArrToUint64Arr(t *testing.T) {
-	testCases := []struct {
-		name  string
-		input [8]uint32
-		want  [4]uint64
-	}{
-		{
-			name:  "Test with incremental array",
-			input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8},
-			want:  [4]uint64{4294967298, 12884901892, 21474836486, 30064771080},
-		},
-		{
-			name:  "Test with all zeros",
-			input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0},
-			want:  [4]uint64{0, 0, 0, 0},
-		},
-		{
-			name:  "Test with maximum uint32 values",
-			input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295},
-			want:  [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615},
-		},
-		{
-			name:  "Test with alternating min and max uint32 values",
-			input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295},
-			want:  [4]uint64{4294967295, 4294967295, 4294967295, 4294967295},
-		},
-		{
-			name:  "Test with alternating max and min uint32 values",
-			input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0},
-			want:  [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			got := ConvertUint32ArrToUint64Arr(tc.input)
-			if got != tc.want {
-				t.Errorf("got %v, want %v", got, tc.want)
-			}
-		})
-	}
-}
-
-func TestConvertUint64ArrToUint32Arr(t *testing.T) {
-	testCases := []struct {
-		name     string
-		input    [6]uint64
-		expected [12]uint32
-	}{
-		{
-			name:     "test one",
-			input:    [6]uint64{1, 2, 3, 4, 5, 6},
-			expected: [12]uint32{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0},
-		},
-		{
-			name:     "test two",
-			input:    [6]uint64{100, 200, 300, 400, 500, 600},
-			expected: [12]uint32{100, 0, 200, 0, 300, 0, 400, 0, 500, 0, 600, 0},
-		},
-		{
-			name:     "test three",
-			input:    [6]uint64{1000, 2000, 3000, 4000, 5000, 6000},
-			expected: [12]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0, 5000, 0, 6000, 0},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			got := ConvertUint64ArrToUint32Arr6(tc.input)
-			if got != tc.expected {
-				t.Errorf("got %v, want %v", got, tc.expected)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bls12381/vec_mod.go
+++ b/goicicle/curves/bls12381/vec_mod.go
@@ -1,42 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bls12381
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
-// #include "ve_mod_mult.h"
-import "C"
-import (
-	"fmt"
-	"unsafe"
-)
-
-func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
-	scalarVec1C := (*C.BLS12_381_scalar_t)(scalarVec1)
-	scalarVec2C := (*C.BLS12_381_scalar_t)(scalarVec2)
-	sizeC := C.size_t(size)
-
-	ret := C.vec_mod_mult_device_scalar_bls12_381(scalarVec1C, scalarVec2C, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error multiplying scalar vectors")
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bn254/g1.go
+++ b/goicicle/curves/bn254/g1.go
@@ -1,328 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"unsafe"
-
-	"encoding/binary"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-const SCALAR_SIZE = 8
-const BASE_SIZE = 8
-
-type G1ScalarField struct {
-	S [SCALAR_SIZE]uint32
-}
-
-type G1BaseField struct {
-	S [BASE_SIZE]uint32
-}
-
-/*
- * BaseField Constructors
- */
-
-func (f *G1BaseField) SetZero() *G1BaseField {
-	var S [BASE_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1BaseField) SetOne() *G1BaseField {
-	var S [BASE_SIZE]uint32
-
-	S[0] = 1
-
-	f.S = S
-	return f
-}
-
-func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
-	out := (*C.BN254_projective_t)(unsafe.Pointer(p))
-	in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
-
-	C.projective_from_affine_bn254(out, in)
-
-	return p
-}
-
-func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
-	copy(f.S[:], limbs[:])
-
-	return f
-}
-
-/*
- * BaseField methods
- */
-
-func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1BaseField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * ScalarField methods
- */
-
-func (p *G1ScalarField) Random() *G1ScalarField {
-	outC := (*C.BN254_scalar_t)(unsafe.Pointer(p))
-	C.random_scalar_bn254(outC)
-
-	return p
-}
-
-func (f *G1ScalarField) SetZero() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1ScalarField) SetOne() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	S[0] = 1
-	f.S = S
-
-	return f
-}
-
-func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
-	for i, v := range a.S {
-		if b.S[i] != v {
-			return false
-		}
-	}
-	return true
-}
-
-/*
- * ScalarField methods
- */
-
-func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1ScalarField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * PointBN254
- */
-
-type G1ProjectivePoint struct {
-	X, Y, Z G1BaseField
-}
-
-func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
-	var yOne G1BaseField
-	yOne.SetOne()
-
-	var xZero G1BaseField
-	xZero.SetZero()
-
-	var zZero G1BaseField
-	zZero.SetZero()
-
-	f.X = xZero
-	f.Y = yOne
-	f.Z = zZero
-
-	return f
-}
-
-func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
-	// Cast *PointBN254 to *C.BN254_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It'S your responsibility to ensure that the types are compatible.
-	pC := (*C.BN254_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BN254_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it'S fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_bn254(pC, pCompareC))
-}
-
-func (p *G1ProjectivePoint) IsOnCurve() bool {
-	point := (*C.BN254_projective_t)(unsafe.Pointer(p))
-	res := C.projective_is_on_curve_bn254(point)
-
-	return bool(res)
-}
-
-func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
-	outC := (*C.BN254_projective_t)(unsafe.Pointer(p))
-	C.random_projective_bn254(outC)
-
-	return p
-}
-
-func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
-	return &G1PointAffine{
-		X: p.X,
-		Y: p.Y,
-	}
-}
-
-func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
-	var _x G1BaseField
-	var _y G1BaseField
-	var _z G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(x))
-	_y.FromLimbs(GetFixedLimbs(y))
-	_z.FromLimbs(GetFixedLimbs(z))
-
-	p.X = _x
-	p.Y = _y
-	p.Z = _z
-
-	return p
-}
-
-/*
- * PointAffineNoInfinityBN254
- */
-
-type G1PointAffine struct {
-	X, Y G1BaseField
-}
-
-func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
-	in := (*C.BN254_projective_t)(unsafe.Pointer(projective))
-	out := (*C.BN254_affine_t)(unsafe.Pointer(p))
-
-	C.projective_to_affine_bn254(out, in)
-
-	return p
-}
-
-func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
-	var Z G1BaseField
-	Z.SetOne()
-
-	return &G1ProjectivePoint{
-		X: p.X,
-		Y: p.Y,
-		Z: Z,
-	}
-}
-
-func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
-	var _x G1BaseField
-	var _y G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(X))
-	_y.FromLimbs(GetFixedLimbs(Y))
-
-	p.X = _x
-	p.Y = _y
-
-	return p
-}
-
-/*
- * Multiplication
- */
-
-func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	pointsC := (*C.BN254_projective_t)(unsafe.Pointer(&a[0]))
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_point_bn254(pointsC, scalarsC, nElementsC, deviceIdC)
-}
-
-func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_scalar_bn254(aC, bC, nElementsC, deviceIdC)
-}
-
-// Multiply a matrix by a scalar:
-//
-//	`a` - flattenned matrix;
-//	`b` - vector to multiply `a` by;
-func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	c := make([]G1ScalarField, len(b))
-	for i := range c {
-		var p G1ScalarField
-		p.SetZero()
-
-		c[i] = p
-	}
-
-	aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
-	cC := (*C.BN254_scalar_t)(unsafe.Pointer(&c[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.matrix_vec_mod_mult_bn254(aC, bC, cC, nElementsC, deviceIdC)
-}
-
-/*
- * Utils
- */
-
-func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
-	if len(*slice) <= BASE_SIZE {
-		limbs := [BASE_SIZE]uint32{}
-		copy(limbs[:len(*slice)], *slice)
-		return limbs
-	}
-
-	panic("slice has too many elements")
-}
--- a/goicicle/curves/bn254/g1_test.go
+++ b/goicicle/curves/bn254/g1_test.go
@@ -1,198 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"encoding/binary"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewFieldBN254One(t *testing.T) {
-	var oneField G1BaseField
-	oneField.SetOne()
-
-	rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, oneField.S, rawOneField)
-}
-
-func TestNewFieldBN254Zero(t *testing.T) {
-	var zeroField G1BaseField
-	zeroField.SetZero()
-
-	rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, zeroField.S, rawZeroField)
-}
-
-func TestFieldBN254ToBytesLe(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
-	for i, v := range p.X.S {
-		binary.LittleEndian.PutUint32(expected[i*4:], v)
-	}
-
-	assert.Equal(t, p.X.ToBytesLe(), expected)
-	assert.Equal(t, len(p.X.ToBytesLe()), 32)
-}
-
-func TestNewPointBN254Zero(t *testing.T) {
-	var pointZero G1ProjectivePoint
-	pointZero.SetZero()
-
-	var baseOne G1BaseField
-	baseOne.SetOne()
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, pointZero.X, zeroSanity)
-	assert.Equal(t, pointZero.Y, baseOne)
-	assert.Equal(t, pointZero.Z, zeroSanity)
-}
-
-func TestFromProjectiveToAffine(t *testing.T) {
-	var projective G1ProjectivePoint
-	var affine G1PointAffine
-
-	projective.Random()
-
-	affine.FromProjective(&projective)
-	var projective2 G1ProjectivePoint
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestBN254Eq(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-	var p2 G1ProjectivePoint
-	p2.Random()
-
-	assert.Equal(t, p1.Eq(&p1), true)
-	assert.Equal(t, p1.Eq(&p2), false)
-}
-
-func TestBN254StripZ(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-
-	p2ZLess := p1.StripZ()
-
-	assert.IsType(t, G1PointAffine{}, *p2ZLess)
-	assert.Equal(t, p1.X, p2ZLess.X)
-	assert.Equal(t, p1.Y, p2ZLess.Y)
-}
-
-func TestPointBN254fromLimbs(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	x := p.X.Limbs()
-	y := p.Y.Limbs()
-	z := p.Z.Limbs()
-
-	xSlice := x[:]
-	ySlice := y[:]
-	zSlice := z[:]
-
-	var pFromLimbs G1ProjectivePoint
-	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
-
-	assert.Equal(t, pFromLimbs, p)
-}
-
-func TestNewPointAffineNoInfinityBN254Zero(t *testing.T) {
-	var zeroP G1PointAffine
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, zeroP.X, zeroSanity)
-	assert.Equal(t, zeroP.Y, zeroSanity)
-}
-
-func TestPointAffineNoInfinityBN254FromLimbs(t *testing.T) {
-	// Initialize your test values
-	x := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-	y := [8]uint32{9, 10, 11, 12, 13, 14, 15, 16}
-	xSlice := x[:]
-	ySlice := y[:]
-
-	// Execute your function
-	var result G1PointAffine
-	result.FromLimbs(&xSlice, &ySlice)
-
-	var xBase G1BaseField
-	var yBase G1BaseField
-	xBase.FromLimbs(x)
-	yBase.FromLimbs(y)
-
-	// Define your expected result
-	expected := G1PointAffine{
-		X: xBase,
-		Y: yBase,
-	}
-
-	// Test if result is as expected
-	assert.Equal(t, expected, result)
-}
-
-func TestGetFixedLimbs(t *testing.T) {
-	t.Run("case of valid input of length less than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of valid input of length 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of empty input", func(t *testing.T) {
-		slice := []uint32{}
-		expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of input length greater than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
-
-		defer func() {
-			if r := recover(); r == nil {
-				t.Errorf("the code did not panic")
-			}
-		}()
-
-		GetFixedLimbs(&slice)
-	})
-}
--- a/goicicle/curves/bn254/g2.go
+++ b/goicicle/curves/bn254/g2.go
@@ -1,102 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"encoding/binary"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-// G2 extension field
-
-type G2Element [4]uint64
-
-type ExtentionField struct {
-	A0, A1 G2Element
-}
-
-type G2PointAffine struct {
-	X, Y ExtentionField
-}
-
-type G2Point struct {
-	X, Y, Z ExtentionField
-}
-
-func (p *G2Point) Random() *G2Point {
-	outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
-	C.random_g2_projective_bn254(outC)
-
-	return p
-}
-
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.BN254_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_bn254(out, in)
-
-	return p
-}
-
-func (p *G2Point) Eq(pCompare *G2Point) bool {
-	// Cast *PointBN254 to *C.BN254_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It's your responsibility to ensure that the types are compatible.
-	pC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BN254_g2_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it's fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_g2_bn254(pC, pCompareC))
-}
-
-func (f *G2Element) ToBytesLe() []byte {
-	var bytes []byte
-	for _, val := range f {
-		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
-		binary.LittleEndian.PutUint64(buf, val)
-		bytes = append(bytes, buf...)
-	}
-	return bytes
-}
-
-func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
-	out := (*C.BN254_g2_affine_t)(unsafe.Pointer(p))
-	in := (*C.BN254_g2_projective_t)(unsafe.Pointer(projective))
-
-	C.g2_projective_to_affine_bn254(out, in)
-
-	return p
-}
-
-func (p *G2Point) IsOnCurve() bool {
-	// Directly copy memory from the C struct to the Go struct
-	point := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
-	res := C.g2_projective_is_on_curve_bn254(point)
-
-	return bool(res)
-}
--- a/goicicle/curves/bn254/g2_test.go
+++ b/goicicle/curves/bn254/g2_test.go
@@ -1,79 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"fmt"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestG2Eqg2(t *testing.T) {
-	var point G2Point
-
-	point.Random()
-
-	assert.True(t, point.Eq(&point))
-}
-
-func TestG2FromProjectiveToAffine(t *testing.T) {
-	var projective G2Point
-	projective.Random()
-
-	var affine G2PointAffine
-	affine.FromProjective(&projective)
-
-	var projective2 G2Point
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestG2Eqg2NotEqual(t *testing.T) {
-	var point G2Point
-	point.Random()
-
-	var point2 G2Point
-	point2.Random()
-
-	assert.False(t, point.Eq(&point2))
-}
-
-func TestG2ToBytes(t *testing.T) {
-	element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
-	bytes := element.ToBytesLe()
-
-	assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1})
-}
-
-func TestG2ShouldConvertToProjective(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var pointProjective G2Point
-	pointProjective.Random()
-
-	var pointAffine G2PointAffine
-	pointAffine.FromProjective(&pointProjective)
-
-	var proj G2Point
-	proj.FromAffine(&pointAffine)
-
-	assert.True(t, proj.IsOnCurve())
-	assert.True(t, pointProjective.Eq(&proj))
-}
--- a/goicicle/curves/bn254/include/msm.h
+++ b/goicicle/curves/bn254/include/msm.h
@@ -1,94 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdbool.h>
-// msm.h
-
-#ifndef _BN254_MSM_H
-#define _BN254_MSM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BN254 projective and affine structs
-typedef struct BN254_projective_t BN254_projective_t;
-typedef struct BN254_g2_projective_t BN254_g2_projective_t;
-typedef struct BN254_affine_t BN254_affine_t;
-typedef struct BN254_g2_affine_t BN254_g2_affine_t;
-typedef struct BN254_scalar_t BN254_scalar_t;
-typedef cudaStream_t CudaStream_t;
-
-int msm_cuda_bn254(
-  BN254_projective_t* out, BN254_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
-
-int msm_batch_cuda_bn254(
-  BN254_projective_t* out,
-  BN254_affine_t* points,
-  BN254_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_scalar_t* d_scalars,
-  BN254_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_scalar_t* d_scalars,
-  BN254_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id);
-
-int msm_g2_cuda_bn254(
-  BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
-int msm_batch_g2_cuda_bn254(
-  BN254_g2_projective_t* out,
-  BN254_g2_affine_t* points,
-  BN254_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-int commit_g2_cuda_bn254(
-  BN254_g2_projective_t* d_out,
-  BN254_scalar_t* d_scalars,
-  BN254_g2_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-int commit_batch_g2_cuda_bn254(
-  BN254_g2_projective_t* d_out,
-  BN254_scalar_t* d_scalars,
-  BN254_g2_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id,
-  cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BN254_MSM_H */
--- a/goicicle/curves/bn254/include/ntt.h
+++ b/goicicle/curves/bn254/include/ntt.h
@@ -1,193 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ntt.h
-
-#ifndef _BN254_NTT_H
-#define _BN254_NTT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BN254 projective and affine structs
-typedef struct BN254_projective_t BN254_projective_t;
-typedef struct BN254_affine_t BN254_affine_t;
-typedef struct BN254_scalar_t BN254_scalar_t;
-
-typedef struct BN254_g2_projective_t BN254_g2_projective_t;
-typedef struct BN254_g2_affine_t BN254_g2_affine_t;
-
-int ntt_cuda_bn254(BN254_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ntt_batch_cuda_bn254(BN254_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-int ecntt_cuda_bn254(BN254_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bn254(
-  BN254_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-BN254_scalar_t*
-build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-int interpolate_scalars_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int interpolate_scalars_batch_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_batch_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_on_coset_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  BN254_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_evaluations,
-  BN254_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  BN254_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_batch_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_batch_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BN254_scalar_t* coset_powers,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bn254(
-  BN254_scalar_t* d_out,
-  BN254_scalar_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BN254_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BN254_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_batch_cuda_bn254(
-  BN254_projective_t* d_out,
-  BN254_projective_t* d_coefficients,
-  BN254_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BN254_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bn254(
-  BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bn254(
-  BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bn254(
-  BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
-int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);
-
-// points g1
-int to_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream);
-
-// points g2
-int to_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BN254_NTT_H */
--- a/goicicle/curves/bn254/include/projective.h
+++ b/goicicle/curves/bn254/include/projective.h
@@ -1,50 +0,0 @@
-
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// projective.h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BN254_projective_t BN254_projective_t;
-typedef struct BN254_g2_projective_t BN254_g2_projective_t;
-typedef struct BN254_affine_t BN254_affine_t;
-typedef struct BN254_g2_affine_t BN254_g2_affine_t;
-typedef struct BN254_scalar_t BN254_scalar_t;
-
-bool projective_is_on_curve_bn254(BN254_projective_t* point1);
-
-int random_scalar_bn254(BN254_scalar_t* out);
-int random_projective_bn254(BN254_projective_t* out);
-BN254_projective_t* projective_zero_bn254();
-int projective_to_affine_bn254(BN254_affine_t* out, BN254_projective_t* point1);
-int projective_from_affine_bn254(BN254_projective_t* out, BN254_affine_t* point1);
-
-int random_g2_projective_bn254(BN254_g2_projective_t* out);
-int g2_projective_to_affine_bn254(BN254_g2_affine_t* out, BN254_g2_projective_t* point1);
-int g2_projective_from_affine_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* point1);
-bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t* point1);
-
-bool eq_bn254(BN254_projective_t* point1, BN254_projective_t* point2);
-bool eq_g2_bn254(BN254_g2_projective_t* point1, BN254_g2_projective_t* point2);
-
-#ifdef __cplusplus
-}
-#endif
--- a/goicicle/curves/bn254/msm.go
+++ b/goicicle/curves/bn254/msm.go
@@ -1,209 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
-// #include "msm.h"
-import "C"
-
-func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BN254_projective_t)(unsafe.Pointer(out))
-	ret := C.msm_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_cuda_bn254 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(out))
-
-	ret := C.msm_g2_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_g2_cuda_bn254 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G1ProjectivePoint, batchSize)
-
-	for i := 0; i < len(out); i++ {
-		var p G1ProjectivePoint
-		p.SetZero()
-
-		out[i] = p
-	}
-
-	outC := (*C.BN254_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G2Point, batchSize)
-
-	outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_g2_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BN254_projective_t)(d_out)
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	pointsC := (*C.BN254_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_cuda_bn254(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BN254_g2_projective_t)(d_out)
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	pointsC := (*C.BN254_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_g2_cuda_bn254(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BN254_projective_t)(d_out)
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	pointsC := (*C.BN254_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.commit_batch_cuda_bn254(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BN254_g2_projective_t)(d_out)
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	pointsC := (*C.BN254_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.msm_batch_g2_cuda_bn254(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bn254/msm_test.go
+++ b/goicicle/curves/bn254/msm_test.go
@@ -1,360 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"fmt"
-	"math"
-	"testing"
-	"time"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-	"github.com/stretchr/testify/assert"
-)
-
-func GeneratePoints(count int) []G1PointAffine {
-	// Declare a slice of integers
-	var points []G1PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		var pointProjective G1ProjectivePoint
-		pointProjective.Random()
-
-		var pointAffine G1PointAffine
-		pointAffine.FromProjective(&pointProjective)
-
-		points = append(points, pointAffine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func GeneratePointsProj(count int) []G1ProjectivePoint {
-	// Declare a slice of integers
-	var points []G1ProjectivePoint
-	// Use a loop to populate the slice
-	for i := 0; i < count; i++ {
-		var p G1ProjectivePoint
-		p.Random()
-
-		points = append(points, p)
-	}
-
-	return points
-}
-
-func GenerateScalars(count int, skewed bool) []G1ScalarField {
-	// Declare a slice of integers
-	var scalars []G1ScalarField
-
-	var rand G1ScalarField
-	var zero G1ScalarField
-	var one G1ScalarField
-	var randLarge G1ScalarField
-
-	zero.SetZero()
-	one.SetOne()
-	randLarge.Random()
-
-	if skewed && count > 1_200_000 {
-		for i := 0; i < count-1_200_000; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-
-		for i := 0; i < 600_000; i++ {
-			scalars = append(scalars, randLarge)
-		}
-		for i := 0; i < 400_000; i++ {
-			scalars = append(scalars, zero)
-		}
-		for i := 0; i < 200_000; i++ {
-			scalars = append(scalars, one)
-		}
-	} else {
-		for i := 0; i < count; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-	}
-
-	return scalars[:count]
-}
-
-func TestMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G1ProjectivePoint)
-		startTime := time.Now()
-		_, e := Msm(out, points, scalars, 0) // non mont
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		assert.Equal(t, e, nil, "error should be nil")
-
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func TestCommitMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1<<v - 1
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := count * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := Commit(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G1ProjectivePoint, 1)
-		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 96)
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.True(t, outHost[0].IsOnCurve())
-	}
-}
-
-func BenchmarkCommit(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := msmSize * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := msmSize * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
-
-				if e != 0 {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestBatchMSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GeneratePoints(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBN254 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
-
-func BenchmarkMSM(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G1ProjectivePoint)
-				_, e := Msm(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-// G2
-func GenerateG2Points(count int) []G2PointAffine {
-	// Declare a slice of integers
-	var points []G2PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		fmt.Print() // this prevents the test from hanging. TODO: figure out why
-		var p G2Point
-		p.Random()
-		var affine G2PointAffine
-		affine.FromProjective(&p)
-
-		points = append(points, affine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func TestMsmG2BN254(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G2Point)
-		_, e := MsmG2(out, points, scalars, 0)
-		assert.Equal(t, e, nil, "error should be nil")
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func BenchmarkMsmG2BN254(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GenerateG2Points(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G2Point)
-				_, e := MsmG2(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestCommitG2MSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		var sizeCheckG2PointAffine G2PointAffine
-		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
-
-		var sizeCheckG2Point G2Point
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
-		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := CommitG2(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G2Point, 1)
-		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.Equal(t, len(outHost), 1)
-		result := outHost[0]
-
-		assert.True(t, result.IsOnCurve())
-	}
-}
-
-func TestBatchG2MSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GenerateG2Points(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBN254 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
--- a/goicicle/curves/bn254/ntt.go
+++ b/goicicle/curves/bn254/ntt.go
@@ -1,222 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
-// #include "ntt.h"
-import "C"
-
-const (
-	NONE = 0
-	DIF  = 1
-	DIT  = 2
-)
-
-func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-
-	ret := C.ntt_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
-
-	return uint64(ret)
-}
-
-func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
-	scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	isInverseC := C.bool(isInverse)
-	batchSizeC := C.uint32_t(batchSize)
-	deviceIdC := C.size_t(deviceId)
-
-	ret := C.ntt_batch_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
-	valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-
-	ret := C.ecntt_cuda_bn254(valuesC, n, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
-	valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-	batchSizeC := C.uint32_t(batchSize)
-
-	ret := C.ecntt_batch_cuda_bn254(valuesC, n, batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
-	domain_size := C.uint32_t(d_size)
-	logn := C.uint32_t(log_d_size)
-	is_inverse := C.bool(inverse)
-
-	dp := C.build_domain_cuda_bn254(domain_size, logn, is_inverse, 0, 0)
-
-	if dp == nil {
-		err = errors.New("nullptr returned from generating twiddles")
-		return unsafe.Pointer(nil), err
-	}
-
-	return unsafe.Pointer(dp), nil
-}
-
-// Reverses d_scalars in-place
-func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	lenC := C.int(len)
-	if success := C.reverse_order_scalars_cuda_bn254(scalarsC, lenC, 0, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
-	size_d := size * 32
-	dp, err := goicicle.CudaMalloc(size_d)
-
-	if err != nil {
-		return nil
-	}
-
-	d_out := (*C.BN254_scalar_t)(dp)
-	scalarsC := (*C.BN254_scalar_t)(scalars)
-	twiddlesC := (*C.BN254_scalar_t)(twiddles)
-	cosetPowersC := (*C.BN254_scalar_t)(cosetPowers)
-	sizeC := C.uint(size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.interpolate_scalars_on_coset_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
-	} else {
-		ret = C.interpolate_scalars_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
-	}
-	if ret != 0 {
-		fmt.Print("error interpolating")
-	}
-
-	return unsafe.Pointer(d_out)
-}
-
-func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
-	scalars_outC := (*C.BN254_scalar_t)(scalars_out)
-	scalarsC := (*C.BN254_scalar_t)(scalars)
-	twiddlesC := (*C.BN254_scalar_t)(twiddles)
-	coset_powersC := (*C.BN254_scalar_t)(coset_powers)
-	sizeC := C.uint(scalars_size)
-	twiddlesC_size := C.uint(twiddles_size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.evaluate_scalars_on_coset_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
-	} else {
-		ret = C.evaluate_scalars_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
-	}
-
-	if ret != 0 {
-		fmt.Print("error interpolating")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BN254_scalar_t)(in1_d)
-	in2_dC := (*C.BN254_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.add_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error adding scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BN254_scalar_t)(in1_d)
-	in2_dC := (*C.BN254_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.sub_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error subtracting scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.to_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BN254_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.from_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BN254_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_cuda_bn254(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BN254_g2_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_g2_cuda_bn254(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
--- a/goicicle/curves/bn254/ntt_test.go
+++ b/goicicle/curves/bn254/ntt_test.go
@@ -1,148 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-import (
-	"fmt"
-	"github.com/stretchr/testify/assert"
-	"reflect"
-	"testing"
-)
-
-func TestNttBN254Batch(t *testing.T) {
-	count := 1 << 20
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	NttBatch(&nttResult, false, count, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBN254CompareToGnarkDIF(t *testing.T) {
-	count := 1 << 2
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestINttBN254CompareToGnarkDIT(t *testing.T) {
-	count := 1 << 3
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, true, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBN254(t *testing.T) {
-	count := 1 << 3
-
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	inttResult := make([]G1ScalarField, len(nttResult))
-	copy(inttResult, nttResult)
-
-	assert.Equal(t, inttResult, nttResult)
-	Ntt(&inttResult, true, 0)
-	assert.Equal(t, inttResult, scalars)
-}
-
-func TestNttBatchBN254(t *testing.T) {
-	count := 1 << 5
-	batches := 4
-
-	scalars := GenerateScalars(count*batches, false)
-
-	var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
-
-	for i := 0; i < batches; i++ {
-		start := i * count
-		end := (i + 1) * count
-		batch := make([]G1ScalarField, len(scalars[start:end]))
-		copy(batch, scalars[start:end])
-		scalarVecOfVec = append(scalarVecOfVec, batch)
-	}
-
-	nttBatchResult := make([]G1ScalarField, len(scalars))
-	copy(nttBatchResult, scalars)
-
-	NttBatch(&nttBatchResult, false, count, 0)
-
-	var nttResultVecOfVec [][]G1ScalarField
-
-	for i := 0; i < batches; i++ {
-		// Clone the slice
-		clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
-		copy(clone, scalarVecOfVec[i])
-
-		// Add it to the result vector of vectors
-		nttResultVecOfVec = append(nttResultVecOfVec, clone)
-
-		// Call the ntt_bn254 function
-		Ntt(&nttResultVecOfVec[i], false, 0)
-	}
-
-	assert.NotEqual(t, nttBatchResult, scalars)
-
-	// Check that the ntt of each vec of scalars is equal to the intt of the specific batch
-	for i := 0; i < batches; i++ {
-		if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
-			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
-		}
-	}
-}
-
-func BenchmarkNTT(b *testing.B) {
-	LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
-
-	for _, logNTTSize := range LOG_NTT_SIZES {
-		nttSize := 1 << logNTTSize
-		b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
-			scalars := GenerateScalars(nttSize, false)
-
-			nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-			copy(nttResult, scalars)
-			for n := 0; n < b.N; n++ {
-				Ntt(&nttResult, false, 0)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bn254/utils.go
+++ b/goicicle/curves/bn254/utils.go
@@ -1,48 +0,0 @@
-package bn254
-
-import (
-	"encoding/binary"
-	"fmt"
-	"log"
-	"regexp"
-	"runtime"
-	"time"
-)
-
-// Function to convert [8]uint32 to [4]uint64
-func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
-	var arr64 [4]uint64
-	for i := 0; i < len(arr32); i += 2 {
-		arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1])
-	}
-	return arr64
-}
-
-func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 {
-	var arr32 [8]uint32
-	for i, v := range arr64 {
-		b := make([]byte, 8)
-		binary.LittleEndian.PutUint64(b, v)
-
-		arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
-		arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
-	}
-
-	return arr32
-}
-
-func TimeTrack(start time.Time) {
-	elapsed := time.Since(start)
-
-	// Skip this function, and fetch the PC and file for its parent.
-	pc, _, _, _ := runtime.Caller(1)
-
-	// Retrieve a function object this functions parent.
-	funcObj := runtime.FuncForPC(pc)
-
-	// Regex to extract just the function name (and not the module path).
-	runtimeFunc := regexp.MustCompile(`^.*\.(.*)$`)
-	name := runtimeFunc.ReplaceAllString(funcObj.Name(), "$1")
-
-	log.Println(fmt.Sprintf("%s took %s", name, elapsed))
-}
--- a/goicicle/curves/bn254/utils_test.go
+++ b/goicicle/curves/bn254/utils_test.go
@@ -1,81 +0,0 @@
-package bn254
-
-import (
-	"testing"
-)
-
-func TestConvertUint32ArrToUint64Arr(t *testing.T) {
-	testCases := []struct {
-		name  string
-		input [8]uint32
-		want  [4]uint64
-	}{
-		{
-			name:  "Test with incremental array",
-			input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8},
-			want:  [4]uint64{4294967298, 12884901892, 21474836486, 30064771080},
-		},
-		{
-			name:  "Test with all zeros",
-			input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0},
-			want:  [4]uint64{0, 0, 0, 0},
-		},
-		{
-			name:  "Test with maximum uint32 values",
-			input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295},
-			want:  [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615},
-		},
-		{
-			name:  "Test with alternating min and max uint32 values",
-			input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295},
-			want:  [4]uint64{4294967295, 4294967295, 4294967295, 4294967295},
-		},
-		{
-			name:  "Test with alternating max and min uint32 values",
-			input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0},
-			want:  [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			got := ConvertUint32ArrToUint64Arr(tc.input)
-			if got != tc.want {
-				t.Errorf("got %v, want %v", got, tc.want)
-			}
-		})
-	}
-}
-
-func TestConvertUint64ArrToUint32Arr(t *testing.T) {
-	testCases := []struct {
-		name     string
-		input    [4]uint64
-		expected [8]uint32
-	}{
-		{
-			name:     "test one",
-			input:    [4]uint64{1, 2, 3, 4},
-			expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0},
-		},
-		{
-			name:     "test two",
-			input:    [4]uint64{100, 200, 300, 400},
-			expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0},
-		},
-		{
-			name:     "test three",
-			input:    [4]uint64{1000, 2000, 3000, 4000},
-			expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			got := ConvertUint64ArrToUint32Arr(tc.input)
-			if got != tc.expected {
-				t.Errorf("got %v, want %v", got, tc.expected)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bn254/vec_mod.go
+++ b/goicicle/curves/bn254/vec_mod.go
@@ -1,42 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bn254
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
-// #include "ve_mod_mult.h"
-import "C"
-import (
-	"fmt"
-	"unsafe"
-)
-
-func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
-	scalarVec1C := (*C.BN254_scalar_t)(scalarVec1)
-	scalarVec2C := (*C.BN254_scalar_t)(scalarVec2)
-	sizeC := C.size_t(size)
-
-	ret := C.vec_mod_mult_device_scalar_bn254(scalarVec1C, scalarVec2C, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error multiplying scalar vectors")
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bw6761/g1.go
+++ b/goicicle/curves/bw6761/g1.go
@@ -1,328 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"unsafe"
-
-	"encoding/binary"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-const SCALAR_SIZE = 12
-const BASE_SIZE = 24
-
-type G1ScalarField struct {
-	S [SCALAR_SIZE]uint32
-}
-
-type G1BaseField struct {
-	S [BASE_SIZE]uint32
-}
-
-/*
- * BaseField Constructors
- */
-
-func (f *G1BaseField) SetZero() *G1BaseField {
-	var S [BASE_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1BaseField) SetOne() *G1BaseField {
-	var S [BASE_SIZE]uint32
-
-	S[0] = 1
-
-	f.S = S
-	return f
-}
-
-func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
-	out := (*C.BW6761_projective_t)(unsafe.Pointer(p))
-	in := (*C.BW6761_affine_t)(unsafe.Pointer(affine))
-
-	C.projective_from_affine_bw6_761(out, in)
-
-	return p
-}
-
-func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
-	copy(f.S[:], limbs[:])
-
-	return f
-}
-
-/*
- * BaseField methods
- */
-
-func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1BaseField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * ScalarField methods
- */
-
-func (p *G1ScalarField) Random() *G1ScalarField {
-	outC := (*C.BW6761_scalar_t)(unsafe.Pointer(p))
-	C.random_scalar_bw6_761(outC)
-
-	return p
-}
-
-func (f *G1ScalarField) SetZero() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1ScalarField) SetOne() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	S[0] = 1
-	f.S = S
-
-	return f
-}
-
-func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
-	for i, v := range a.S {
-		if b.S[i] != v {
-			return false
-		}
-	}
-	return true
-}
-
-/*
- * ScalarField methods
- */
-
-func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1ScalarField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * PointBW6761
- */
-
-type G1ProjectivePoint struct {
-	X, Y, Z G1BaseField
-}
-
-func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
-	var yOne G1BaseField
-	yOne.SetOne()
-
-	var xZero G1BaseField
-	xZero.SetZero()
-
-	var zZero G1BaseField
-	zZero.SetZero()
-
-	f.X = xZero
-	f.Y = yOne
-	f.Z = zZero
-
-	return f
-}
-
-func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
-	// Cast *PointBW6761 to *C.BW6761_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It'S your responsibility to ensure that the types are compatible.
-	pC := (*C.BW6761_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BW6761_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it'S fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_bw6_761(pC, pCompareC))
-}
-
-func (p *G1ProjectivePoint) IsOnCurve() bool {
-	point := (*C.BW6761_projective_t)(unsafe.Pointer(p))
-	res := C.projective_is_on_curve_bw6_761(point)
-
-	return bool(res)
-}
-
-func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
-	outC := (*C.BW6761_projective_t)(unsafe.Pointer(p))
-	C.random_projective_bw6_761(outC)
-
-	return p
-}
-
-func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
-	return &G1PointAffine{
-		X: p.X,
-		Y: p.Y,
-	}
-}
-
-func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
-	var _x G1BaseField
-	var _y G1BaseField
-	var _z G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(x))
-	_y.FromLimbs(GetFixedLimbs(y))
-	_z.FromLimbs(GetFixedLimbs(z))
-
-	p.X = _x
-	p.Y = _y
-	p.Z = _z
-
-	return p
-}
-
-/*
- * PointAffineNoInfinityBW6761
- */
-
-type G1PointAffine struct {
-	X, Y G1BaseField
-}
-
-func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
-	in := (*C.BW6761_projective_t)(unsafe.Pointer(projective))
-	out := (*C.BW6761_affine_t)(unsafe.Pointer(p))
-
-	C.projective_to_affine_bw6_761(out, in)
-
-	return p
-}
-
-func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
-	var Z G1BaseField
-	Z.SetOne()
-
-	return &G1ProjectivePoint{
-		X: p.X,
-		Y: p.Y,
-		Z: Z,
-	}
-}
-
-func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
-	var _x G1BaseField
-	var _y G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(X))
-	_y.FromLimbs(GetFixedLimbs(Y))
-
-	p.X = _x
-	p.Y = _y
-
-	return p
-}
-
-/*
- * Multiplication
- */
-
-func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	pointsC := (*C.BW6761_projective_t)(unsafe.Pointer(&a[0]))
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_point_bw6_761(pointsC, scalarsC, nElementsC, deviceIdC)
-}
-
-func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_scalar_bw6_761(aC, bC, nElementsC, deviceIdC)
-}
-
-// Multiply a matrix by a scalar:
-//
-//	`a` - flattenned matrix;
-//	`b` - vector to multiply `a` by;
-func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	c := make([]G1ScalarField, len(b))
-	for i := range c {
-		var p G1ScalarField
-		p.SetZero()
-
-		c[i] = p
-	}
-
-	aC := (*C.BW6761_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.BW6761_scalar_t)(unsafe.Pointer(&b[0]))
-	cC := (*C.BW6761_scalar_t)(unsafe.Pointer(&c[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.matrix_vec_mod_mult_bw6_761(aC, bC, cC, nElementsC, deviceIdC)
-}
-
-/*
- * Utils
- */
-
-func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
-	if len(*slice) <= BASE_SIZE {
-		limbs := [BASE_SIZE]uint32{}
-		copy(limbs[:len(*slice)], *slice)
-		return limbs
-	}
-
-	panic("slice has too many elements")
-}
--- a/goicicle/curves/bw6761/g1_test.go
+++ b/goicicle/curves/bw6761/g1_test.go
@@ -1,212 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"encoding/binary"
-	"fmt"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func generateUint32Array(length int, isZero bool) []uint32 {
-	arr := make([]uint32, length)
-	for i := 0; i < length; i++ {
-		if isZero {
-			arr[i] = 0x0
-		} else {
-			arr[i] = uint32(i + 1) // You can modify this line to fill the array as needed
-		}
-	}
-	return arr
-}
-
-func TestNewFieldBW6761One(t *testing.T) {
-	var oneField G1BaseField
-	oneField.SetOne()
-
-	rawOneField := [24]uint32([24]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, oneField.S, rawOneField)
-}
-
-func TestNewFieldBW6761Zero(t *testing.T) {
-	var zeroField G1BaseField
-	zeroField.SetZero()
-
-	rawZeroField := [24]uint32([24]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, zeroField.S, rawZeroField)
-}
-
-func TestFieldBW6761ToBytesLe(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
-	for i, v := range p.X.S {
-		binary.LittleEndian.PutUint32(expected[i*4:], v)
-	}
-
-	assert.Equal(t, p.X.ToBytesLe(), expected)
-	assert.Equal(t, len(p.X.ToBytesLe()), 96)
-}
-
-func TestNewPointBW6761Zero(t *testing.T) {
-	var pointZero G1ProjectivePoint
-	pointZero.SetZero()
-
-	var baseOne G1BaseField
-	baseOne.SetOne()
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, pointZero.X, zeroSanity)
-	assert.Equal(t, pointZero.Y, baseOne)
-	assert.Equal(t, pointZero.Z, zeroSanity)
-}
-
-func TestFromProjectiveToAffine(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var projective G1ProjectivePoint
-	var affine G1PointAffine
-
-	projective.Random()
-
-	affine.FromProjective(&projective)
-	var projective2 G1ProjectivePoint
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestBW6761Eq(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-	var p2 G1ProjectivePoint
-	p2.Random()
-
-	assert.Equal(t, p1.Eq(&p1), true)
-	assert.Equal(t, p1.Eq(&p2), false)
-}
-
-func TestBW6761StripZ(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-
-	p2ZLess := p1.StripZ()
-
-	assert.IsType(t, G1PointAffine{}, *p2ZLess)
-	assert.Equal(t, p1.X, p2ZLess.X)
-	assert.Equal(t, p1.Y, p2ZLess.Y)
-}
-
-func TestPointBW6761fromLimbs(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	x := p.X.Limbs()
-	y := p.Y.Limbs()
-	z := p.Z.Limbs()
-
-	xSlice := x[:]
-	ySlice := y[:]
-	zSlice := z[:]
-
-	var pFromLimbs G1ProjectivePoint
-	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
-
-	assert.Equal(t, pFromLimbs, p)
-}
-
-func TestNewPointAffineNoInfinityBW6761Zero(t *testing.T) {
-	var zeroP G1PointAffine
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, zeroP.X, zeroSanity)
-	assert.Equal(t, zeroP.Y, zeroSanity)
-}
-
-func TestPointAffineNoInfinityBW6761FromLimbs(t *testing.T) {
-	// Initialize your test values
-	x := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}
-	y := [24]uint32{1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}
-	xSlice := x[:]
-	ySlice := y[:]
-
-	// Execute your function
-	var result G1PointAffine
-	result.FromLimbs(&xSlice, &ySlice)
-
-	var xBase G1BaseField
-	var yBase G1BaseField
-	xBase.FromLimbs(x)
-	yBase.FromLimbs(y)
-
-	// Define your expected result
-	expected := G1PointAffine{
-		X: xBase,
-		Y: yBase,
-	}
-
-	// Test if result is as expected
-	assert.Equal(t, expected, result)
-}
-
-func TestGetFixedLimbs(t *testing.T) {
-	t.Run("case of valid input of length less than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
-		expected := [24]uint32{1, 2, 3, 4, 5, 6, 7, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of valid input of length 24", func(t *testing.T) {
-		slice := generateUint32Array(24, false)
-		expected := [24]uint32(generateUint32Array(24, false))
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of empty input", func(t *testing.T) {
-		slice := []uint32{}
-		expected := [24]uint32(generateUint32Array(24, true))
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of input length greater than 24", func(t *testing.T) {
-		slice := generateUint32Array(25, false)
-
-		defer func() {
-			if r := recover(); r == nil {
-				t.Errorf("the code did not panic")
-			}
-		}()
-
-		GetFixedLimbs(&slice)
-	})
-}
--- a/goicicle/curves/bw6761/g2.go
+++ b/goicicle/curves/bw6761/g2.go
@@ -1,98 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"encoding/binary"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-// G2 extension field
-
-type G2Element [12]uint64
-
-type G2PointAffine struct {
-	X, Y G2Element
-}
-
-type G2Point struct {
-	X, Y, Z G2Element
-}
-
-func (p *G2Point) Random() *G2Point {
-	outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
-	C.random_g2_projective_bw6_761(outC)
-
-	return p
-}
-
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.BW6761_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_bw6_761(out, in)
-
-	return p
-}
-
-func (p *G2Point) Eq(pCompare *G2Point) bool {
-	// Cast *PointBW6761 to *C.BW6761_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It's your responsibility to ensure that the types are compatible.
-	pC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it's fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_g2_bw6_761(pC, pCompareC))
-}
-
-func (f *G2Element) ToBytesLe() []byte {
-	var bytes []byte
-	for _, val := range f {
-		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
-		binary.LittleEndian.PutUint64(buf, val)
-		bytes = append(bytes, buf...)
-	}
-	return bytes
-}
-
-func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
-	out := (*C.BW6761_g2_affine_t)(unsafe.Pointer(p))
-	in := (*C.BW6761_g2_projective_t)(unsafe.Pointer(projective))
-
-	C.g2_projective_to_affine_bw6_761(out, in)
-
-	return p
-}
-
-func (p *G2Point) IsOnCurve() bool {
-	// Directly copy memory from the C struct to the Go struct
-	point := (*C.BW6761_g2_projective_t)(unsafe.Pointer(p))
-	res := C.g2_projective_is_on_curve_bw6_761(point)
-
-	return bool(res)
-}
--- a/goicicle/curves/bw6761/g2_test.go
+++ b/goicicle/curves/bw6761/g2_test.go
@@ -1,83 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"fmt"
-	"testing"
-	"unsafe"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestG2Eqg2(t *testing.T) {
-	var point G2Point
-
-	point.Random()
-
-	assert.True(t, point.Eq(&point))
-}
-
-func TestG2FromProjectiveToAffine(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var projective G2Point
-	projective.Random()
-
-	var affine G2PointAffine
-	affine.FromProjective(&projective)
-
-	var projective2 G2Point
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestG2Eqg2NotEqual(t *testing.T) {
-	var point G2Point
-	point.Random()
-
-	var point2 G2Point
-	point2.Random()
-
-	assert.False(t, point.Eq(&point2))
-}
-
-func TestG2ToBytes(t *testing.T) {
-	var point G2Point
-	var element G2Element
-	point.Random()
-	bytes := point.X.ToBytesLe()
-
-	assert.Equal(t, len(bytes), int(unsafe.Sizeof(element)))
-}
-
-func TestG2ShouldConvertToProjective(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var pointProjective G2Point
-	pointProjective.Random()
-
-	var pointAffine G2PointAffine
-	pointAffine.FromProjective(&pointProjective)
-
-	var proj G2Point
-	proj.FromAffine(&pointAffine)
-
-	assert.True(t, proj.IsOnCurve())
-	assert.True(t, pointProjective.Eq(&proj))
-}
--- a/goicicle/curves/bw6761/include/msm.h
+++ b/goicicle/curves/bw6761/include/msm.h
@@ -1,101 +0,0 @@
-
-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdbool.h>
-// msm.h
-
-#ifndef _BW6761_MSM_H
-#define _BW6761_MSM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BW6761 projective and affine structs
-typedef struct BW6761_projective_t BW6761_projective_t;
-typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
-typedef struct BW6761_affine_t BW6761_affine_t;
-typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
-typedef struct BW6761_scalar_t BW6761_scalar_t;
-typedef cudaStream_t CudaStream_t;
-
-int msm_cuda_bw6_761(
-  BW6761_projective_t* out, BW6761_affine_t* points, BW6761_scalar_t* scalars, size_t count, size_t device_id);
-
-int msm_batch_cuda_bw6_761(
-  BW6761_projective_t* out,
-  BW6761_affine_t* points,
-  BW6761_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_scalar_t* d_scalars,
-  BW6761_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_scalar_t* d_scalars,
-  BW6761_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id);
-
-int msm_g2_cuda_bw6_761(
-  BW6761_g2_projective_t* out,
-  BW6761_g2_affine_t* points,
-  BW6761_scalar_t* scalars,
-  size_t count,
-  size_t device_id);
-
-int msm_batch_g2_cuda_bw6_761(
-  BW6761_g2_projective_t* out,
-  BW6761_g2_affine_t* points,
-  BW6761_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_g2_cuda_bw6_761(
-  BW6761_g2_projective_t* d_out,
-  BW6761_scalar_t* d_scalars,
-  BW6761_g2_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_g2_cuda_bw6_761(
-  BW6761_g2_projective_t* d_out,
-  BW6761_scalar_t* d_scalars,
-  BW6761_g2_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id,
-  cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BW6761_MSM_H */
--- a/goicicle/curves/bw6761/include/ntt.h
+++ b/goicicle/curves/bw6761/include/ntt.h
@@ -1,198 +0,0 @@
-
-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// ntt.h
-
-#ifndef _BW6761_NTT_H
-#define _BW6761_NTT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of BW6761 projective and affine structs
-typedef struct BW6761_projective_t BW6761_projective_t;
-typedef struct BW6761_affine_t BW6761_affine_t;
-typedef struct BW6761_scalar_t BW6761_scalar_t;
-
-typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
-typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
-
-int ntt_cuda_bw6_761(BW6761_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ntt_batch_cuda_bw6_761(
-  BW6761_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-int ecntt_cuda_bw6_761(BW6761_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_bw6_761(
-  BW6761_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-BW6761_scalar_t* 
-build_domain_cuda_bw6_761(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-
-int interpolate_scalars_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int interpolate_scalars_batch_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_batch_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_on_coset_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  BW6761_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_evaluations,
-  BW6761_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  BW6761_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-
-int evaluate_scalars_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_batch_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_batch_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BW6761_scalar_t* coset_powers,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_bw6_761(
-  BW6761_scalar_t* d_out,
-  BW6761_scalar_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BW6761_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  BW6761_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_batch_cuda_bw6_761(
-  BW6761_projective_t* d_out,
-  BW6761_projective_t* d_coefficients,
-  BW6761_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  BW6761_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-
-int reverse_order_scalars_cuda_bw6_761(BW6761_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_bw6_761(
-  BW6761_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int reverse_order_points_cuda_bw6_761(BW6761_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_bw6_761(
-  BW6761_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_bw6_761(
-  BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_bw6_761(
-  BW6761_scalar_t* d_out, BW6761_scalar_t* d_in1, BW6761_scalar_t* d_in2, unsigned n, size_t stream);
-int to_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_scalars_cuda_bw6_761(BW6761_scalar_t* d_inout, unsigned n, size_t stream);
-
-// points g1
-int to_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_cuda_bw6_761(BW6761_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_cuda_bw6_761(BW6761_affine_t* d_inout, unsigned n, size_t stream);
-
-// points g2
-int to_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_g2_cuda_bw6_761(BW6761_g2_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_g2_cuda_bw6_761(BW6761_g2_affine_t* d_inout, unsigned n, size_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BW6761_NTT_H */
--- a/goicicle/curves/bw6761/include/projective.h
+++ b/goicicle/curves/bw6761/include/projective.h
@@ -1,50 +0,0 @@
-
-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <cuda.h>
-#include <stdbool.h>
-// projective.h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BW6761_projective_t BW6761_projective_t;
-typedef struct BW6761_g2_projective_t BW6761_g2_projective_t;
-typedef struct BW6761_affine_t BW6761_affine_t;
-typedef struct BW6761_g2_affine_t BW6761_g2_affine_t;
-typedef struct BW6761_scalar_t BW6761_scalar_t;
-
-bool projective_is_on_curve_bw6_761(BW6761_projective_t* point1);
-
-int random_scalar_bw6_761(BW6761_scalar_t* out); 
-int random_projective_bw6_761(BW6761_projective_t* out);
-BW6761_projective_t* projective_zero_bw6_761();
-int projective_to_affine_bw6_761(BW6761_affine_t* out, BW6761_projective_t* point1);
-int projective_from_affine_bw6_761(BW6761_projective_t* out, BW6761_affine_t* point1);
-
-int random_g2_projective_bw6_761(BW6761_g2_projective_t* out);
-int g2_projective_to_affine_bw6_761(BW6761_g2_affine_t* out, BW6761_g2_projective_t* point1);
-int g2_projective_from_affine_bw6_761(BW6761_g2_projective_t* out, BW6761_g2_affine_t* point1);
-bool g2_projective_is_on_curve_bw6_761(BW6761_g2_projective_t* point1);
-
-bool eq_bw6_761(BW6761_projective_t* point1, BW6761_projective_t* point2);
-bool eq_g2_bw6_761(BW6761_g2_projective_t* point1, BW6761_g2_projective_t* point2);
-
-#ifdef __cplusplus
-}
-#endif
--- a/goicicle/curves/bw6761/include/ve_mod_mult.h
+++ b/goicicle/curves/bw6761/include/ve_mod_mult.h
@@ -1,49 +0,0 @@
-
-	// Copyright 2023 Ingonyama
-	//
-	// Licensed under the Apache License, Version 2.0 (the "License");
-	// you may not use this file except in compliance with the License.
-	// You may obtain a copy of the License at
-	//
-	//     http://www.apache.org/licenses/LICENSE-2.0
-	//
-	// Unless required by applicable law or agreed to in writing, software
-	// distributed under the License is distributed on an "AS IS" BASIS,
-	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	// See the License for the specific language governing permissions and
-	// limitations under the License.
-	
-// Code generated by Ingonyama DO NOT EDIT
-
-#include <stdbool.h>
-#include <cuda.h>
-// ve_mod_mult.h
-
-#ifndef _BW6761_VEC_MULT_H
-#define _BW6761_VEC_MULT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct BW6761_projective_t BW6761_projective_t;
-typedef struct BW6761_scalar_t BW6761_scalar_t;
-
-int32_t vec_mod_mult_point_bw6_761(
-  BW6761_projective_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_bw6_761(
-  BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_bw6_761(
-  BW6761_scalar_t* inout, BW6761_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_bw6_761(
-  BW6761_scalar_t* matrix_flattened,
-  BW6761_scalar_t* input,
-  BW6761_scalar_t* output,
-  size_t n_elments,
-  size_t device_id);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _BW6761_VEC_MULT_H */
--- a/goicicle/curves/bw6761/msm.go
+++ b/goicicle/curves/bw6761/msm.go
@@ -1,209 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
-// #include "msm.h"
-import "C"
-
-func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BW6761_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BW6761_projective_t)(unsafe.Pointer(out))
-	ret := C.msm_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_cuda_bw6_761 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.BW6761_g2_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(out))
-
-	ret := C.msm_g2_cuda_bw6_761(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_g2_cuda_bw6_761 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G1ProjectivePoint, batchSize)
-
-	for i := 0; i < len(out); i++ {
-		var p G1ProjectivePoint
-		p.SetZero()
-
-		out[i] = p
-	}
-
-	outC := (*C.BW6761_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BW6761_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_cuda_bw6_761(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bw6_761 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G2Point, batchSize)
-
-	outC := (*C.BW6761_g2_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.BW6761_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_g2_cuda_bw6_761(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_bw6_761 returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BW6761_projective_t)(d_out)
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	pointsC := (*C.BW6761_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.BW6761_g2_projective_t)(d_out)
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	pointsC := (*C.BW6761_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_g2_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BW6761_projective_t)(d_out)
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	pointsC := (*C.BW6761_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.commit_batch_cuda_bw6_761(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.BW6761_g2_projective_t)(d_out)
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	pointsC := (*C.BW6761_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.msm_batch_g2_cuda_bw6_761(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/curves/bw6761/msm_test.go
+++ b/goicicle/curves/bw6761/msm_test.go
@@ -1,367 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"fmt"
-	"math"
-	"testing"
-	"time"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-	"github.com/stretchr/testify/assert"
-)
-
-func GeneratePoints(count int) []G1PointAffine {
-	// Declare a slice of integers
-	var points []G1PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		var pointProjective G1ProjectivePoint
-		pointProjective.Random()
-
-		var pointAffine G1PointAffine
-		pointAffine.FromProjective(&pointProjective)
-
-		points = append(points, pointAffine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func GeneratePointsProj(count int) []G1ProjectivePoint {
-	// Declare a slice of integers
-	var points []G1ProjectivePoint
-	// Use a loop to populate the slice
-	for i := 0; i < count; i++ {
-		var p G1ProjectivePoint
-		p.Random()
-
-		points = append(points, p)
-	}
-
-	return points
-}
-
-func GenerateScalars(count int, skewed bool) []G1ScalarField {
-	// Declare a slice of integers
-	var scalars []G1ScalarField
-
-	var rand G1ScalarField
-	var zero G1ScalarField
-	var one G1ScalarField
-	var randLarge G1ScalarField
-
-	zero.SetZero()
-	one.SetOne()
-	randLarge.Random()
-
-	if skewed && count > 1_200_000 {
-		for i := 0; i < count-1_200_000; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-
-		for i := 0; i < 600_000; i++ {
-			scalars = append(scalars, randLarge)
-		}
-		for i := 0; i < 400_000; i++ {
-			scalars = append(scalars, zero)
-		}
-		for i := 0; i < 200_000; i++ {
-			scalars = append(scalars, one)
-		}
-	} else {
-		for i := 0; i < count; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-	}
-
-	return scalars[:count]
-}
-
-func TestMSM(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G1ProjectivePoint)
-		startTime := time.Now()
-		_, e := Msm(out, points, scalars, 0) // non mont
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		assert.Equal(t, e, nil, "error should be nil")
-
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func TestCommitMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1<<v - 1
-		fmt.Print("Started generating points and scalars\n")
-		points := GeneratePoints(count)
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating points and scalars\n")
-
-		var sizeOutD G1ProjectivePoint
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeOutD)))
-
-		var sizePoints G1PointAffine
-		pointsBytes := count * int(unsafe.Sizeof(sizePoints))
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		var sizeScalar G1ScalarField
-		scalarBytes := count * int(unsafe.Sizeof(sizeScalar))
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := Commit(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G1ProjectivePoint, 1)
-		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, int(unsafe.Sizeof(sizeOutD)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.True(t, outHost[0].IsOnCurve())
-	}
-}
-
-func BenchmarkCommit(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := msmSize * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := msmSize * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
-
-				assert.Equal(b, e, 0, "error should be 0")
-				outHost := make([]G1ProjectivePoint, 1)
-				goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 288)
-				assert.True(b, outHost[0].IsOnCurve())
-				if e != 0 {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestBatchMSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GeneratePoints(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBW6761 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
-
-func BenchmarkMSM(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G1ProjectivePoint)
-				_, e := Msm(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-// G2
-func GenerateG2Points(count int) []G2PointAffine {
-	// Declare a slice of integers
-	var points []G2PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		fmt.Print() // this prevents the test from hanging. TODO: figure out why
-		var p G2Point
-		p.Random()
-		var affine G2PointAffine
-		affine.FromProjective(&p)
-
-		points = append(points, affine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func TestMsmG2BW6761(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G2Point)
-		_, e := MsmG2(out, points, scalars, 0)
-		assert.Equal(t, e, nil, "error should be nil")
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func BenchmarkMsmG2BW6761(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GenerateG2Points(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G2Point)
-				_, e := MsmG2(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestCommitG2MSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		var sizeCheckG2PointAffine G2PointAffine
-		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
-
-		var sizeCheckG2Point G2Point
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
-		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := CommitG2(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G2Point, 1)
-		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.Equal(t, len(outHost), 1)
-		result := outHost[0]
-
-		assert.True(t, result.IsOnCurve())
-	}
-}
-
-func TestBatchG2MSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GenerateG2Points(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatchBW6761 returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
--- a/goicicle/curves/bw6761/ntt.go
+++ b/goicicle/curves/bw6761/ntt.go
@@ -1,222 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
-// #include "ntt.h"
-import "C"
-
-const (
-	NONE = 0
-	DIF  = 1
-	DIT  = 2
-)
-
-func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-
-	ret := C.ntt_cuda_bw6_761(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
-
-	return uint64(ret)
-}
-
-func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
-	scalarsC := (*C.BW6761_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	isInverseC := C.bool(isInverse)
-	batchSizeC := C.uint32_t(batchSize)
-	deviceIdC := C.size_t(deviceId)
-
-	ret := C.ntt_batch_cuda_bw6_761(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
-	valuesC := (*C.BW6761_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-
-	ret := C.ecntt_cuda_bw6_761(valuesC, n, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
-	valuesC := (*C.BW6761_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-	batchSizeC := C.uint32_t(batchSize)
-
-	ret := C.ecntt_batch_cuda_bw6_761(valuesC, n, batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
-	domain_size := C.uint32_t(d_size)
-	logn := C.uint32_t(log_d_size)
-	is_inverse := C.bool(inverse)
-
-	dp := C.build_domain_cuda_bw6_761(domain_size, logn, is_inverse, 0, 0)
-
-	if dp == nil {
-		err = errors.New("nullptr returned from generating twiddles")
-		return unsafe.Pointer(nil), err
-	}
-
-	return unsafe.Pointer(dp), nil
-}
-
-// Reverses d_scalars in-place
-func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	lenC := C.int(len)
-	if success := C.reverse_order_scalars_cuda_bw6_761(scalarsC, lenC, 0, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
-	size_d := size * 48
-	dp, err := goicicle.CudaMalloc(size_d)
-
-	if err != nil {
-		return nil
-	}
-
-	d_out := (*C.BW6761_scalar_t)(dp)
-	scalarsC := (*C.BW6761_scalar_t)(scalars)
-	twiddlesC := (*C.BW6761_scalar_t)(twiddles)
-	cosetPowersC := (*C.BW6761_scalar_t)(cosetPowers)
-	sizeC := C.uint(size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.interpolate_scalars_on_coset_cuda_bw6_761(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
-	} else {
-		ret = C.interpolate_scalars_cuda_bw6_761(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
-	}
-	if ret != 0 {
-		fmt.Print("error interpolating")
-	}
-
-	return unsafe.Pointer(d_out)
-}
-
-func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
-	scalars_outC := (*C.BW6761_scalar_t)(scalars_out)
-	scalarsC := (*C.BW6761_scalar_t)(scalars)
-	twiddlesC := (*C.BW6761_scalar_t)(twiddles)
-	coset_powersC := (*C.BW6761_scalar_t)(coset_powers)
-	sizeC := C.uint(scalars_size)
-	twiddlesC_size := C.uint(twiddles_size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.evaluate_scalars_on_coset_cuda_bw6_761(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
-	} else {
-		ret = C.evaluate_scalars_cuda_bw6_761(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
-	}
-
-	if ret != 0 {
-		fmt.Print("error interpolating")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BW6761_scalar_t)(in1_d)
-	in2_dC := (*C.BW6761_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.add_scalars_cuda_bw6_761(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error adding scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.BW6761_scalar_t)(in1_d)
-	in2_dC := (*C.BW6761_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.sub_scalars_cuda_bw6_761(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error subtracting scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.to_montgomery_scalars_cuda_bw6_761(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.BW6761_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.from_montgomery_scalars_cuda_bw6_761(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BW6761_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_cuda_bw6_761(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.BW6761_g2_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_g2_cuda_bw6_761(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
--- a/goicicle/curves/bw6761/ntt_test.go
+++ b/goicicle/curves/bw6761/ntt_test.go
@@ -1,148 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-import (
-	"fmt"
-	"github.com/stretchr/testify/assert"
-	"reflect"
-	"testing"
-)
-
-func TestNttBW6761Batch(t *testing.T) {
-	count := 1 << 20
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	NttBatch(&nttResult, false, count, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBW6761CompareToGnarkDIF(t *testing.T) {
-	count := 1 << 2
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestINttBW6761CompareToGnarkDIT(t *testing.T) {
-	count := 1 << 3
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, true, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNttBW6761(t *testing.T) {
-	count := 1 << 3
-
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	inttResult := make([]G1ScalarField, len(nttResult))
-	copy(inttResult, nttResult)
-
-	assert.Equal(t, inttResult, nttResult)
-	Ntt(&inttResult, true, 0)
-	assert.Equal(t, inttResult, scalars)
-}
-
-func TestNttBatchBW6761(t *testing.T) {
-	count := 1 << 5
-	batches := 4
-
-	scalars := GenerateScalars(count*batches, false)
-
-	var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
-
-	for i := 0; i < batches; i++ {
-		start := i * count
-		end := (i + 1) * count
-		batch := make([]G1ScalarField, len(scalars[start:end]))
-		copy(batch, scalars[start:end])
-		scalarVecOfVec = append(scalarVecOfVec, batch)
-	}
-
-	nttBatchResult := make([]G1ScalarField, len(scalars))
-	copy(nttBatchResult, scalars)
-
-	NttBatch(&nttBatchResult, false, count, 0)
-
-	var nttResultVecOfVec [][]G1ScalarField
-
-	for i := 0; i < batches; i++ {
-		// Clone the slice
-		clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
-		copy(clone, scalarVecOfVec[i])
-
-		// Add it to the result vector of vectors
-		nttResultVecOfVec = append(nttResultVecOfVec, clone)
-
-		// Call the ntt_bw6_761 function
-		Ntt(&nttResultVecOfVec[i], false, 0)
-	}
-
-	assert.NotEqual(t, nttBatchResult, scalars)
-
-	// Check that the ntt of each vec of scalars is equal to the intt of the specific batch
-	for i := 0; i < batches; i++ {
-		if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
-			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
-		}
-	}
-}
-
-func BenchmarkNTT(b *testing.B) {
-	LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
-
-	for _, logNTTSize := range LOG_NTT_SIZES {
-		nttSize := 1 << logNTTSize
-		b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
-			scalars := GenerateScalars(nttSize, false)
-
-			nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-			copy(nttResult, scalars)
-			for n := 0; n < b.N; n++ {
-				Ntt(&nttResult, false, 0)
-			}
-		})
-	}
-}
--- a/goicicle/curves/bw6761/vec_mod.go
+++ b/goicicle/curves/bw6761/vec_mod.go
@@ -1,42 +0,0 @@
-// Copyright 2023 Ingonyama
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Code generated by Ingonyama DO NOT EDIT
-
-package bw6761
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbw6761
-// #include "ve_mod_mult.h"
-import "C"
-import (
-	"fmt"
-	"unsafe"
-)
-
-func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
-	scalarVec1C := (*C.BW6761_scalar_t)(scalarVec1)
-	scalarVec2C := (*C.BW6761_scalar_t)(scalarVec2)
-	sizeC := C.size_t(size)
-
-	ret := C.vec_mod_mult_device_scalar_bw6_761(scalarVec1C, scalarVec2C, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error multiplying scalar vectors")
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/goicicle.go
+++ b/goicicle/goicicle.go
@@ -1,49 +0,0 @@
-package goicicle
-
-// This file implements CUDA driver context management
-
-// #cgo CFLAGS: -I /usr/local/cuda/include
-// #cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart
-/*
-#include <cuda.h>
-#include <cuda_runtime.h>
-*/
-import "C"
-
-import (
-	"errors"
-	"unsafe"
-)
-
-func CudaMalloc(size int) (dp unsafe.Pointer, err error) {
-	var p C.void
-	dp = unsafe.Pointer(&p)
-	if err := C.cudaMalloc(&dp, C.size_t(size)); err != 0 {
-		return nil, errors.New("could not create memory space")
-	}
-	return dp, nil
-}
-
-func CudaFree(dp unsafe.Pointer) int {
-	if err := C.cudaFree(dp); err != 0 {
-		return -1
-	}
-	return 0
-}
-
-func CudaMemCpyHtoD[T any](dst_d unsafe.Pointer, src []T, size int) int {
-	src_c := unsafe.Pointer(&src[0])
-	if err := C.cudaMemcpy(dst_d, src_c, C.size_t(size), 1); err != 0 {
-		return -1
-	}
-	return 0
-}
-
-func CudaMemCpyDtoH[T any](dst []T, src_d unsafe.Pointer, size int) int {
-	dst_c := unsafe.Pointer(&dst[0])
-
-	if err := C.cudaMemcpy(dst_c, src_d, C.size_t(size), 2); err != 0 {
-		return -1
-	}
-	return 0
-}
--- a/goicicle/setup.sh
+++ b/goicicle/setup.sh
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-SUDO=''
-if [ "$EUID" != 0 ]; then 
-  echo "Icicle setup script should be run with root privileges, please run this as root"
-  SUDO='sudo'
-fi
-
-
-TARGET_BN254="libbn254.so"
-TARGET_BLS12_381="libbls12_381.so"
-TARGET_BLS12_377="libbls12_377.so"
-TARGET_BW6_671="libbw6_671.so"
-
-MAKE_FAIL=0
-
-$SUDO make $1 || MAKE_FAIL=1
-
-if [ $MAKE_FAIL != 0 ]; then
-    echo "make failed, install dependencies and re-run setup script with root privileges"
-    exit
-fi
-
-TARGET_BN254_PATH=$(dirname "$(find `pwd` -name $TARGET_BN254 -print -quit)")/
-TARGET_BLS12_381_PATH=$(dirname "$(find `pwd` -name $TARGET_BLS12_381 -print -quit)")/
-TARGET_BLS12_377_PATH=$(dirname "$(find `pwd` -name $TARGET_BLS12_377 -print -quit)")/
-TARGET_BW6_671_PATH=$(dirname "$(find `pwd` -name $TARGET_BW6_671 -print -quit)")/
-
-
-if [[ "$TARGET_BLS12_377_PATH" != "" ]]; then
-    echo "BLS12_377 found @ $TARGET_BLS12_377_PATH"
-    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BLS12_377_PATH
-fi
-
-if [[ "$TARGET_BN254_PATH" != "" ]]; then
-    echo "BN254 found @ $TARGET_BN254_PATH"
-    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BN254_PATH
-fi
-
-if [[ "$TARGET_BLS12_381_PATH" != "" ]]; then
-    echo "BLS12_381 found @ $TARGET_BLS12_381_PATH"
-    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BLS12_381_PATH
-fi
-
-if [[ "$TARGET_BW6_671_PATH" != "" ]]; then
-    echo "BW6_671 found @ $TARGET_BW6_671_PATH"
-    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$TARGET_BW6_671_PATH
-fi
--- a/goicicle/templates/curves/curves.go
+++ b/goicicle/templates/curves/curves.go
@@ -1,52 +0,0 @@
-package config
-
-// {{.SharedLib}}
-type Curve struct {
-	PackageName        string
-	CurveNameUpperCase string
-	CurveNameLowerCase string
-	SharedLib          string
-	ScalarSize         int
-	BaseSize           int
-	G2ElementSize      int
-}
-
-var BW6_761 = Curve{
-	PackageName:        "bw6761",
-	CurveNameUpperCase: "BW6761",
-	CurveNameLowerCase: "bw6_761",
-	SharedLib:          "-lbw6761",
-	ScalarSize:         12,
-	BaseSize:           24,
-	G2ElementSize:      6,
-}
-
-var BN_254 = Curve{
-	PackageName:        "bn254",
-	CurveNameUpperCase: "BN254",
-	CurveNameLowerCase: "bn254",
-	SharedLib:          "-lbn254",
-	ScalarSize:         8,
-	BaseSize:           8,
-	G2ElementSize:      4,
-}
-
-var BLS_12_377 = Curve{
-	PackageName:        "bls12377",
-	CurveNameUpperCase: "BLS12_377",
-	CurveNameLowerCase: "bls12_377",
-	SharedLib:          "-lbls12_377",
-	ScalarSize:         8,
-	BaseSize:           12,
-	G2ElementSize:      6,
-}
-
-var BLS_12_381 = Curve{
-	PackageName:        "bls12381",
-	CurveNameUpperCase: "BLS12_381",
-	CurveNameLowerCase: "bls12_381",
-	SharedLib:          "-lbls12_381",
-	ScalarSize:         8,
-	BaseSize:           12,
-	G2ElementSize:      6,
-}
--- a/goicicle/templates/curves/g1.go.tmpl
+++ b/goicicle/templates/curves/g1.go.tmpl
@@ -1,310 +0,0 @@
-import (
-	"unsafe"
-
-	"encoding/binary"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}}
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-const SCALAR_SIZE = {{.ScalarSize}}
-const BASE_SIZE = {{.BaseSize}}
-
-type G1ScalarField struct {
-	S [SCALAR_SIZE]uint32
-}
-
-type G1BaseField struct {
-	S [BASE_SIZE]uint32
-}
-
-/*
- * BaseField Constructors
- */
-
-func (f *G1BaseField) SetZero() *G1BaseField {
-	var S [BASE_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1BaseField) SetOne() *G1BaseField {
-	var S [BASE_SIZE]uint32
-
-	S[0] = 1
-
-	f.S = S
-	return f
-}
-
-func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
-	out := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(p))
-	in := (*C.{{.CurveNameUpperCase}}_affine_t)(unsafe.Pointer(affine))
-
-	C.projective_from_affine_{{.CurveNameLowerCase}}(out, in)
-
-	return p
-}
-
-func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
-	copy(f.S[:], limbs[:])
-
-	return f
-}
-
-/*
- * BaseField methods
- */
-
-func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1BaseField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * ScalarField methods
- */
-
-func (p *G1ScalarField) Random() *G1ScalarField {
-	outC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(p))
-	C.random_scalar_{{.CurveNameLowerCase}}(outC)
-
-	return p
-}
-
-func (f *G1ScalarField) SetZero() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	f.S = S
-
-	return f
-}
-
-func (f *G1ScalarField) SetOne() *G1ScalarField {
-	var S [SCALAR_SIZE]uint32
-	S[0] = 1
-	f.S = S
-
-	return f
-}
-
-func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
-	for i, v := range a.S {
-		if b.S[i] != v {
-			return false
-		}
-	}
-	return true
-}
-
-/*
- * ScalarField methods
- */
-
-func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
-	return f.S
-}
-
-func (f *G1ScalarField) ToBytesLe() []byte {
-	bytes := make([]byte, len(f.S)*4)
-	for i, v := range f.S {
-		binary.LittleEndian.PutUint32(bytes[i*4:], v)
-	}
-
-	return bytes
-}
-
-/*
- * Point{{.CurveNameUpperCase}}
- */
-
-type G1ProjectivePoint struct {
-	X, Y, Z G1BaseField
-}
-
-func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
-	var yOne G1BaseField
-	yOne.SetOne()
-
-	var xZero G1BaseField
-	xZero.SetZero()
-
-	var zZero G1BaseField
-	zZero.SetZero()
-
-	f.X = xZero
-	f.Y = yOne
-	f.Z = zZero
-
-	return f
-}
-
-func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
-	// Cast *Point{{.CurveNameUpperCase}} to *C.{{.CurveNameUpperCase}}_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It'S your responsibility to ensure that the types are compatible.
-	pC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it'S fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_{{.CurveNameLowerCase}}(pC, pCompareC))
-}
-
-func (p *G1ProjectivePoint) IsOnCurve() bool {
-	point := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(p))
-	res := C.projective_is_on_curve_{{.CurveNameLowerCase}}(point)
-
-	return bool(res)
-}
-
-func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
-	outC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(p))
-	C.random_projective_{{.CurveNameLowerCase}}(outC)
-
-	return p
-}
-
-func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
-	return &G1PointAffine{
-		X: p.X,
-		Y: p.Y,
-	}
-}
-
-func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
-	var _x G1BaseField
-	var _y G1BaseField
-	var _z G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(x))
-	_y.FromLimbs(GetFixedLimbs(y))
-	_z.FromLimbs(GetFixedLimbs(z))
-
-	p.X = _x
-	p.Y = _y
-	p.Z = _z
-
-	return p
-}
-
-/*
- * PointAffineNoInfinity{{.CurveNameUpperCase}}
- */
-
-type G1PointAffine struct {
-	X, Y G1BaseField
-}
-
-func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
-	in := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(projective))
-	out := (*C.{{.CurveNameUpperCase}}_affine_t)(unsafe.Pointer(p))
-
-	C.projective_to_affine_{{.CurveNameLowerCase}}(out, in)
-
-	return p
-}
-
-func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
-	var Z G1BaseField
-	Z.SetOne()
-
-	return &G1ProjectivePoint{
-		X: p.X,
-		Y: p.Y,
-		Z: Z,
-	}
-}
-
-func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
-	var _x G1BaseField
-	var _y G1BaseField
-
-	_x.FromLimbs(GetFixedLimbs(X))
-	_y.FromLimbs(GetFixedLimbs(Y))
-
-	p.X = _x
-	p.Y = _y
-
-	return p
-}
-
-/*
- * Multiplication
- */
-
-func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	pointsC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(&a[0]))
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_point_{{.CurveNameLowerCase}}(pointsC, scalarsC, nElementsC, deviceIdC)
-}
-
-func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	if len(a) != len(b) {
-		panic("a and b have different lengths")
-	}
-
-	aC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&b[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.vec_mod_mult_scalar_{{.CurveNameLowerCase}}(aC, bC, nElementsC, deviceIdC)
-}
-
-// Multiply a matrix by a scalar:
-//
-//	`a` - flattenned matrix;
-//	`b` - vector to multiply `a` by;
-func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
-	c := make([]G1ScalarField, len(b))
-	for i := range c {
-		var p G1ScalarField
-		p.SetZero()
-
-		c[i] = p
-	}
-
-	aC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&a[0]))
-	bC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&b[0]))
-	cC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&c[0]))
-	deviceIdC := C.size_t(deviceID)
-	nElementsC := C.size_t(len(a))
-
-	C.matrix_vec_mod_mult_{{.CurveNameLowerCase}}(aC, bC, cC, nElementsC, deviceIdC)
-}
-
-/*
- * Utils
- */
-
-func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
-	if len(*slice) <= BASE_SIZE {
-		limbs := [BASE_SIZE]uint32{}
-		copy(limbs[:len(*slice)], *slice)
-		return limbs
-	}
-
-	panic("slice has too many elements")
-}
--- a/goicicle/templates/curves/g1_test.go.tmpl
+++ b/goicicle/templates/curves/g1_test.go.tmpl
@@ -1,180 +0,0 @@
-import (
-	"encoding/binary"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewField{{.CurveNameUpperCase}}One(t *testing.T) {
-	var oneField G1BaseField
-	oneField.SetOne()
-
-	rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, oneField.S, rawOneField)
-}
-
-func TestNewField{{.CurveNameUpperCase}}Zero(t *testing.T) {
-	var zeroField G1BaseField
-	zeroField.SetZero()
-
-	rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
-
-	assert.Equal(t, zeroField.S, rawZeroField)
-}
-
-func TestField{{.CurveNameUpperCase}}ToBytesLe(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
-	for i, v := range p.X.S {
-		binary.LittleEndian.PutUint32(expected[i*4:], v)
-	}
-
-	assert.Equal(t, p.X.ToBytesLe(), expected)
-	assert.Equal(t, len(p.X.ToBytesLe()), 32)
-}
-
-func TestNewPoint{{.CurveNameUpperCase}}Zero(t *testing.T) {
-	var pointZero G1ProjectivePoint
-	pointZero.SetZero()
-
-	var baseOne G1BaseField
-	baseOne.SetOne()
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, pointZero.X, zeroSanity)
-	assert.Equal(t, pointZero.Y, baseOne)
-	assert.Equal(t, pointZero.Z, zeroSanity)
-}
-
-func TestFromProjectiveToAffine(t *testing.T) {
-	var projective G1ProjectivePoint
-	var affine G1PointAffine
-
-	projective.Random()
-
-	affine.FromProjective(&projective)
-	var projective2 G1ProjectivePoint
-	projective2.FromAffine(&affine)
-
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func Test{{.CurveNameUpperCase}}Eq(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-	var p2 G1ProjectivePoint
-	p2.Random()
-
-	assert.Equal(t, p1.Eq(&p1), true)
-	assert.Equal(t, p1.Eq(&p2), false)
-}
-
-func Test{{.CurveNameUpperCase}}StripZ(t *testing.T) {
-	var p1 G1ProjectivePoint
-	p1.Random()
-
-	p2ZLess := p1.StripZ()
-
-	assert.IsType(t, G1PointAffine{}, *p2ZLess)
-	assert.Equal(t, p1.X, p2ZLess.X)
-	assert.Equal(t, p1.Y, p2ZLess.Y)
-}
-
-func TestPoint{{.CurveNameUpperCase}}fromLimbs(t *testing.T) {
-	var p G1ProjectivePoint
-	p.Random()
-
-	x := p.X.Limbs()
-	y := p.Y.Limbs()
-	z := p.Z.Limbs()
-
-	xSlice := x[:]
-	ySlice := y[:]
-	zSlice := z[:]
-
-	var pFromLimbs G1ProjectivePoint
-	pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
-
-	assert.Equal(t, pFromLimbs, p)
-}
-
-func TestNewPointAffineNoInfinity{{.CurveNameUpperCase}}Zero(t *testing.T) {
-	var zeroP G1PointAffine
-
-	var zeroSanity G1BaseField
-	zeroSanity.SetZero()
-
-	assert.Equal(t, zeroP.X, zeroSanity)
-	assert.Equal(t, zeroP.Y, zeroSanity)
-}
-
-func TestPointAffineNoInfinity{{.CurveNameUpperCase}}FromLimbs(t *testing.T) {
-	// Initialize your test values
-	x := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-	y := [8]uint32{9, 10, 11, 12, 13, 14, 15, 16}
-	xSlice := x[:]
-	ySlice := y[:]
-
-	// Execute your function
-	var result G1PointAffine
-	result.FromLimbs(&xSlice, &ySlice)
-
-	var xBase G1BaseField
-	var yBase G1BaseField
-	xBase.FromLimbs(x)
-	yBase.FromLimbs(y)
-
-	// Define your expected result
-	expected := G1PointAffine{
-		X: xBase,
-		Y: yBase,
-	}
-
-	// Test if result is as expected
-	assert.Equal(t, expected, result)
-}
-
-func TestGetFixedLimbs(t *testing.T) {
-	t.Run("case of valid input of length less than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of valid input of length 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
-		expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of empty input", func(t *testing.T) {
-		slice := []uint32{}
-		expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}
-
-		result := GetFixedLimbs(&slice)
-		assert.Equal(t, result, expected)
-	})
-
-	t.Run("case of input length greater than 8", func(t *testing.T) {
-		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
-
-		defer func() {
-			if r := recover(); r == nil {
-				t.Errorf("the code did not panic")
-			}
-		}()
-
-		GetFixedLimbs(&slice)
-	})
-}
--- a/goicicle/templates/curves/g2.go.tmpl
+++ b/goicicle/templates/curves/g2.go.tmpl
@@ -1,85 +0,0 @@
-import (
-	"encoding/binary"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}}
-// #include "projective.h"
-// #include "ve_mod_mult.h"
-import "C"
-
-// G2 extension field
-
-type G2Element [{{.G2ElementSize}}]uint64
-
-type ExtentionField struct {
-	A0, A1 G2Element
-}
-
-type G2PointAffine struct {
-	X, Y ExtentionField
-}
-
-type G2Point struct {
-	X, Y, Z ExtentionField
-}
-
-func (p *G2Point) Random() *G2Point {
-	outC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
-	C.random_g2_projective_{{.CurveNameLowerCase}}(outC)
-
-	return p
-}
-
-
-func (p *G2Point) Eq(pCompare *G2Point) bool {
-	// Cast *Point{{.CurveNameUpperCase}} to *C.{{.CurveNameUpperCase}}_projective_t
-	// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
-	// between different pointer types.
-	// It's your responsibility to ensure that the types are compatible.
-	pC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
-	pCompareC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(pCompare))
-
-	// Call the C function
-	// The C function doesn't keep any references to the data,
-	// so it's fine if the Go garbage collector moves or deletes the data later.
-	return bool(C.eq_g2_{{.CurveNameLowerCase}}(pC, pCompareC))
-}
-
-func (f *G2Element) ToBytesLe() []byte {
-	var bytes []byte
-	for _, val := range f {
-		buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
-		binary.LittleEndian.PutUint64(buf, val)
-		bytes = append(bytes, buf...)
-	}
-	return bytes
-}
-
-func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
-	out := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
-	in := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(affine))
-
-	C.g2_projective_from_affine_{{.CurveNameLowerCase}}(out, in)
-
-	return p
-}
-
-func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
-	out := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(p))
-	in := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(projective))
-
-	C.g2_projective_to_affine_{{.CurveNameLowerCase}}(out, in)
-
-	return p
-}
-
-func (p *G2Point) IsOnCurve() bool {
-	// Directly copy memory from the C struct to the Go struct
-	point := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(p))
-	res := C.g2_projective_is_on_curve_{{.CurveNameLowerCase}}(point)
-
-	return bool(res)
-}
--- a/goicicle/templates/curves/g2_test.go.tmpl
+++ b/goicicle/templates/curves/g2_test.go.tmpl
@@ -1,61 +0,0 @@
-import (
-	"fmt"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestG2Eqg2(t *testing.T) {
-	var point G2Point
-
-	point.Random()
-
-	assert.True(t, point.Eq(&point))
-}
-
-func TestG2FromProjectiveToAffine(t *testing.T) {
-	var projective G2Point
-	projective.Random()
-
-	var affine G2PointAffine
-	affine.FromProjective(&projective)
-
-	var projective2 G2Point
-	projective2.FromAffine(&affine)
-	
-	assert.True(t, projective.IsOnCurve())
-	assert.True(t, projective2.IsOnCurve())
-	assert.True(t, projective.Eq(&projective2))
-}
-
-func TestG2Eqg2NotEqual(t *testing.T) {
-	var point G2Point
-	point.Random()
-
-	var point2 G2Point
-	point2.Random()
-
-	assert.False(t, point.Eq(&point2))
-}
-
-func TestG2ToBytes(t *testing.T) {
-	element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
-	bytes := element.ToBytesLe()
-
-	assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1})
-}
-
-func TestG2ShouldConvertToProjective(t *testing.T) {
-	fmt.Print() // this prevents the test from hanging. TODO: figure out why
-	var pointProjective G2Point
-	pointProjective.Random()
-	
-	var pointAffine G2PointAffine
-	pointAffine.FromProjective(&pointProjective)
-
-	var proj G2Point
-	proj.FromAffine(&pointAffine)
-
-	assert.True(t, proj.IsOnCurve())
-	assert.True(t, pointProjective.Eq(&proj))
-}
--- a/goicicle/templates/hfiles/msm.h.tmpl
+++ b/goicicle/templates/hfiles/msm.h.tmpl
@@ -1,84 +0,0 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdbool.h>
-// msm.h
-
-#ifndef _{{.CurveNameUpperCase}}_MSM_H
-#define _{{.CurveNameUpperCase}}_MSM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of {{.CurveNameUpperCase}} projective and affine structs
-typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
-typedef struct {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
-typedef struct {{.CurveNameUpperCase}}_affine_t {{.CurveNameUpperCase}}_affine_t;
-typedef struct {{.CurveNameUpperCase}}_g2_affine_t {{.CurveNameUpperCase}}_g2_affine_t;
-typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;
-typedef cudaStream_t CudaStream_t;
-
-int msm_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* out, {{.CurveNameUpperCase}}_affine_t* points, {{.CurveNameUpperCase}}_scalar_t* scalars, size_t count, size_t device_id);
-
-int msm_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* out,
-  {{.CurveNameUpperCase}}_affine_t* points,
-  {{.CurveNameUpperCase}}_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-  {{.CurveNameUpperCase}}_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-  {{.CurveNameUpperCase}}_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id);
-
-int msm_g2_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_g2_projective_t* out,
-  {{.CurveNameUpperCase}}_g2_affine_t* points,
-  {{.CurveNameUpperCase}}_scalar_t* scalars,
-  size_t count,
-  size_t device_id);
-
-int msm_batch_g2_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_g2_projective_t* out,
-  {{.CurveNameUpperCase}}_g2_affine_t* points,
-  {{.CurveNameUpperCase}}_scalar_t* scalars,
-  size_t batch_size,
-  size_t msm_size,
-  size_t device_id);
-
-int commit_g2_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_g2_projective_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-  {{.CurveNameUpperCase}}_g2_affine_t* d_points,
-  size_t count,
-  unsigned large_bucket_factor,
-  size_t device_id);
-
-int commit_batch_g2_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_g2_projective_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_scalars,
-  {{.CurveNameUpperCase}}_g2_affine_t* d_points,
-  size_t count,
-  size_t batch_size,
-  size_t device_id,
-  cudaStream_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _{{.CurveNameUpperCase}}_MSM_H */
--- a/goicicle/templates/hfiles/ntt.h.tmpl
+++ b/goicicle/templates/hfiles/ntt.h.tmpl
@@ -1,181 +0,0 @@
-#include <cuda.h>
-#include <stdbool.h>
-// ntt.h
-
-#ifndef _{{.CurveNameUpperCase}}_NTT_H
-#define _{{.CurveNameUpperCase}}_NTT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Incomplete declaration of {{.CurveNameUpperCase}} projective and affine structs
-typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
-typedef struct {{.CurveNameUpperCase}}_affine_t {{.CurveNameUpperCase}}_affine_t;
-typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;
-
-typedef struct {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
-typedef struct {{.CurveNameUpperCase}}_g2_affine_t {{.CurveNameUpperCase}}_g2_affine_t;
-
-int ntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ntt_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-int ecntt_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
-int ecntt_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
-
-{{.CurveNameUpperCase}}_scalar_t* 
-build_domain_cuda_{{.CurveNameLowerCase}}(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
-
-int interpolate_scalars_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int interpolate_scalars_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int interpolate_points_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int interpolate_scalars_batch_on_coset_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_evaluations,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned n,
-  unsigned batch_size,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-
-int evaluate_scalars_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  size_t device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  unsigned device_id,
-  size_t stream);
-int evaluate_scalars_on_coset_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out,
-  {{.CurveNameUpperCase}}_scalar_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-int evaluate_points_on_coset_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* d_out,
-  {{.CurveNameUpperCase}}_projective_t* d_coefficients,
-  {{.CurveNameUpperCase}}_scalar_t* d_domain,
-  unsigned domain_size,
-  unsigned n,
-  unsigned batch_size,
-  {{.CurveNameUpperCase}}_scalar_t* coset_powers,
-  size_t device_id,
-  size_t stream);
-
-int reverse_order_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_scalars_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int reverse_order_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* arr, int n, size_t device_id, size_t stream);
-int reverse_order_points_batch_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
-int add_scalars_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
-int sub_scalars_cuda_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* d_out, {{.CurveNameUpperCase}}_scalar_t* d_in1, {{.CurveNameUpperCase}}_scalar_t* d_in2, unsigned n, size_t stream);
-int to_montgomery_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_scalars_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* d_inout, unsigned n, size_t stream);
-
-// points g1
-int to_montgomery_proj_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t* d_inout, unsigned n, size_t stream);
-
-// points g2
-int to_montgomery_proj_points_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_proj_points_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* d_inout, unsigned n, size_t stream);
-int to_montgomery_aff_points_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_affine_t* d_inout, unsigned n, size_t stream);
-int from_montgomery_aff_points_g2_cuda_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_affine_t* d_inout, unsigned n, size_t stream);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _{{.CurveNameUpperCase}}_NTT_H */
--- a/goicicle/templates/hfiles/projective.h.tmpl
+++ b/goicicle/templates/hfiles/projective.h.tmpl
@@ -1,33 +0,0 @@
-#include <cuda.h>
-#include <stdbool.h>
-// projective.h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
-typedef struct {{.CurveNameUpperCase}}_g2_projective_t {{.CurveNameUpperCase}}_g2_projective_t;
-typedef struct {{.CurveNameUpperCase}}_affine_t {{.CurveNameUpperCase}}_affine_t;
-typedef struct {{.CurveNameUpperCase}}_g2_affine_t {{.CurveNameUpperCase}}_g2_affine_t;
-typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;
-
-bool projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* point1);
-
-int random_scalar_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_scalar_t* out); 
-int random_projective_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* out);
-{{.CurveNameUpperCase}}_projective_t* projective_zero_{{.CurveNameLowerCase}}();
-int projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_affine_t* out, {{.CurveNameUpperCase}}_projective_t* point1);
-int projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* out, {{.CurveNameUpperCase}}_affine_t* point1);
-
-int random_g2_projective_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* out);
-int g2_projective_to_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_affine_t* out, {{.CurveNameUpperCase}}_g2_projective_t* point1);
-int g2_projective_from_affine_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* out, {{.CurveNameUpperCase}}_g2_affine_t* point1);
-bool g2_projective_is_on_curve_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* point1);
-
-bool eq_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_projective_t* point1, {{.CurveNameUpperCase}}_projective_t* point2);
-bool eq_g2_{{.CurveNameLowerCase}}({{.CurveNameUpperCase}}_g2_projective_t* point1, {{.CurveNameUpperCase}}_g2_projective_t* point2);
-
-#ifdef __cplusplus
-}
-#endif
--- a/goicicle/templates/hfiles/ve_mod_mult.h.tmpl
+++ b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl
@@ -1,32 +0,0 @@
-#include <stdbool.h>
-#include <cuda.h>
-// ve_mod_mult.h
-
-#ifndef _{{.CurveNameUpperCase}}_VEC_MULT_H
-#define _{{.CurveNameUpperCase}}_VEC_MULT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {{.CurveNameUpperCase}}_projective_t {{.CurveNameUpperCase}}_projective_t;
-typedef struct {{.CurveNameUpperCase}}_scalar_t {{.CurveNameUpperCase}}_scalar_t;
-
-int32_t vec_mod_mult_point_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_projective_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_scalar_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elments, size_t device_id);
-int32_t vec_mod_mult_device_scalar_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* inout, {{.CurveNameUpperCase}}_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
-int32_t matrix_vec_mod_mult_{{.CurveNameLowerCase}}(
-  {{.CurveNameUpperCase}}_scalar_t* matrix_flattened,
-  {{.CurveNameUpperCase}}_scalar_t* input,
-  {{.CurveNameUpperCase}}_scalar_t* output,
-  size_t n_elments,
-  size_t device_id);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _{{.CurveNameUpperCase}}_VEC_MULT_H */
--- a/goicicle/templates/main.go
+++ b/goicicle/templates/main.go
@@ -1,312 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-
-	"github.com/consensys/bavard"
-	config "github.com/ingonyama-zk/icicle/goicicle/templates/curves"
-)
-
-const (
-	copyrightHolder = "Ingonyama"
-	generatedBy     = "Ingonyama"
-	copyrightYear   = 2023
-	baseDir         = "../curves/"
-)
-
-var bgen = bavard.NewBatchGenerator(copyrightHolder, copyrightYear, generatedBy)
-
-func genMainFiles() {
-	bn254_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "g1.go"), Templates: []string{"g1.go.tmpl"}},
-	}
-
-	bls12377_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "g1.go"), Templates: []string{"g1.go.tmpl"}},
-	}
-
-	bls12381_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "g1.go"), Templates: []string{"g1.go.tmpl"}},
-	}
-
-	bw6761_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "g1.go"), Templates: []string{"g1.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries...))
-
-	bn254_g2_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "g2.go"), Templates: []string{"g2.go.tmpl"}},
-	}
-
-	bls12377_g2_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "g2.go"), Templates: []string{"g2.go.tmpl"}},
-	}
-
-	bls12381_g2_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "g2.go"), Templates: []string{"g2.go.tmpl"}},
-	}
-
-	bw6761_g2_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "g2.go"), Templates: []string{"g2.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_g2_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_g2_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_g2_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_g2_entries...))
-
-	bn254_msm_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "msm.go"), Templates: []string{"msm.go.tmpl"}},
-	}
-
-	bls12377_msm_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "msm.go"), Templates: []string{"msm.go.tmpl"}},
-	}
-
-	bls12381_msm_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "msm.go"), Templates: []string{"msm.go.tmpl"}},
-	}
-
-	bw6761_msm_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "msm.go"), Templates: []string{"msm.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_msm_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_msm_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_msm_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./msm/", bw6761_msm_entries...))
-
-	bn254_ntt_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
-	}
-
-	bls12377_ntt_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
-	}
-
-	bls12381_ntt_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
-	}
-
-	bw6761_ntt_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "ntt.go"), Templates: []string{"ntt.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_ntt_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_ntt_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_ntt_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ntt/", bw6761_ntt_entries...))
-
-	bn254_vec_mod_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
-	}
-
-	bls12377_vec_mod_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
-	}
-
-	bls12381_vec_mod_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
-	}
-
-	bw6761_vec_mod_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "vec_mod.go"), Templates: []string{"vec_mod.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ops/", bls12377_vec_mod_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ops/", bn254_vec_mod_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ops/", bls12381_vec_mod_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ops/", bw6761_vec_mod_entries...))
-
-	h_msm_bn254 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
-	}
-
-	h_msm_bls12377 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
-	}
-
-	h_msm_bls12381 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
-	}
-
-	h_msm_bw6761 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "include", "msm.h"), Templates: []string{"msm.h.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_msm_bls12377...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_msm_bn254...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_msm_bls12381...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", h_msm_bw6761...))
-
-	h_ntt_bn254 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
-	}
-
-	h_ntt_bls12377 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
-	}
-
-	h_ntt_bls12381 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
-	}
-
-	h_ntt_bw6761 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "include", "ntt.h"), Templates: []string{"ntt.h.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_ntt_bls12377...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_ntt_bn254...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_ntt_bls12381...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", h_ntt_bw6761...))
-
-	ve_mod_mult_h_bn254 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
-	}
-
-	ve_mod_mult_h_bls12377 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
-	}
-
-	ve_mod_mult_ht_bls12381 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
-	}
-
-	ve_mod_mult_ht_bw6761 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "include", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", ve_mod_mult_h_bls12377...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", ve_mod_mult_h_bn254...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", ve_mod_mult_ht_bls12381...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", ve_mod_mult_ht_bw6761...))
-
-	projective_bn254 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
-	}
-
-	projective_bls12377 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
-	}
-
-	projective_bls12381 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
-	}
-
-	projective_bw6761 := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "include", "projective.h"), Templates: []string{"projective.h.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", projective_bls12377...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", projective_bn254...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", projective_bls12381...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./hfiles/", projective_bw6761...))
-}
-
-func genTestFiles() {
-	// G1 TESTS
-	bn254_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
-	}
-
-	bls12377_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
-	}
-
-	bls12381_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
-	}
-
-	bw6761_entries := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "g1_test.go"), Templates: []string{"g1_test.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries...))
-
-	// G2 TESTS
-	bn254_entries_g2_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
-	}
-
-	bls12377_entries_g2_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
-	}
-
-	bls12381_entries_g2_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
-	}
-
-	bw6761_entries_g2_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "g2_test.go"), Templates: []string{"g2_test.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries_g2_test...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries_g2_test...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries_g2_test...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./curves/", bw6761_entries_g2_test...))
-
-	// MSM TEST
-	bn254_entries_msm_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
-	}
-
-	bls12377_entries_msm_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
-	}
-
-	bls12381_entries_msm_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
-	}
-
-	bw6761_entries_msm_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "msm_test.go"), Templates: []string{"msm_test.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_entries_msm_test...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_entries_msm_test...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_entries_msm_test...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./msm/", bw6761_entries_msm_test...))
-
-	// FFT TEST
-	bn254_entries_fft_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bn254", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
-	}
-
-	bls12377_entries_fft_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12377", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
-	}
-
-	bls12381_entries_fft_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bls12381", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
-	}
-
-	bw6761_entries_msm_test_entries_fft_test := []bavard.Entry{
-		{File: filepath.Join(baseDir, "bw6761", "ntt_test.go"), Templates: []string{"ntt_test.go.tmpl"}},
-	}
-
-	assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_entries_fft_test...))
-	assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_entries_fft_test...))
-	assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_entries_fft_test...))
-	assertNoError(bgen.Generate(config.BW6_761, config.BW6_761.PackageName, "./ntt/", bw6761_entries_msm_test_entries_fft_test...))
-}
-
-func main() {
-	genMainFiles()
-	genTestFiles()
-}
-
-func assertNoError(err error) {
-	if err != nil {
-		fmt.Printf("\n%s\n", err.Error())
-		os.Exit(-1)
-	}
-}
--- a/goicicle/templates/msm/msm.go.tmpl
+++ b/goicicle/templates/msm/msm.go.tmpl
@@ -1,191 +0,0 @@
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}}
-// #include "msm.h"
-import "C"
-
-func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.{{.CurveNameUpperCase}}_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(out))
-	ret := C.msm_cuda_{{.CurveNameLowerCase}}(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_cuda_{{.CurveNameLowerCase}} returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
-	if len(points) != len(scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	pointsC := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(&points[0]))
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&scalars[0]))
-	outC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(out))
-
-	ret := C.msm_g2_cuda_{{.CurveNameLowerCase}}(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
-
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_g2_cuda_{{.CurveNameLowerCase}} returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G1ProjectivePoint, batchSize)
-
-	for i := 0; i < len(out); i++ {
-		var p G1ProjectivePoint
-		p.SetZero()
-
-		out[i] = p
-	}
-
-	outC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.{{.CurveNameUpperCase}}_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_cuda_{{.CurveNameLowerCase}}(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_{{.CurveNameLowerCase}} returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
-	// Check for nil pointers
-	if points == nil || scalars == nil {
-		return nil, errors.New("points or scalars is nil")
-	}
-
-	if len(*points) != len(*scalars) {
-		return nil, errors.New("error on: len(points) != len(scalars)")
-	}
-
-	// Check for empty slices
-	if len(*points) == 0 || len(*scalars) == 0 {
-		return nil, errors.New("points or scalars is empty")
-	}
-
-	// Check for zero batchSize
-	if batchSize <= 0 {
-		return nil, errors.New("error on: batchSize must be greater than zero")
-	}
-
-	out := make([]G2Point, batchSize)
-
-	outC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(unsafe.Pointer(&out[0]))
-	pointsC := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	msmSizeC := C.size_t(len(*points) / batchSize)
-	deviceIdC := C.size_t(deviceId)
-	batchSizeC := C.size_t(batchSize)
-
-	ret := C.msm_batch_g2_cuda_{{.CurveNameLowerCase}}(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
-	if ret != 0 {
-		return nil, fmt.Errorf("msm_batch_cuda_{{.CurveNameLowerCase}} returned error code: %d", ret)
-	}
-
-	return out, nil
-}
-
-func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.{{.CurveNameUpperCase}}_projective_t)(d_out)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	pointsC := (*C.{{.CurveNameUpperCase}}_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_cuda_{{.CurveNameLowerCase}}(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
-	d_outC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(d_out)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	pointsC := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	largeBucketFactorC := C.uint(bucketFactor)
-
-	ret := C.commit_g2_cuda_{{.CurveNameLowerCase}}(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.{{.CurveNameUpperCase}}_projective_t)(d_out)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	pointsC := (*C.{{.CurveNameUpperCase}}_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.commit_batch_cuda_{{.CurveNameLowerCase}}(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
-
-func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
-	d_outC := (*C.{{.CurveNameUpperCase}}_g2_projective_t)(d_out)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	pointsC := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(d_points)
-	countC := (C.size_t)(count)
-	batch_sizeC := (C.size_t)(batch_size)
-
-	ret := C.msm_batch_g2_cuda_{{.CurveNameLowerCase}}(d_outC, pointsC, scalarsC, countC, batch_sizeC, 0)
-
-	if ret != 0 {
-		return -1
-	}
-
-	return 0
-}
--- a/goicicle/templates/msm/msm_test.go.tmpl
+++ b/goicicle/templates/msm/msm_test.go.tmpl
@@ -1,342 +0,0 @@
-import (
-	"fmt"
-	"math"
-	"testing"
-	"time"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-	"github.com/stretchr/testify/assert"
-)
-
-func GeneratePoints(count int) []G1PointAffine {
-	// Declare a slice of integers
-	var points []G1PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-		var pointProjective G1ProjectivePoint
-		pointProjective.Random()
-
-		var pointAffine G1PointAffine
-		pointAffine.FromProjective(&pointProjective)
-
-		points = append(points, pointAffine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func GeneratePointsProj(count int) []G1ProjectivePoint {
-	// Declare a slice of integers
-	var points []G1ProjectivePoint
-	// Use a loop to populate the slice
-	for i := 0; i < count; i++ {
-		var p G1ProjectivePoint
-		p.Random()
-
-		points = append(points, p)
-	}
-
-	return points
-}
-
-func GenerateScalars(count int, skewed bool) []G1ScalarField {
-	// Declare a slice of integers
-	var scalars []G1ScalarField
-
-	var rand G1ScalarField
-	var zero G1ScalarField
-	var one G1ScalarField
-	var randLarge G1ScalarField
-
-	zero.SetZero()
-	one.SetOne()
-	randLarge.Random()
-
-	if skewed && count > 1_200_000 {
-		for i := 0; i < count-1_200_000; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-
-		for i := 0; i < 600_000; i++ {
-			scalars = append(scalars, randLarge)
-		}
-		for i := 0; i < 400_000; i++ {
-			scalars = append(scalars, zero)
-		}
-		for i := 0; i < 200_000; i++ {
-			scalars = append(scalars, one)
-		}
-	} else {
-		for i := 0; i < count; i++ {
-			rand.Random()
-			scalars = append(scalars, rand)
-		}
-	}
-
-	return scalars[:count]
-}
-
-func TestMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G1ProjectivePoint)
-		startTime := time.Now()
-		_, e := Msm(out, points, scalars, 0) // non mont
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		assert.Equal(t, e, nil, "error should be nil")
-
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func TestCommitMSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1<<v - 1
-
-		points := GeneratePoints(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := count * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := Commit(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G1ProjectivePoint, 1)
-		goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 96)
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.True(t, outHost[0].IsOnCurve())
-	}
-}
-
-func BenchmarkCommit(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-
-		out_d, _ := goicicle.CudaMalloc(96)
-
-		pointsBytes := msmSize * 64
-		points_d, _ := goicicle.CudaMalloc(pointsBytes)
-		goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
-
-		scalarBytes := msmSize * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				e := Commit(out_d, scalars_d, points_d, msmSize, 10)
-
-				if e != 0 {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestBatchMSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GeneratePoints(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatch{{.CurveNameUpperCase}} returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-            for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
-
-func BenchmarkMSM(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GeneratePoints(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G1ProjectivePoint)
-				_, e := Msm(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-// G2
-func GenerateG2Points(count int) []G2PointAffine {
-	// Declare a slice of integers
-	var points []G2PointAffine
-
-	// populate the slice
-	for i := 0; i < 10; i++ {
-        fmt.Print() // this prevents the test from hanging. TODO: figure out why
-		var p G2Point
-		p.Random()
-		var affine G2PointAffine
-		affine.FromProjective(&p)
-
-		points = append(points, affine)
-	}
-
-	log2_10 := math.Log2(10)
-	log2Count := math.Log2(float64(count))
-	log2Size := int(math.Ceil(log2Count - log2_10))
-
-	for i := 0; i < log2Size; i++ {
-		points = append(points, points...)
-	}
-
-	return points[:count]
-}
-
-func TestMsmG2{{.CurveNameUpperCase}}(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		out := new(G2Point)
-		_, e := MsmG2(out, points, scalars, 0)
-		assert.Equal(t, e, nil, "error should be nil")
-		assert.True(t, out.IsOnCurve())
-	}
-}
-
-func BenchmarkMsmG2{{.CurveNameUpperCase}}(b *testing.B) {
-	LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
-
-	for _, logMsmSize := range LOG_MSM_SIZES {
-		msmSize := 1 << logMsmSize
-		points := GenerateG2Points(msmSize)
-		scalars := GenerateScalars(msmSize, false)
-		b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
-			for n := 0; n < b.N; n++ {
-				out := new(G2Point)
-				_, e := MsmG2(out, points, scalars, 0)
-
-				if e != nil {
-					panic("Error occurred")
-				}
-			}
-		})
-	}
-}
-
-func TestCommitG2MSM(t *testing.T) {
-	for _, v := range []int{8} {
-		count := 1 << v
-
-		points := GenerateG2Points(count)
-		fmt.Print("Finished generating points\n")
-		scalars := GenerateScalars(count, false)
-		fmt.Print("Finished generating scalars\n")
-
-		var sizeCheckG2PointAffine G2PointAffine
-		inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
-
-		var sizeCheckG2Point G2Point
-		out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
-		goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
-
-		scalarBytes := count * 32
-		scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
-		goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
-
-		startTime := time.Now()
-		e := CommitG2(out_d, scalars_d, points_d, count, 10)
-		fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
-
-		outHost := make([]G2Point, 1)
-		goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
-
-		assert.Equal(t, e, 0, "error should be 0")
-		assert.Equal(t, len(outHost), 1)
-		result := outHost[0]
-
-		assert.True(t, result.IsOnCurve())
-	}
-}
-
-func TestBatchG2MSM(t *testing.T) {
-	for _, batchPow2 := range []int{2, 4} {
-		for _, pow2 := range []int{4, 6} {
-			msmSize := 1 << pow2
-			batchSize := 1 << batchPow2
-			count := msmSize * batchSize
-
-			points := GenerateG2Points(count)
-			scalars := GenerateScalars(count, false)
-
-			pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
-
-			if e != nil {
-				t.Errorf("MsmBatch{{.CurveNameUpperCase}} returned an error: %v", e)
-			}
-
-			if len(pointsResults) != batchSize {
-				t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
-			}
-
-			for _, s := range pointsResults {
-				assert.True(t, s.IsOnCurve())
-			}
-		}
-	}
-}
--- a/goicicle/templates/ntt/ntt.go.tmpl
+++ b/goicicle/templates/ntt/ntt.go.tmpl
@@ -1,204 +0,0 @@
-import (
-	"errors"
-	"fmt"
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/goicicle"
-)
-
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}}
-// #include "ntt.h"
-import "C"
-
-const (
-	NONE = 0
-	DIF  = 1
-	DIT  = 2
-)
-
-func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-
-	ret := C.ntt_cuda_{{.CurveNameLowerCase}}(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
-
-	return uint64(ret)
-}
-
-func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
-	isInverseC := C.bool(isInverse)
-	batchSizeC := C.uint32_t(batchSize)
-	deviceIdC := C.size_t(deviceId)
-
-	ret := C.ntt_batch_cuda_{{.CurveNameLowerCase}}(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
-	valuesC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-
-	ret := C.ecntt_cuda_{{.CurveNameLowerCase}}(valuesC, n, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
-	valuesC := (*C.{{.CurveNameUpperCase}}_projective_t)(unsafe.Pointer(&(*values)[0]))
-	deviceIdC := C.size_t(deviceId)
-	isInverseC := C.bool(isInverse)
-	n := C.uint32_t(len(*values))
-	batchSizeC := C.uint32_t(batchSize)
-
-	ret := C.ecntt_batch_cuda_{{.CurveNameLowerCase}}(valuesC, n, batchSizeC, isInverseC, deviceIdC)
-
-	return uint64(ret)
-}
-
-func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
-	domain_size := C.uint32_t(d_size)
-	logn := C.uint32_t(log_d_size)
-	is_inverse := C.bool(inverse)
-
-	dp := C.build_domain_cuda_{{.CurveNameLowerCase}}(domain_size, logn, is_inverse, 0, 0)
-
-	if dp == nil {
-		err = errors.New("nullptr returned from generating twiddles")
-		return unsafe.Pointer(nil), err
-	}
-
-	return unsafe.Pointer(dp), nil
-}
-
-// Reverses d_scalars in-place
-func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	lenC := C.int(len)
-	if success := C.reverse_order_scalars_cuda_{{.CurveNameLowerCase}}(scalarsC, lenC, 0, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
-	size_d := size * 32
-	dp, err := goicicle.CudaMalloc(size_d)
-
-	if err != nil {
-		return nil
-	}
-
-	d_out := (*C.{{.CurveNameUpperCase}}_scalar_t)(dp)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(scalars)
-	twiddlesC := (*C.{{.CurveNameUpperCase}}_scalar_t)(twiddles)
-	cosetPowersC := (*C.{{.CurveNameUpperCase}}_scalar_t)(cosetPowers)
-	sizeC := C.uint(size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.interpolate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
-	} else {
-		ret = C.interpolate_scalars_cuda_{{.CurveNameLowerCase}}(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
-	}
-	if ret != 0 {
-		fmt.Print("error interpolating")
-	}
-
-	return unsafe.Pointer(d_out)
-}
-
-func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
-	scalars_outC := (*C.{{.CurveNameUpperCase}}_scalar_t)(scalars_out)
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(scalars)
-	twiddlesC := (*C.{{.CurveNameUpperCase}}_scalar_t)(twiddles)
-	coset_powersC := (*C.{{.CurveNameUpperCase}}_scalar_t)(coset_powers)
-	sizeC := C.uint(scalars_size)
-	twiddlesC_size := C.uint(twiddles_size)
-
-	var ret C.int
-	if isCoset {
-		ret = C.evaluate_scalars_on_coset_cuda_{{.CurveNameLowerCase}}(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
-	} else {
-		ret = C.evaluate_scalars_cuda_{{.CurveNameLowerCase}}(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
-	}
-
-	if ret != 0 {
-		fmt.Print("error interpolating")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.{{.CurveNameUpperCase}}_scalar_t)(in1_d)
-	in2_dC := (*C.{{.CurveNameUpperCase}}_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.add_scalars_cuda_{{.CurveNameLowerCase}}(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error adding scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
-	in1_dC := (*C.{{.CurveNameUpperCase}}_scalar_t)(in1_d)
-	in2_dC := (*C.{{.CurveNameUpperCase}}_scalar_t)(in2_d)
-	sizeC := C.uint(size)
-
-	ret := C.sub_scalars_cuda_{{.CurveNameLowerCase}}(in1_dC, in1_dC, in2_dC, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error subtracting scalar vectors")
-		return -1
-	}
-
-	return 0
-}
-
-func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.to_montgomery_scalars_cuda_{{.CurveNameLowerCase}}(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
-	scalarsC := (*C.{{.CurveNameUpperCase}}_scalar_t)(d_scalars)
-	lenC := C.uint(len)
-	if success := C.from_montgomery_scalars_cuda_{{.CurveNameLowerCase}}(scalarsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.{{.CurveNameUpperCase}}_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_cuda_{{.CurveNameLowerCase}}(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
-
-func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
-	pointsC := (*C.{{.CurveNameUpperCase}}_g2_affine_t)(d_points)
-	lenC := C.uint(len)
-
-	if success := C.from_montgomery_aff_points_g2_cuda_{{.CurveNameLowerCase}}(pointsC, lenC, 0); success != 0 {
-		return -1, errors.New("reversing failed")
-	}
-	return 0, nil
-}
--- a/goicicle/templates/ntt/ntt_test.go.tmpl
+++ b/goicicle/templates/ntt/ntt_test.go.tmpl
@@ -1,130 +0,0 @@
-import (
-	"fmt"
-	"github.com/stretchr/testify/assert"
-	"reflect"
-	"testing"
-)
-
-func TestNtt{{.CurveNameUpperCase}}Batch(t *testing.T) {
-	count := 1 << 20
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	NttBatch(&nttResult, false, count, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNtt{{.CurveNameUpperCase}}CompareToGnarkDIF(t *testing.T) {
-	count := 1 << 2
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestINtt{{.CurveNameUpperCase}}CompareToGnarkDIT(t *testing.T) {
-	count := 1 << 3
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, true, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	assert.Equal(t, nttResult, nttResult)
-}
-
-func TestNtt{{.CurveNameUpperCase}}(t *testing.T) {
-	count := 1 << 3
-
-	scalars := GenerateScalars(count, false)
-
-	nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-	copy(nttResult, scalars)
-
-	assert.Equal(t, nttResult, scalars)
-	Ntt(&nttResult, false, 0)
-	assert.NotEqual(t, nttResult, scalars)
-
-	inttResult := make([]G1ScalarField, len(nttResult))
-	copy(inttResult, nttResult)
-
-	assert.Equal(t, inttResult, nttResult)
-	Ntt(&inttResult, true, 0)
-	assert.Equal(t, inttResult, scalars)
-}
-
-func TestNttBatch{{.CurveNameUpperCase}}(t *testing.T) {
-	count := 1 << 5
-	batches := 4
-
-	scalars := GenerateScalars(count*batches, false)
-
-	var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
-
-	for i := 0; i < batches; i++ {
-		start := i * count
-		end := (i + 1) * count
-		batch := make([]G1ScalarField, len(scalars[start:end]))
-		copy(batch, scalars[start:end])
-		scalarVecOfVec = append(scalarVecOfVec, batch)
-	}
-
-	nttBatchResult := make([]G1ScalarField, len(scalars))
-	copy(nttBatchResult, scalars)
-
-	NttBatch(&nttBatchResult, false, count, 0)
-
-	var nttResultVecOfVec [][]G1ScalarField
-
-	for i := 0; i < batches; i++ {
-		// Clone the slice
-		clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
-		copy(clone, scalarVecOfVec[i])
-
-		// Add it to the result vector of vectors
-		nttResultVecOfVec = append(nttResultVecOfVec, clone)
-
-		// Call the ntt_{{.CurveNameLowerCase}} function
-		Ntt(&nttResultVecOfVec[i], false, 0)
-	}
-
-	assert.NotEqual(t, nttBatchResult, scalars)
-
-	// Check that the ntt of each vec of scalars is equal to the intt of the specific batch
-	for i := 0; i < batches; i++ {
-		if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
-			t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
-		}
-	}
-}
-
-func BenchmarkNTT(b *testing.B) {
-	LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
-
-	for _, logNTTSize := range LOG_NTT_SIZES {
-		nttSize := 1 << logNTTSize
-		b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
-			scalars := GenerateScalars(nttSize, false)
-
-			nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
-			copy(nttResult, scalars)
-			for n := 0; n < b.N; n++ {
-				Ntt(&nttResult, false, 0)
-			}
-		})
-	}
-}
--- a/goicicle/templates/ops/vec_mod.go.tmpl
+++ b/goicicle/templates/ops/vec_mod.go.tmpl
@@ -1,24 +0,0 @@
-// #cgo CFLAGS: -I./include/
-// #cgo CFLAGS: -I/usr/local/cuda/include
-// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}}
-// #include "ve_mod_mult.h"
-import "C"
-import (
-	"fmt"
-	"unsafe"
-)
-
-func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
-	scalarVec1C := (*C.{{.CurveNameUpperCase}}_scalar_t)(scalarVec1)
-	scalarVec2C := (*C.{{.CurveNameUpperCase}}_scalar_t)(scalarVec2)
-	sizeC := C.size_t(size)
-
-	ret := C.vec_mod_mult_device_scalar_{{.CurveNameLowerCase}}(scalarVec1C, scalarVec2C, sizeC, 0)
-
-	if ret != 0 {
-		fmt.Print("error multiplying scalar vectors")
-		return -1
-	}
-
-	return 0
-}
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -68,8 +68,9 @@ include_directories("${CMAKE_SOURCE_DIR}")


 # when adding a new curve/field, append its name to the end of this list
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761)
-set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761)
+set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
+set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
+set(SUPPORTED_CURVES_WITHOUT_NTT grumpkin) # curves for which the NTT sources are not added to the build

 set(IS_CURVE_SUPPORTED FALSE)
 set(I 0)
@@ -100,6 +101,11 @@ if (NOT BUILD_TESTS)
    list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
  endif()

+  if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT) # skip the NTT translation units for curves listed as NTT-less
+    list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
+    list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
+  endif()
+
  add_library(
    icicle
    utils/vec_ops.cu
@@ -107,8 +113,6 @@ if (NOT BUILD_TESTS)
    primitives/field.cu
    primitives/projective.cu
    appUtils/msm/msm.cu
-    appUtils/ntt/ntt.cu
-    appUtils/ntt/kernel_ntt.cu
    ${ICICLE_SOURCES}
  )
  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
--- a/icicle/appUtils/ntt/kernel_ntt.cu
+++ b/icicle/appUtils/ntt/kernel_ntt.cu
@@ -6,12 +6,12 @@

 namespace ntt {

-  static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit)
+  static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
  {
    uint32_t rev_num = 0, temp, dig_len;
    if (dit) {
      for (int i = 4; i >= 0; i--) {
-        dig_len = STAGE_SIZES_DEVICE[log_size][i];
+        dig_len = fast_tw ? STAGE_SIZES_DEVICE_FT[log_size][i] : STAGE_SIZES_DEVICE[log_size][i];
        temp = num & ((1 << dig_len) - 1);
        num = num >> dig_len;
        rev_num = rev_num << dig_len;
@@ -19,7 +19,7 @@ namespace ntt {
      }
    } else {
      for (int i = 0; i < 5; i++) {
-        dig_len = STAGE_SIZES_DEVICE[log_size][i];
+        dig_len = fast_tw ? STAGE_SIZES_DEVICE_FT[log_size][i] : STAGE_SIZES_DEVICE[log_size][i];
        temp = num & ((1 << dig_len) - 1);
        num = num >> dig_len;
        rev_num = rev_num << dig_len;
@@ -33,18 +33,18 @@ namespace ntt {

  enum eRevType { None, RevToMixedRev, MixedRevToRev, NaturalToMixedRev, NaturalToRev, MixedRevToNatural };

-  static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, eRevType rev_type)
+  static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
  {
    switch (rev_type) {
    case eRevType::RevToMixedRev:
      // R -> N -> MR
-      return dig_rev(bit_rev(num, log_size), log_size, dit);
+      return dig_rev(bit_rev(num, log_size), log_size, dit, fast_tw);
    case eRevType::MixedRevToRev:
      // MR -> N -> R
-      return bit_rev(dig_rev(num, log_size, dit), log_size);
+      return bit_rev(dig_rev(num, log_size, dit, fast_tw), log_size);
    case eRevType::NaturalToMixedRev:
    case eRevType::MixedRevToNatural:
-      return dig_rev(num, log_size, dit);
+      return dig_rev(num, log_size, dit, fast_tw);
    case eRevType::NaturalToRev:
      return bit_rev(num, log_size);
    default:
@@ -56,7 +56,7 @@ namespace ntt {
  // Note: the following reorder kernels are fused with normalization for INTT
  template <typename E, typename S, uint32_t MAX_GROUP_SIZE = 80>
  static __global__ void reorder_digits_inplace_and_normalize_kernel(
-    E* arr, uint32_t log_size, bool dit, eRevType rev_type, bool is_normalize, S inverse_N)
+    E* arr, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type, bool is_normalize, S inverse_N)
  {
    // launch N threads (per batch element)
    // each thread starts from one index and calculates the corresponding group
@@ -74,7 +74,7 @@ namespace ntt {

    uint32_t i = 1;
    for (; i < MAX_GROUP_SIZE;) {
-      next_element = generalized_rev(next_element, log_size, dit, rev_type);
+      next_element = generalized_rev(next_element, log_size, dit, fast_tw, rev_type);
      if (next_element < idx) return; // not handling this group
      if (next_element == idx) break; // calculated whole group
      group[i++] = next_element + size * batch_idx;
@@ -91,12 +91,19 @@ namespace ntt {

  template <typename E, typename S>
  __launch_bounds__(64) __global__ void reorder_digits_and_normalize_kernel(
-    E* arr, E* arr_reordered, uint32_t log_size, bool dit, eRevType rev_type, bool is_normalize, S inverse_N)
+    E* arr,
+    E* arr_reordered,
+    uint32_t log_size,
+    bool dit,
+    bool fast_tw,
+    eRevType rev_type,
+    bool is_normalize,
+    S inverse_N)
  {
    uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    uint32_t rd = tid;
    uint32_t wr =
-      ((tid >> log_size) << log_size) + generalized_rev(tid & ((1 << log_size) - 1), log_size, dit, rev_type);
+      ((tid >> log_size) << log_size) + generalized_rev(tid & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
    arr_reordered[wr] = is_normalize ? arr[rd] * inverse_N : arr[rd];
  }

@@ -116,7 +123,7 @@ namespace ntt {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= n_elements * batch_size) return;
    int64_t scalar_id = tid % n_elements;
-    if (rev_type != eRevType::None) scalar_id = generalized_rev(tid, logn, dit, rev_type);
+    if (rev_type != eRevType::None) scalar_id = generalized_rev(tid, logn, dit, false, rev_type);
    out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
  }

@@ -136,7 +143,8 @@ namespace ntt {
    bool strided,
    uint32_t stage_num,
    bool inv,
-    bool dit)
+    bool dit,
+    bool fast_tw)
  {
    NTTEngine<E, S> engine;
    stage_metadata s_meta;
@@ -150,14 +158,23 @@ namespace ntt {

    if (s_meta.ntt_block_id >= nof_ntt_blocks) return;

-    engine.loadBasicTwiddles(basic_twiddles, inv);
+    if (fast_tw)
+      engine.loadBasicTwiddles(basic_twiddles);
+    else
+      engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
    engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
    if (twiddle_stride && dit) {
-      engine.loadExternalTwiddlesGeneric64(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric64(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
-    engine.loadInternalTwiddles64(internal_twiddles, strided, inv);
+    if (fast_tw)
+      engine.loadInternalTwiddles64(internal_twiddles, strided);
+    else
+      engine.loadInternalTwiddlesGeneric64(internal_twiddles, strided, inv);

 #pragma unroll 1
    for (uint32_t phase = 0; phase < 2; phase++) {
@@ -171,8 +188,11 @@ namespace ntt {
    }

    if (twiddle_stride && !dit) {
-      engine.loadExternalTwiddlesGeneric64(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric64(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
    engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
@@ -194,7 +214,8 @@ namespace ntt {
    bool strided,
    uint32_t stage_num,
    bool inv,
-    bool dit)
+    bool dit,
+    bool fast_tw)
  {
    NTTEngine<E, S> engine;
    stage_metadata s_meta;
@@ -209,9 +230,15 @@ namespace ntt {

    if (s_meta.ntt_block_id >= nof_ntt_blocks) return;

-    engine.loadBasicTwiddles(basic_twiddles, inv);
+    if (fast_tw)
+      engine.loadBasicTwiddles(basic_twiddles);
+    else
+      engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
    engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
-    engine.loadInternalTwiddles32(internal_twiddles, strided, inv);
+    if (fast_tw)
+      engine.loadInternalTwiddles32(internal_twiddles, strided);
+    else
+      engine.loadInternalTwiddlesGeneric32(internal_twiddles, strided, inv);
    engine.ntt8win();
    engine.twiddlesInternal();
    engine.SharedData32Columns8(shmem, true, false, strided); // store
@@ -219,8 +246,11 @@ namespace ntt {
    engine.SharedData32Rows4_2(shmem, false, false, strided); // load
    engine.ntt4_2();
    if (twiddle_stride) {
-      engine.loadExternalTwiddlesGeneric32(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric32(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
    engine.storeGlobalData32(out, data_stride, log_data_stride, log_size, strided, s_meta);
@@ -242,7 +272,8 @@ namespace ntt {
    bool strided,
    uint32_t stage_num,
    bool inv,
-    bool dit)
+    bool dit,
+    bool fast_tw)
  {
    NTTEngine<E, S> engine;
    stage_metadata s_meta;
@@ -257,14 +288,23 @@ namespace ntt {

    if (s_meta.ntt_block_id >= nof_ntt_blocks) return;

-    engine.loadBasicTwiddles(basic_twiddles, inv);
+    if (fast_tw)
+      engine.loadBasicTwiddles(basic_twiddles);
+    else
+      engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
    engine.loadGlobalData32(in, data_stride, log_data_stride, log_size, strided, s_meta);
    if (twiddle_stride) {
-      engine.loadExternalTwiddlesGeneric32(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric32(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
-    engine.loadInternalTwiddles32(internal_twiddles, strided, inv);
+    if (fast_tw)
+      engine.loadInternalTwiddles32(internal_twiddles, strided);
+    else
+      engine.loadInternalTwiddlesGeneric32(internal_twiddles, strided, inv);
    engine.ntt4_2();
    engine.SharedData32Columns4_2(shmem, true, false, strided); // store
    __syncthreads();
@@ -290,7 +330,8 @@ namespace ntt {
    bool strided,
    uint32_t stage_num,
    bool inv,
-    bool dit)
+    bool dit,
+    bool fast_tw)
  {
    NTTEngine<E, S> engine;
    stage_metadata s_meta;
@@ -305,9 +346,15 @@ namespace ntt {

    if (s_meta.ntt_block_id >= nof_ntt_blocks) return;

-    engine.loadBasicTwiddles(basic_twiddles, inv);
+    if (fast_tw)
+      engine.loadBasicTwiddles(basic_twiddles);
+    else
+      engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
    engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
-    engine.loadInternalTwiddles16(internal_twiddles, strided, inv);
+    if (fast_tw)
+      engine.loadInternalTwiddles16(internal_twiddles, strided);
+    else
+      engine.loadInternalTwiddlesGeneric16(internal_twiddles, strided, inv);
    engine.ntt8win();
    engine.twiddlesInternal();
    engine.SharedData16Columns8(shmem, true, false, strided); // store
@@ -315,8 +362,11 @@ namespace ntt {
    engine.SharedData16Rows2_4(shmem, false, false, strided); // load
    engine.ntt2_4();
    if (twiddle_stride) {
-      engine.loadExternalTwiddlesGeneric16(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric16(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
    engine.storeGlobalData16(out, data_stride, log_data_stride, log_size, strided, s_meta);
@@ -338,7 +388,8 @@ namespace ntt {
    bool strided,
    uint32_t stage_num,
    bool inv,
-    bool dit)
+    bool dit,
+    bool fast_tw)
  {
    NTTEngine<E, S> engine;
    stage_metadata s_meta;
@@ -353,14 +404,23 @@ namespace ntt {

    if (s_meta.ntt_block_id >= nof_ntt_blocks) return;

-    engine.loadBasicTwiddles(basic_twiddles, inv);
+    if (fast_tw)
+      engine.loadBasicTwiddles(basic_twiddles);
+    else
+      engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
    engine.loadGlobalData16(in, data_stride, log_data_stride, log_size, strided, s_meta);
    if (twiddle_stride) {
-      engine.loadExternalTwiddlesGeneric16(
-        external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
+      if (fast_tw)
+        engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
+      else
+        engine.loadExternalTwiddlesGeneric16(
+          external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
      engine.twiddlesExternal();
    }
-    engine.loadInternalTwiddles16(internal_twiddles, strided, inv);
+    if (fast_tw)
+      engine.loadInternalTwiddles16(internal_twiddles, strided);
+    else
+      engine.loadInternalTwiddlesGeneric16(internal_twiddles, strided, inv);
    engine.ntt2_4();
    engine.SharedData16Columns2_4(shmem, true, false, strided); // store
    __syncthreads();
@@ -388,8 +448,9 @@ namespace ntt {
    }
  }

+  // Generic twiddles: 1N twiddles for forward and inverse NTT
  template <typename S>
-  __global__ void generate_basic_twiddles(S basic_root, S* w6_table, S* basic_twiddles)
+  __global__ void generate_basic_twiddles_generic(S basic_root, S* w6_table, S* basic_twiddles)
  {
    S w0 = basic_root * basic_root;
    S w1 = (basic_root + w0 * basic_root) * S::inv_log_size(1);
@@ -484,7 +545,7 @@ namespace ntt {
    if (log_size > 2)
      for (int i = 0; i < 3 - (log_size > 6 ? 0 : 6 - log_size); i++)
        temp_root = temp_root * temp_root;
-    generate_basic_twiddles<<<1, 1, 0, stream>>>(temp_root, w6_table, basic_twiddles);
+    generate_basic_twiddles_generic<<<1, 1, 0, stream>>>(temp_root, w6_table, basic_twiddles);

    const int NOF_BLOCKS = (log_size >= 8) ? (1 << (log_size - 8)) : 1;
    const int NOF_THREADS = (log_size >= 8) ? 256 : (1 << log_size);
@@ -501,6 +562,100 @@ namespace ntt {
    return CHK_LAST();
  }

+  // Fast-twiddles: 2N twiddles for forward, 2N for inverse. This kernel fills the 3-entry basic_twiddles table.
+  template <typename S> // S must provide operator*, operator+, operator- and a static inv_log_size()
+  __global__ void generate_basic_twiddles_fast_twiddles_mode(S basic_root, S* basic_twiddles) // no thread indexing: every launched thread stores the same 3 values
+  {
+    S w0 = basic_root * basic_root; // basic_root^2
+    S w1 = (basic_root + w0 * basic_root) * S::inv_log_size(1); // (basic_root + basic_root^3) * inv_log_size(1); presumably inv_log_size(1) is 2^-1 - TODO confirm
+    S w2 = (basic_root - w0 * basic_root) * S::inv_log_size(1); // (basic_root - basic_root^3) * inv_log_size(1)
+    basic_twiddles[0] = w0;
+    basic_twiddles[1] = w1;
+    basic_twiddles[2] = w2;
+  }
+
+  template <typename S> // S must provide operator*
+  __global__ void generate_twiddle_combinations_fast_twiddles_mode( // one output element per thread; 1D grid and 1D blocks
+    S* w6_table, // indexed by bits 24 and up of the exponent
+    S* w12_table, // indexed by exponent bits 18..23
+    S* w18_table, // indexed by exponent bits 12..17
+    S* w24_table, // indexed by exponent bits 6..11
+    S* w30_table, // indexed by exponent bits 0..5
+    S* external_twiddles, // output; this launch writes slots [2^log_size - 1, 2^log_size - 1 + number of threads)
+    uint32_t log_size,
+    uint32_t prev_log_size) // number of low bits of tid used as the first factor of the exponent
+  {
+    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; // flat global thread index; NOTE(review): no bounds guard, the launch must supply exactly the intended number of threads
+    uint32_t exp = ((tid & ((1 << prev_log_size) - 1)) * (tid >> prev_log_size)) << (30 - log_size); // (low prev_log_size bits of tid) * (remaining high bits), scaled by 2^(30 - log_size)
+    S w6, w12, w18, w24, w30;
+    w6 = w6_table[exp >> 24];
+    w12 = w12_table[((exp >> 18) & 0x3f)]; // 0x3f keeps a 6-bit field (0..63)
+    w18 = w18_table[((exp >> 12) & 0x3f)];
+    w24 = w24_table[((exp >> 6) & 0x3f)];
+    w30 = w30_table[(exp & 0x3f)];
+    S t = w6 * w12 * w18 * w24 * w30; // product of the five per-field table entries
+    external_twiddles[tid + (1 << log_size) - 1] = t; // levels are packed back to back: level log_size starts at offset 2^log_size - 1
+  }
+
+  template <typename S> // S must provide operator*
+  cudaError_t generate_external_twiddles_fast_twiddles_mode( // host-side driver: allocates tables, launches the generation kernels on `stream`
+    const S& basic_root,
+    S* external_twiddles, // output buffer written by the combination kernel; not allocated here
+    S*& internal_twiddles, // out: set to the 64-entry w6 table allocated below
+    S*& basic_twiddles, // out: set to a 3-entry buffer allocated below
+    uint32_t log_size,
+    cudaStream_t& stream) // all allocations, launches and frees below are issued on this stream
+  {
+    CHK_INIT_IF_RETURN();
+
+    S* w6_table;
+    S* w12_table;
+    S* w18_table;
+    S* w24_table;
+    S* w30_table;
+    CHK_IF_RETURN(cudaMallocAsync(&w6_table, sizeof(S) * 64, stream)); // each base table holds 64 elements
+    CHK_IF_RETURN(cudaMallocAsync(&w12_table, sizeof(S) * 64, stream)); // NOTE(review): if CHK_IF_RETURN returns early on failure, earlier allocations are not released - confirm macro behavior
+    CHK_IF_RETURN(cudaMallocAsync(&w18_table, sizeof(S) * 64, stream));
+    CHK_IF_RETURN(cudaMallocAsync(&w24_table, sizeof(S) * 64, stream));
+    CHK_IF_RETURN(cudaMallocAsync(&w30_table, sizeof(S) * 64, stream));
+    CHK_IF_RETURN(cudaMallocAsync(&basic_twiddles, 3 * sizeof(S), stream)); // not freed in this function; handed back through the reference parameter
+
+    S temp_root = basic_root; // repeatedly squared below before each coarser table is generated
+    generate_base_table<<<1, 1, 0, stream>>>(basic_root, w30_table, 1 << (30 - log_size)); // third argument shrinks as log_size approaches 30
+    if (log_size > 24) // otherwise temp_root is left unchanged for the next table
+      for (int i = 0; i < 6 - (30 - log_size); i++) // log_size - 24 squarings
+        temp_root = temp_root * temp_root;
+    generate_base_table<<<1, 1, 0, stream>>>(temp_root, w24_table, 1 << (log_size > 24 ? 0 : 24 - log_size));
+    if (log_size > 18)
+      for (int i = 0; i < 6 - (log_size > 24 ? 0 : 24 - log_size); i++) // 6 squarings, or log_size - 18 when log_size <= 24
+        temp_root = temp_root * temp_root;
+    generate_base_table<<<1, 1, 0, stream>>>(temp_root, w18_table, 1 << (log_size > 18 ? 0 : 18 - log_size));
+    if (log_size > 12)
+      for (int i = 0; i < 6 - (log_size > 18 ? 0 : 18 - log_size); i++) // 6 squarings, or log_size - 12 when log_size <= 18
+        temp_root = temp_root * temp_root;
+    generate_base_table<<<1, 1, 0, stream>>>(temp_root, w12_table, 1 << (log_size > 12 ? 0 : 12 - log_size));
+    if (log_size > 6)
+      for (int i = 0; i < 6 - (log_size > 12 ? 0 : 12 - log_size); i++) // 6 squarings, or log_size - 6 when log_size <= 12
+        temp_root = temp_root * temp_root;
+    generate_base_table<<<1, 1, 0, stream>>>(temp_root, w6_table, 1 << (log_size > 6 ? 0 : 6 - log_size));
+    for (int i = 0; i < 3 - (log_size > 6 ? 0 : 6 - log_size); i++) // 3 squarings, or log_size - 3 (none when log_size <= 3)
+      temp_root = temp_root * temp_root;
+    generate_basic_twiddles_fast_twiddles_mode<<<1, 1, 0, stream>>>(temp_root, basic_twiddles); // single thread fills the 3 basic twiddles
+
+    for (int i = 8; i < log_size + 1; i++) { // one launch per level i = 8..log_size; levels below 8 are not written here
+      generate_twiddle_combinations_fast_twiddles_mode<<<1 << (i - 8), 256, 0, stream>>>( // 2^(i-8) blocks * 256 threads = 2^i threads
+        w6_table, w12_table, w18_table, w24_table, w30_table, external_twiddles, i, STAGE_PREV_SIZES[i]);
+    }
+    internal_twiddles = w6_table; // w6_table is handed to the caller, so it is deliberately not freed below
+
+    CHK_IF_RETURN(cudaFreeAsync(w12_table, stream)); // frees are issued on the same stream as the kernels that read the tables
+    CHK_IF_RETURN(cudaFreeAsync(w18_table, stream));
+    CHK_IF_RETURN(cudaFreeAsync(w24_table, stream));
+    CHK_IF_RETURN(cudaFreeAsync(w30_table, stream));
+
+    return CHK_LAST();
+  }
+
  template <typename E, typename S>
  cudaError_t large_ntt(
    E* in,
@@ -514,6 +669,7 @@ namespace ntt {
    bool inv,
    bool normalize,
    bool dit,
+    bool fast_tw,
    cudaStream_t cuda_stream)
  {
    CHK_INIT_IF_RETURN();
@@ -529,11 +685,11 @@ namespace ntt {
      if (dit) {
        ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
-          false, 0, inv, dit);
+          false, 0, inv, dit, fast_tw);
      } else { // dif
        ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
-          false, 0, inv, dit);
+          false, 0, inv, dit, fast_tw);
      }
      if (normalize) normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4));
      return CHK_LAST();
@@ -545,11 +701,11 @@ namespace ntt {
      if (dit) {
        ntt32dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
-          false, 0, inv, dit);
+          false, 0, inv, dit, fast_tw);
      } else { // dif
        ntt32<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
-          false, 0, inv, dit);
+          false, 0, inv, dit, fast_tw);
      }
      if (normalize) normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5));
      return CHK_LAST();
@@ -560,7 +716,7 @@ namespace ntt {
      const int NOF_BLOCKS = (8 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
      ntt64<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
        in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
-        false, 0, inv, dit);
+        false, 0, inv, dit, fast_tw);
      if (normalize) normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6));
      return CHK_LAST();
    }
@@ -571,17 +727,17 @@ namespace ntt {
      if (dit) {
        ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-          (1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit);
+          (1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
        ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-          (1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit);
+          (1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
      } else { // dif
        ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-          (1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit);
+          (1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
        ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
          out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-          (1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit);
+          (1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
      }
      if (normalize) normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8));
      return CHK_LAST();
@@ -591,43 +747,49 @@ namespace ntt {
    uint32_t nof_blocks = (1 << (log_size - 9)) * batch_size;
    if (dit) {
      for (int i = 0; i < 5; i++) {
-        uint32_t stage_size = STAGE_SIZES_HOST[log_size][i];
+        uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
        uint32_t stride_log = 0;
        for (int j = 0; j < i; j++)
-          stride_log += STAGE_SIZES_HOST[log_size][j];
+          stride_log += fast_tw ? STAGE_SIZES_HOST_FT[log_size][j] : STAGE_SIZES_HOST[log_size][j];
        if (stage_size == 6)
          ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
        else if (stage_size == 5)
          ntt32dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
        else if (stage_size == 4)
          ntt16dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
      }
    } else { // dif
      bool first_run = false, prev_stage = false;
      for (int i = 4; i >= 0; i--) {
-        uint32_t stage_size = STAGE_SIZES_HOST[log_size][i];
+        uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
        uint32_t stride_log = 0;
        for (int j = 0; j < i; j++)
-          stride_log += STAGE_SIZES_HOST[log_size][j];
+          stride_log += fast_tw ? STAGE_SIZES_HOST_FT[log_size][j] : STAGE_SIZES_HOST[log_size][j];
        first_run = stage_size && !prev_stage;
        if (stage_size == 6)
          ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
        else if (stage_size == 5)
          ntt32<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
        else if (stage_size == 4)
          ntt16<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
            first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
-            (1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit);
+            (1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
+            fast_tw);
        prev_stage = stage_size;
      }
    }
@@ -648,6 +810,7 @@ namespace ntt {
    int max_logn,
    int batch_size,
    bool is_inverse,
+    bool fast_tw,
    Ordering ordering,
    S* arbitrary_coset,
    int coset_gen_index,
@@ -706,10 +869,10 @@ namespace ntt {
      const bool is_reverse_in_place = (d_input == d_output);
      if (is_reverse_in_place) {
        reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
-          d_output, logn, dit, reverse_input, is_normalize, S::inv_log_size(logn));
+          d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
      } else {
        reorder_digits_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
-          d_input, d_output, logn, dit, reverse_input, is_normalize, S::inv_log_size(logn));
+          d_input, d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
      }
      is_normalize = false;
      d_input = d_output;
@@ -718,11 +881,11 @@ namespace ntt {
    // inplace ntt
    CHK_IF_RETURN(large_ntt(
      d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size, is_inverse,
-      (is_normalize && reverse_output == eRevType::None), dit, cuda_stream));
+      (is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));

    if (reverse_output != eRevType::None) {
      reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
-        d_output, logn, dit, reverse_output, is_normalize, S::inv_log_size(logn));
+        d_output, logn, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
    }

    if (is_on_coset && is_inverse) {
@@ -743,6 +906,14 @@ namespace ntt {
    uint32_t log_size,
    cudaStream_t& stream);

+  template cudaError_t generate_external_twiddles_fast_twiddles_mode(
+    const curve_config::scalar_t& basic_root,
+    curve_config::scalar_t* external_twiddles,
+    curve_config::scalar_t*& internal_twiddles,
+    curve_config::scalar_t*& basic_twiddles,
+    uint32_t log_size,
+    cudaStream_t& stream);
+
  template cudaError_t mixed_radix_ntt<curve_config::scalar_t, curve_config::scalar_t>(
    curve_config::scalar_t* d_input,
    curve_config::scalar_t* d_output,
@@ -753,6 +924,7 @@ namespace ntt {
    int max_logn,
    int batch_size,
    bool is_inverse,
+    bool fast_tw,
    Ordering ordering,
    curve_config::scalar_t* arbitrary_coset,
    int coset_gen_index,
--- a/icicle/appUtils/ntt/ntt.cu
+++ b/icicle/appUtils/ntt/ntt.cu
@@ -370,14 +370,23 @@ namespace ntt {
    int max_size = 0;
    int max_log_size = 0;
    S* twiddles = nullptr;
+    bool initialized = false; // protection for multi-threaded case
    std::unordered_map<S, int> coset_index = {};

    S* internal_twiddles = nullptr; // required by mixed-radix NTT
    S* basic_twiddles = nullptr;    // required by mixed-radix NTT

+    // mixed-radix NTT supports a fast-twiddle option at the cost of additional 4N memory (where N is max NTT size)
+    S* fast_external_twiddles = nullptr;     // required by mixed-radix NTT (fast-twiddles mode)
+    S* fast_internal_twiddles = nullptr;     // required by mixed-radix NTT (fast-twiddles mode)
+    S* fast_basic_twiddles = nullptr;        // required by mixed-radix NTT (fast-twiddles mode)
+    S* fast_external_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
+    S* fast_internal_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
+    S* fast_basic_twiddles_inv = nullptr;    // required by mixed-radix NTT (fast-twiddles mode)
+
  public:
    template <typename U>
-    friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx);
+    friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);

    cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);

@@ -389,7 +398,7 @@ namespace ntt {
  static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};

  template <typename S>
-  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx)
+  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
  {
    CHK_INIT_IF_RETURN();

@@ -399,11 +408,11 @@ namespace ntt {
    // please note that this offers just basic thread-safety,
    // it's assumed a singleton (non-enforced) that is supposed
    // to be initialized once per device per program lifetime
-    if (!domain.twiddles) {
+    if (!domain.initialized) {
      // Mutex is automatically released when lock goes out of scope, even in case of exceptions
      std::lock_guard<std::mutex> lock(Domain<S>::device_domain_mutex);
      // double check locking
-      if (domain.twiddles) return CHK_LAST(); // another thread is already initializing the domain
+      if (domain.initialized) return CHK_LAST(); // another thread is already initializing the domain

      bool found_logn = false;
      S omega = primitive_root;
@@ -430,6 +439,25 @@ namespace ntt {
      CHK_IF_RETURN(generate_external_twiddles_generic(
        primitive_root, domain.twiddles, domain.internal_twiddles, domain.basic_twiddles, domain.max_log_size,
        ctx.stream));
+
+      if (fast_twiddles_mode) {
+        // generating fast-twiddles (note that this costs 4N additional memory)
+        CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles, domain.max_size * sizeof(S) * 2, ctx.stream));
+        CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles_inv, domain.max_size * sizeof(S) * 2, ctx.stream));
+
+        // fast-twiddles forward NTT
+        CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
+          primitive_root, domain.fast_external_twiddles, domain.fast_internal_twiddles, domain.fast_basic_twiddles,
+          domain.max_log_size, ctx.stream));
+
+        // fast-twiddles inverse NTT
+        S primitive_root_inv;
+        CHK_IF_RETURN(cudaMemcpyAsync(
+          &primitive_root_inv, &domain.twiddles[domain.max_size - 1], sizeof(S), cudaMemcpyDeviceToHost, ctx.stream));
+        CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
+          primitive_root_inv, domain.fast_external_twiddles_inv, domain.fast_internal_twiddles_inv,
+          domain.fast_basic_twiddles_inv, domain.max_log_size, ctx.stream));
+      }
      CHK_IF_RETURN(cudaStreamSynchronize(ctx.stream));

      const bool is_map_only_powers_of_primitive_root = true;
@@ -447,6 +475,7 @@ namespace ntt {
          domain.coset_index[domain.twiddles[i]] = i;
        }
      }
+      domain.initialized = true;
    }

    return CHK_LAST();
@@ -467,6 +496,19 @@ namespace ntt {
    basic_twiddles = nullptr;
    coset_index.clear();

+    cudaFreeAsync(fast_external_twiddles, ctx.stream);
+    fast_external_twiddles = nullptr;
+    cudaFreeAsync(fast_internal_twiddles, ctx.stream);
+    fast_internal_twiddles = nullptr;
+    cudaFreeAsync(fast_basic_twiddles, ctx.stream);
+    fast_basic_twiddles = nullptr;
+    cudaFreeAsync(fast_external_twiddles_inv, ctx.stream);
+    fast_external_twiddles_inv = nullptr;
+    cudaFreeAsync(fast_internal_twiddles_inv, ctx.stream);
+    fast_internal_twiddles_inv = nullptr;
+    cudaFreeAsync(fast_basic_twiddles_inv, ctx.stream);
+    fast_basic_twiddles_inv = nullptr;
+
    return CHK_LAST();
  }

@@ -607,9 +649,21 @@ namespace ntt {
        d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
        coset_index, stream));
    } else {
+      const bool is_on_coset = (coset_index != 0) || coset;
+      const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr) && !is_on_coset;
+      S* twiddles = is_fast_twiddles_enabled
+                      ? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
+                      : domain.twiddles;
+      S* internal_twiddles = is_fast_twiddles_enabled
+                               ? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
+                               : domain.internal_twiddles;
+      S* basic_twiddles = is_fast_twiddles_enabled
+                            ? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
+                            : domain.basic_twiddles;
+
      CHK_IF_RETURN(ntt::mixed_radix_ntt(
-        d_input, d_output, domain.twiddles, domain.internal_twiddles, domain.basic_twiddles, size, domain.max_log_size,
-        batch_size, is_inverse, config.ordering, coset, coset_index, stream));
+        d_input, d_output, twiddles, internal_twiddles, basic_twiddles, size, domain.max_log_size, batch_size,
+        is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index, stream));
    }

    if (!are_outputs_on_device)
@@ -645,10 +699,10 @@ namespace ntt {
   * value of template parameter (where the curve is given by `-DCURVE` env variable during build):
   *  - `S` is the [scalar field](@ref scalar_t) of the curve;
   */
-  extern "C" cudaError_t
-  CONCAT_EXPAND(CURVE, InitializeDomain)(curve_config::scalar_t primitive_root, device_context::DeviceContext& ctx)
+  extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitializeDomain)(
+    curve_config::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
  {
-    return InitDomain(primitive_root, ctx);
+    return InitDomain(*primitive_root, ctx, fast_twiddles_mode);
  }

  /**
--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -32,10 +32,13 @@ namespace ntt {
   * @param primitive_root Primitive root in field `S` of order \f$ 2^s \f$. This should be the smallest power-of-2
   * order that's large enough to support any NTT you might want to perform.
   * @param ctx Details related to the device such as its id and stream id.
+   * @param fast_twiddles_mode A mode where more memory is allocated for twiddle factors in exchange for faster compute.
+   * This mode needs an additional 4N memory, where N is the largest NTT size to be supported (which is derived from
+   * the primitive_root).
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename S>
-  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx);
+  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);

  /**
   * @enum NTTDir
--- a/icicle/appUtils/ntt/ntt_impl.cuh
+++ b/icicle/appUtils/ntt/ntt_impl.cuh
@@ -16,6 +16,15 @@ namespace ntt {
    uint32_t log_size,
    cudaStream_t& stream);

+  template <typename S>
+  cudaError_t generate_external_twiddles_fast_twiddles_mode(
+    const S& basic_root,
+    S* external_twiddles,
+    S*& internal_twiddles,
+    S*& basic_twiddles,
+    uint32_t log_size,
+    cudaStream_t& stream);
+
  template <typename E, typename S>
  cudaError_t mixed_radix_ntt(
    E* d_input,
@@ -27,6 +36,7 @@ namespace ntt {
    int max_logn,
    int batch_size,
    bool is_inverse,
+    bool fast_tw,
    Ordering ordering,
    S* arbitrary_coset,
    int coset_gen_index,
--- a/Show More
+++ b/Show More