Compare commits


78 Commits

Author SHA1 Message Date
Vitalii
3884ad6411 conditional PIC 2024-01-04 23:43:29 +01:00
DmytroTym
f8610dd5b6 Improved modular multiplier (#289)
Our implementation of Barrett modular multiplication was improved by utilising Karatsuba multiplication and more careful optimisation of the lsb and msb multipliers in the reduction stage
2023-12-05 13:11:44 +02:00
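As a rough illustration of the Karatsuba trick this PR mentions, here is a minimal 2×2-limb sketch in Rust: the middle term is formed with one multiplication instead of two. This is a toy model, not the library's CUDA multiplier, which works on full field widths inside Barrett reduction.

```rust
/// Karatsuba on two 32-bit limbs: 3 limb products instead of 4.
/// x = x1*2^32 + x0, y = y1*2^32 + y0.
fn karatsuba_2x2(x: [u32; 2], y: [u32; 2]) -> [u32; 4] {
    let (x0, x1) = (x[0] as u128, x[1] as u128);
    let (y0, y1) = (y[0] as u128, y[1] as u128);
    let z0 = x0 * y0;                         // low product
    let z2 = x1 * y1;                         // high product
    let z1 = (x0 + x1) * (y0 + y1) - z0 - z2; // = x0*y1 + x1*y0, one mul saved
    let full = z0 + (z1 << 32) + (z2 << 64);  // recombine; fits in u128
    [full as u32, (full >> 32) as u32, (full >> 64) as u32, (full >> 96) as u32]
}
```

The same recombination applies recursively, which is where the savings compound for the wide multiplications inside the lsb/msb stages of Barrett reduction.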
BigSky
fad317ac77 Docs: Clarify enabling tests in cmake build process (#288)
Docs: Emphasize enabling tests in cmake build process
2023-11-28 14:43:29 +02:00
Jeremy Felder
856629d6c8 Rust/large bucket factor msm (#271)
* Update rust bindings to support large_bucket_factor parameter
* Special treatment of ones in MSM removed

---------
Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
2023-11-26 14:10:30 +02:00
Jeremy Felder
2790f180d6 Update feature_request.md (#278) 2023-11-26 14:09:58 +02:00
Jeremy Felder
5813f2955a Update bug_issue.md
Update bug issue template to apply correct label
2023-11-26 12:22:38 +02:00
VitaliiH
7baea7cc5a separable compilation for Rust (#244)
2023-11-16 08:44:56 +01:00
ImmanuelSegol
29cad66ba6 include colab (#261) 2023-11-13 08:16:41 +02:00
DmytroTym
e4e9130340 Two curve NTT correctness issue hotfix (#254) 2023-11-05 08:24:12 +02:00
omahs
5c868abcf9 Fix typos (#257) 2023-11-02 16:57:55 +02:00
vuittont60
a469fb577b fix typos in logs (#255) 2023-11-02 16:57:34 +02:00
liuxiao
fd62fe5ae8 Support bw6-761 (#188)
Resolves #191 and #113

---------

Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
2023-10-21 18:49:06 +03:00
Jeremy Felder
09d8c5da6a Fix readme badge links, fix CI cpp formatter (#249) 2023-10-17 17:06:11 +03:00
Jeremy Felder
88c9c8584f [CI]: Add concurrency groups at workflow level (#238)
Add concurrency groups at workflow level for CI. Remove dev CI since we no longer use the dev branch. Resolves #180
2023-10-17 16:02:31 +03:00
Jeremy Felder
1cf7b2e4ba Exclude target directory from format checks (#247) 2023-10-16 15:56:58 +03:00
DmytroTym
028bed11fa Hotfix to go regression when 2 curves are imported (#245)
Hotfix for a slowdown in Go when more than one curve is imported.
2023-10-12 15:53:20 +03:00
ImmanuelSegol
9114ecb269 fix memory error in single_stage_multi_reduction_kernel (#235)
* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu
2023-10-03 15:22:28 +03:00
Jeremy Felder
97f0079e5c Fix: div by 0 when number of Elements is 1 (#230) 2023-09-28 16:11:17 +03:00
ImmanuelSegol
9f6707581e Make dependency instructions more clear (#227) 2023-09-27 12:25:26 +03:00
Jeremy Felder
413e1d8b60 [CI]: Adds C++/CUDA CI and conditional workflow runs (#223)
* Add cmake tests for cpp primitives

* Add cpp/cuda formatting

* Add conditional steps based on files changed for faster required checks

* Update runs-on for check files changed
2023-09-26 14:41:06 +03:00
ImmanuelSegol
8612975e36 update docs (#222)
* refactor

* lint

* Typo and spacing

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-09-26 11:46:11 +03:00
Leon Hibnik
0a639783e2 add mont_r and mont_r_inv and fix args (#187)
Resolves #186
2023-09-26 09:45:10 +03:00
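For context, one way such constants can be derived (a sketch assuming the `num-bigint`/`num-traits` crates, which are not dependencies of this repo): R is the Montgomery radix 2^(32·limbs) reduced mod p, and its inverse follows from Fermat's little theorem since p is prime.

```rust
use num_bigint::BigUint;
use num_traits::One;

/// Montgomery constants for a prime field using 32-bit limbs:
/// R = 2^(32*limbs) mod p and R^{-1} = R^(p-2) mod p (p prime).
fn montgomery_constants(p: &BigUint, limbs: usize) -> (BigUint, BigUint) {
    let r = (BigUint::one() << (32 * limbs)) % p; // Montgomery radix mod p
    let exp = p - BigUint::from(2u32);            // Fermat exponent p - 2
    let r_inv = r.modpow(&exp, p);                // R^{-1} mod p
    (r, r_inv)
}
```

For an 8-limb scalar field such as BLS12-381's this yields a `mont_r`/`mont_r_inv` pair like the one the PR adds; the exact limb encoding in the codebase is not shown here.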
ImmanuelSegol
e368f00bb8 Fix CI - dont use deprecated package name (#216)
Update icicle crate name in examples and benches
2023-09-21 16:03:44 +07:00
ImmanuelSegol
08862134e8 fix cuda test - remove boost (#197)
* refactor: remove boost
2023-09-20 10:00:34 +03:00
Jeremy Felder
04e5ff5d1a Update root of unity and regenerate omegas (#181)
Updates the root of unity and regenerates the omegas for BLS12-377.
Adds an option for the new_curve script to only update the params.cuh file instead of regenerating everything.
2023-09-07 08:23:43 +03:00
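To unpack "regenerates the omegas": for an NTT-friendly field, the generator of the size-2^k subgroup is the maximal root of unity raised to 2^(max_log − k), so the whole table can be rebuilt by repeated squaring. A minimal sketch over an illustrative 64-bit NTT-friendly prime (not the BLS12-377 field or the repo's actual table layout):

```rust
const P: u64 = 0xFFFF_FFFF_0000_0001; // illustrative NTT-friendly prime (2-adicity 32)

fn mul_mod(a: u64, b: u64) -> u64 {
    ((a as u128 * b as u128) % P as u128) as u64
}

/// omegas[k] generates the order-2^k subgroup: omegas[k] = root^(2^(max_log-k)).
/// Squaring halves the order, so we walk down from the maximal root of unity.
fn regenerate_omegas(root_of_unity: u64, max_log: usize) -> Vec<u64> {
    let mut omegas = vec![0u64; max_log + 1];
    omegas[max_log] = root_of_unity;
    for k in (0..max_log).rev() {
        omegas[k] = mul_mod(omegas[k + 1], omegas[k + 1]);
    }
    omegas
}
```

An incorrect root of unity corrupts every entry derived from it, which is why the fix regenerates the full table rather than patching individual omegas.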
Jeremy Felder
81e40a1001 Merge pull request #172 from ingonyama-zk/dev 2023-09-05 10:52:10 +03:00
Leon Hibnik
a8c539740a Merge pull request #176 from ingonyama-zk/omershlo-patch-1
Update README.md
2023-09-03 11:09:22 +03:00
Leon Hibnik
1cf711215b Merge pull request #173 from ingonyama-zk/Otsar-Raikou-patch-1
Update README.md
2023-09-03 11:03:08 +03:00
Shlomtz
3d370a2be3 Update README.md [skip ci] 2023-09-01 15:01:39 +03:00
Shlomtz
49212c540c Update README.md
update hall of fame
2023-09-01 13:47:21 +03:00
Otsar
be68cddd1a Update README.md 2023-08-31 14:09:44 +03:00
Jeremy Felder
5667f32bfe Merge branch 'main' into dev 2023-08-31 09:19:44 +03:00
ImmanuelSegol
9ea6bc0bad Update Cargo.toml (#161) 2023-08-31 09:05:21 +03:00
Jeremy Felder
fb52650bbc CI: Additional checks (#155)
Adds CI checks for building and testing Golang bindings
Adds CI checks for formatting Rust and Golang files
Fixes Golang tests for BN254
Splits Actions checks for PRs against main into multiple files

Resolves #108
Resolves #107
Resolves #138
2023-08-31 09:04:53 +03:00
Jeremy Felder
ca8961501e Adding changes from main to dev for clean merge back into main (#170) 2023-08-29 15:53:24 +03:00
DmytroTym
78e20f9add Minimal correct MSM (#162) 2023-08-28 10:14:54 +03:00
Jeremy Felder
dc6893732b Merge pull request #166 from weijiekoh/fix/readme-links
Fix broken links in the readme, resolves #164
2023-08-28 09:22:57 +03:00
Koh Wei Jie
175109a070 Fix link to CRV_CONFIG
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-08-27 15:48:41 -07:00
Jeremy Felder
b216287de1 Merge pull request #163 from weijiekoh/main
Support cmake versions below 3.24.0
2023-08-27 14:25:28 +03:00
ImmanuelSegol
7a2fa20da7 Remove decimation from API (#165)
Resolves #154
2023-08-27 14:08:56 +03:00
Koh Wei Jie
8e7799b632 fixed broken CRV_TEMPLATE and CRV_CONFIG links in readme 2023-08-24 12:55:15 -07:00
Koh Wei Jie
6dd7722f5d changed minimum cmake version in readme 2023-08-24 10:03:17 -07:00
ImmanuelSegol
27627ed2c1 refactor (#158) 2023-08-24 12:02:43 +03:00
Koh Wei Jie
3284cd8dce updated readme with prerequisites section; added conditional in icicle/CMakeLists.txt to support cmake versions below 3.24 2023-08-23 14:23:46 -07:00
Jeremy Felder
8c7b2bb24a Add citation to repo (#156) 2023-08-21 09:25:24 +03:00
Jeremy Felder
b6c87c3fd8 Fix formatting for all files (#153) 2023-08-20 11:35:28 +03:00
Leon Hibnik
e04bd928e6 Merge pull request #145 from ingonyama-zk/fix/goicicle-setup-script
setup.sh update
2023-08-17 12:08:24 +03:00
Leon Hibnik
cb6ed6af59 Merge branch 'dev' into fix/goicicle-setup-script 2023-08-17 12:07:07 +03:00
Jeremy Felder
9ea3350589 Add language formatters (#132) 2023-08-17 09:41:58 +03:00
Leon Hibnik
f38a9a322c Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 16:43:32 +03:00
ImmanuelSegol
ad1e482252 missing functions (#152) 2023-08-16 16:38:20 +03:00
Jeremy Felder
273bd536db Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 16:14:36 +03:00
Jeremy Felder
1463edc413 CI: Run linux on self-hosted, Make windows download smaller and remove caching (#150) (#151) 2023-08-16 16:14:22 +03:00
Jeremy Felder
db93204dc7 CI: Run linux on self-hosted, Make windows download smaller and remove caching (#150) 2023-08-16 15:16:32 +03:00
ImmanuelSegol
e1b692b8ed Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 12:59:44 +03:00
Leon Hibnik
e6416f4110 Merge pull request #146 from ingonyama-zk/fix/zeroedgecase
bucket_method_msm - address 0 edge case
2023-08-16 12:00:46 +03:00
ImmanuelSegol
96facd58d5 refactor: dont throw error when all scalars are 0 2023-08-15 19:53:45 +03:00
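For context on why all-zero scalars should be a no-op rather than an error: in the bucket (Pippenger) method each scalar digit selects a bucket, and the zero digit selects none, so an all-zero input leaves every bucket empty and the result is the group identity. A toy sketch using the additive group of u64 in place of curve points (not the CUDA `bucket_method_msm`):

```rust
/// Toy bucket-method MSM over (u64, +) as a stand-in group.
/// Returns sum(scalars[i] * points[i]) mod 2^64 with window width `c`.
fn bucket_msm(scalars: &[u64], points: &[u64], c: usize) -> u64 {
    let windows = (64 + c - 1) / c;
    let mut acc = 0u64;
    for w in (0..windows).rev() {
        for _ in 0..c {
            acc = acc.wrapping_add(acc); // "double" the accumulator c times
        }
        // Accumulate this window's digits into 2^c - 1 buckets;
        // digit 0 is skipped, so all-zero scalars add nothing.
        let mut buckets = vec![0u64; (1 << c) - 1];
        for (s, p) in scalars.iter().zip(points) {
            let digit = ((s >> (w * c)) & ((1u64 << c) - 1)) as usize;
            if digit != 0 {
                buckets[digit - 1] = buckets[digit - 1].wrapping_add(*p);
            }
        }
        // Running-sum trick: sum_d d * bucket_d in ~2 * 2^c additions.
        let (mut sum, mut running) = (0u64, 0u64);
        for b in buckets.iter().rev() {
            running = running.wrapping_add(*b);
            sum = sum.wrapping_add(running);
        }
        acc = acc.wrapping_add(sum);
    }
    acc
}
```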
Leon Hibnik
11fe11b071 Merge branch 'dev' into fix/goicicle-setup-script 2023-08-15 14:16:54 +03:00
DmytroTym
19d0730aad Correct MSM for weird scalar distributions (#143) 2023-08-15 13:14:46 +03:00
LeonHibnik
36133ba26c setup.sh update 2023-08-15 12:14:06 +03:00
ImmanuelSegol
a1d9fa6648 fix - add missing go wrappers for all curves + add missing constants for curves (#130) 2023-08-14 13:15:56 +03:00
Vitalii Hnatyk
2f21ec4aa7 large_msm compilation hotfix (#131)
hotfix for missing parameter in large_msm
2023-07-27 10:05:45 +02:00
Jeremy Felder
5b504c44b7 Fix badges 2023-07-20 08:57:50 +03:00
Jeremy Felder
d13143506e writing .so file requires sudo 2023-07-19 21:44:12 +03:00
ImmanuelSegol
94c73e637c Fix/cudacodegoicile (#128)
* refactor

* refactor

* Refactor

* Refactor

* refactor: add sh script

* refactor

* refactor

* refactor: fix path
2023-07-19 21:30:20 +03:00
Jeremy Felder
b71b041561 Integrate msm performance improvements (#129) 2023-07-19 16:32:59 +03:00
ImmanuelSegol
a9d6ac0e27 move header file import (#127) 2023-07-19 08:33:05 +03:00
ImmanuelSegol
8a11a2f60e some minor changes (#125)
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-07-18 20:17:16 +03:00
ImmanuelSegol
ab69139ade Goicicle (#77) 2023-07-16 14:31:41 +03:00
Vitalii Hnatyk
7a8191bcb4 NTT improvements (shared mem + inplace) (#116)
Resolves #112
2023-07-16 13:56:20 +03:00
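To sketch the "inplace" half of this improvement in CPU terms (the shared-memory half is a GPU-kernel detail with no simple CPU analogue), here is an in-place iterative Cooley-Tukey NTT over an illustrative 64-bit prime; no auxiliary buffer is allocated, which is what saves memory. This is not the repo's CUDA kernel:

```rust
const P: u64 = 0xFFFF_FFFF_0000_0001; // illustrative NTT-friendly prime

fn add_mod(a: u64, b: u64) -> u64 { ((a as u128 + b as u128) % P as u128) as u64 }
fn sub_mod(a: u64, b: u64) -> u64 { ((a as u128 + P as u128 - b as u128) % P as u128) as u64 }
fn mul_mod(a: u64, b: u64) -> u64 { ((a as u128 * b as u128) % P as u128) as u64 }

fn pow_mod(mut base: u64, mut exp: u64) -> u64 {
    let mut acc = 1u64;
    while exp > 0 {
        if exp & 1 == 1 { acc = mul_mod(acc, base); }
        base = mul_mod(base, base);
        exp >>= 1;
    }
    acc
}

/// In-place iterative Cooley-Tukey NTT; `omega` must have order n = a.len(),
/// n a power of two. Butterflies overwrite the input, so no scratch buffer.
fn ntt_inplace(a: &mut [u64], omega: u64) {
    let n = a.len();
    // Bit-reversal permutation lets every butterfly write back in place.
    let mut j = 0;
    for i in 1..n {
        let mut bit = n >> 1;
        while j & bit != 0 { j ^= bit; bit >>= 1; }
        j |= bit;
        if i < j { a.swap(i, j); }
    }
    let mut len = 2;
    while len <= n {
        let w_len = pow_mod(omega, (n / len) as u64); // stage root of order `len`
        for start in (0..n).step_by(len) {
            let mut w = 1u64;
            for i in start..start + len / 2 {
                let (u, v) = (a[i], mul_mod(a[i + len / 2], w));
                a[i] = add_mod(u, v);
                a[i + len / 2] = sub_mod(u, v);
                w = mul_mod(w, w_len);
            }
        }
        len <<= 1;
    }
}
```

On a GPU the same stage structure is what makes shared memory pay off: each thread block can keep its slice of `a` and the stage twiddles in fast on-chip memory across several stages.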
Jeremy Felder
e3f089f0f3 Fix: Docs, Curve generation script (#102)
Fix CUDA compilation docs and update new curve script to generate correct cu(h) files. Resolves #101
2023-06-28 15:55:33 +03:00
Jeremy Felder
cb61755c8b Add streams to poseidon (#105)
Adds streams to the poseidon implementation for BLS12-381. Resolves #91
2023-06-20 12:22:16 +03:00
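A minimal sketch of what stream support enables, using rustacuda (the wrapper crate these bindings already depend on): independent hash batches can be queued on separate streams and synchronized individually instead of serializing on the default stream. `launch_poseidon_on` is a hypothetical stand-in, not the actual poseidon entry point:

```rust
use rustacuda::prelude::*;

fn run_two_batches() -> Result<(), Box<dyn std::error::Error>> {
    rustacuda::init(CudaFlags::empty())?;
    let device = Device::get_device(0)?;
    let _ctx = Context::create_and_push(ContextFlags::SCHED_AUTO, device)?;

    // Two independent streams; work queued on one does not block the other.
    let stream_a = Stream::new(StreamFlags::NON_BLOCKING, None)?;
    let stream_b = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // launch_poseidon_on(&stream_a, /* batch A */)?; // hypothetical kernel call
    // launch_poseidon_on(&stream_b, /* batch B */)?; // hypothetical kernel call

    stream_a.synchronize()?; // wait on each stream independently
    stream_b.synchronize()?;
    Ok(())
}
```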
Jeremy Felder
34a556ac85 Update build workflow (#104) 2023-06-19 14:21:46 +03:00
Jeremy Felder
2a3f5a258a Merge pull request #106 from gkigiermo/fix/cuda-test-suite
Resolves #103
2023-06-15 11:48:36 +03:00
Guillermo Oyarzun
9023daeb4f Add c++17 requirement to cmake 2023-06-15 10:06:02 +02:00
Guillermo Oyarzun
4d83ba101c Fix curve config location and link to some namespace struct members 2023-06-13 23:52:48 +02:00
ChickenLover
26f2f5c76c reduce memory consumption in hash_blocks (#100)
* reduce memory consumption in hash_blocks
2023-06-08 20:51:41 +07:00
Jeremy Felder
434ab70305 Fixed omega retrieval issue (#99)
Resolves #71
2023-06-08 11:47:41 +03:00
288 changed files with 43446 additions and 22106 deletions

.clang-format (new file, 38 lines)

@@ -0,0 +1,38 @@
Language: Cpp
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: false
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: true
AfterFunction: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DisableFormat: false
IndentFunctionDeclarationAfterType: false
IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
PointerAlignment: Left
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
Standard: c++17
UseTab: Never


@@ -2,7 +2,7 @@
name: ":bug: Bug Report"
about: Create a bug report to help us improve the repo
title: "[BUG]: "
labels: bug
labels: type:bug
---
## Description


@@ -2,7 +2,7 @@
name: ":sparkles: Feature Request"
about: Request the inclusion of a new feature or functionality
title: "[FEAT]: "
labels: enhancement
labels: type:feature
---
## Description

.github/changed-files.yml (new vendored file, 14 lines)

@@ -0,0 +1,14 @@
golang:
- goicicle/**/*.go
- go.mod
rust:
- src/**/*.rs
- build.rs
- Cargo.toml
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh
- icicle/**/*.cpp
- icicle/**/*.hpp
- icicle/**/*.c
- icicle/**/*.h


@@ -1,49 +0,0 @@
name: Build
on:
pull_request:
branches:
- "main"
- "dev"
paths:
- "icicle/**"
- "src/**"
- "Cargo.toml"
- "build.rs"
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: sm_70
DEFAULT_STREAM: per-thread
jobs:
build-linux:
runs-on: ubuntu-latest
steps:
# Checkout code
- uses: actions/checkout@v3
# Download (or from cache) and install CUDA Toolkit 12.1.0
- uses: Jimver/cuda-toolkit@v0.2.9
id: cuda-toolkit
with:
cuda: '12.1.0'
use-github-cache: true
# Build from cargo - Rust utils are preinstalled on latest images
# https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
- name: Build
run: cargo build --release --verbose
build-windows:
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
- uses: Jimver/cuda-toolkit@v0.2.9
id: cuda-toolkit
with:
cuda: '12.1.0'
use-github-cache: true
- name: Build
run: cargo build --release --verbose

.github/workflows/main-build.yml (new vendored file, 104 lines)

@@ -0,0 +1,104 @@
name: Build
on:
pull_request:
branches:
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
build-rust-linux:
name: Build Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build Rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: cargo build --release --verbose
build-rust-windows:
name: Build Rust on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Download and Install Cuda
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build Rust Targets
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}
run: cargo build --release --verbose
build-golang-linux:
name: Build Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: make all
working-directory: ./goicicle
# TODO: Add once Golang make file supports building for Windows
# build-golang-windows:
# name: Build Golang on Windows
# runs-on: windows-2022
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust"]'
# - name: Build cpp libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make all
# working-directory: ./goicicle

.github/workflows/main-format.yml (new vendored file, 41 lines)

@@ -0,0 +1,41 @@
name: Format
on:
pull_request:
branches:
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
formatting-rust:
name: Check Rust Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check rustfmt
run: if [[ $(cargo fmt --check) ]]; then echo "Please run cargo fmt"; exit 1; fi
# - name: Check clippy
# run: cargo clippy --no-deps --all-features --all-targets
formatting-golang:
name: Check Golang Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check gofmt
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
formatting-cpp-cuda:
name: Check C++/CUDA Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check clang-format
run: unformatted_files=$(find ./ -path ./icicle/build -prune -o -path ./target -prune -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file >&2); if [[ $unformatted_files ]]; then echo $unformatted_files; echo "Please run clang-format"; exit 1; fi

.github/workflows/main-test.yml (new vendored file, 89 lines)

@@ -0,0 +1,89 @@
name: Test
on:
pull_request:
branches:
- main
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
test-rust-linux:
name: Test Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Run Rust Tests
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: cargo test --release --verbose -- --test-threads=1
test-cpp-linux:
name: Test C++ on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build C++
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build
cmake -DBUILD_TESTS=ON -S . -B build
cmake --build build
- name: Run C++ Tests
working-directory: ./icicle/build
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest
test-golang-linux:
name: Test Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
working-directory: ./goicicle
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: make libbn254.so
- name: Run Golang Tests
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
go test ./goicicle/curves/bn254 -count=1
# TODO: Fix tests for bls12377
# TODO: Fix tests for bls12381
# run: go test ./goicicle/curves/bn254 ./goicicle/curves/bls12377 ./goicicle/curves/bls12381 -count=1

.gitignore (vendored, 3 lines changed)

@@ -5,6 +5,9 @@
*.cubin
*.bin
*.fatbin
*.so
*.nsys-rep
*.ncu-rep
**/target
**/.vscode
**/.*lock*csv#

.rustfmt.toml (new file, 10 lines)

@@ -0,0 +1,10 @@
# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md
# Stable Configs
chain_width = 0
max_width = 120
merge_derives = true
use_field_init_shorthand = true
use_try_shorthand = true
# Unstable Configs

CITATION.cff (new file, 8 lines)

@@ -0,0 +1,8 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Ingonyama"
title: "Icicle: GPU Library for ZK Acceleration"
version: 0.1.0
date-released: 2023-03-08
url: "https://github.com/ingonyama-zk/icicle"


@@ -1,9 +1,50 @@
[workspace]
[package]
name = "icicle"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "An implementation of the Ingonyama CUDA Library"
homepage = "https://www.ingonyama.com"
repository = "https://github.com/ingonyama-zk/icicle"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[[bench]]
name = "ntt"
path = "benches/ntt.rs"
harness = false
members = ["icicle-core", "bls12-381", "bls12-377", "bn254"]
[[bench]]
name = "msm"
path = "benches/msm.rs"
harness = false
[dependencies]
hex = "*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = "0.3.0"
ark-bls12-377 = "0.3.0"
ark-bn254 = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"
rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
rand = "0.8.5" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
cmake = "0.1.50"
[dev-dependencies]
"criterion" = "0.4.0"
[features]
default = ["bls12_381"]
bls12_381 = ["ark-bls12-381/curve"]
g2 = []

Dockerfile (new file, 28 lines)

@@ -0,0 +1,28 @@
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install Golang
ENV GOLANG_VERSION 1.21.1
RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
ENV PATH="/usr/local/go/bin:${PATH}"
# Set the working directory in the container
WORKDIR /app
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]

README.md (152 lines changed)

@@ -1,9 +1,20 @@
# ICICLE
<div align="center">Icicle is a library for ZK acceleration using CUDA-enabled GPUs.</div>
**<div align="center">Icicle is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
![image (4)](https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png)
<p align="center">
<img src="https://github.com/ingonyama-zk/icicle/actions/workflows/main-build.yml/badge.svg" alt="Build status">
<a href="https://discord.gg/EVVXTdt6DF">
<img src="https://img.shields.io/discord/1063033227788423299?logo=discord" alt="Chat with us on Discord">
</a>
<a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
<img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
</a>
</p>
## Background
Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy-preserving services and infrastructure.
@@ -25,10 +36,22 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE
- [BLS12-381]
- [BLS12-377]
- [BN254]
- [BW6-761]
## Build and usage
> NOTE: [NVCC] is a prerequisite for building.
### Prerequisites
- [NVCC] (version 12.0 or newer)
- cmake 3.18 and above
- follow [these instructions](https://github.com/ingonyama-zk/icicle/tree/main/icicle#prerequisites-on-ubuntu)
- Any Nvidia GPU
If you don't have access to an Nvidia GPU, check out [Google Colab](#google-colab). If you require more compute power and are looking to build or do research with ICICLE, refer to our [grant program][GRANT_PROGRAM].
### Steps
1. Define or select a curve for your application; we've provided a [template][CRV_TEMPLATE] for defining a curve
2. Include the curve in [`curve_config.cuh`][CRV_CONFIG]
@@ -36,13 +59,21 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE
```sh
mkdir -p build
nvcc -o build/<ENTER_DIR_NAME> ./icicle/appUtils/ntt/ntt.cu ./icicle/appUtils/msm/msm.cu ./icicle/appUtils/vector_manipulation/ve_mod_mult.cu ./icicle/primitives/projective.cu -lib -arch=native
nvcc -o build/<binary_name> ./icicle/curves/index.cu -lib -arch=native
```
### Testing the CUDA code
We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:
For testing, ensure the `BUILD_TESTS` option is enabled in cmake. If not, toggle it on by adding `-DBUILD_TESTS=ON` in the cmake configuration command:
```sh
cmake -S . -B build -DBUILD_TESTS=ON
```
Proceed with the following commands:
```sh
mkdir -p build
cmake -S . -B build
@@ -50,6 +81,9 @@ cmake --build build
cd build && ctest
```
NOTE: If you are using cmake versions < 3.24, add `-DCUDA_ARCH=<target_compute_arch>` to the command `cmake -S . -B build`
### Rust Bindings
For convenience, we also provide rust bindings to the ICICLE library for the following primitives:
@@ -64,7 +98,7 @@ For convenience, we also provide rust bindings to the ICICLE library for the fol
- Scalar Vector Multiplication
- Point Vector Multiplication
A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environement variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type.
A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environment variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type.
> NOTE: A GPU must be detectable and therefore installed if the `ARCH_TYPE` is not set.
@@ -95,56 +129,113 @@ Supporting additional curves can be done as follows:
Create a JSON file with the curve parameters. The curve is defined by the following parameters:
- ``curve_name`` - e.g. ``bls12_381``.
- ``modolus_p`` - scalar field modolus (in decimal).
- ``bit_count_p`` - number of bits needed to represent `` modolus_p`` .
- ``limb_p`` - number of bytes needed to represent `` modolus_p`` (rounded).
- ``ntt_size`` - log of the maximal size subgroup of the scalar field.
- ``modolus_q`` - base field modulus (in decimal).
- ``bit_count_q`` - number of bits needed to represent `` modolus_q`` .
- ``limb_q`` number of bytes needed to represent `` modolus_p`` (rounded).
- ``weierstrass_b`` - Weierstrauss constant of the curve.
- ``gen_x`` - x-value of a generator element for the curve.
- ``gen_y`` - y-value of a generator element for the curve.
- ``modulus_p`` - scalar field modulus (in decimal).
- ``bit_count_p`` - number of bits needed to represent ``modulus_p``.
- ``limb_p`` - number of (32-bit) limbs needed to represent ``modulus_p`` (rounded up; see the sketch after this list).
- ``ntt_size`` - log of the maximal size subgroup of the scalar field.
- ``modulus_q`` - base field modulus (in decimal).
- ``bit_count_q`` - number of bits needed to represent ``modulus_q``.
- ``limb_q`` - number of (32-bit) limbs needed to represent ``modulus_q`` (rounded up).
- ``weierstrass_b`` - `b` of the curve in Weierstrass form.
- ``weierstrass_b_g2_re`` - real part of the `b` value of the g2 curve in Weierstrass form.
- ``weierstrass_b_g2_im`` - imaginary part of the `b` value of the g2 curve in Weierstrass form.
- ``gen_x`` - `x` coordinate of a generator element for the curve.
- ``gen_y`` - `y` coordinate of a generator element for the curve.
- ``gen_x_re`` - real part of the `x` coordinate of a generator element for the g2 curve.
- ``gen_x_im`` - imaginary part of the `x` coordinate of a generator element for the g2 curve.
- ``gen_y_re`` - real part of the `y` coordinate of a generator element for the g2 curve.
- ``gen_y_im`` - imaginary part of the `y` coordinate of a generator element for the g2 curve.
- ``nonresidue`` - nonresidue, or `i^2`, or `u^2` - the square of the element that generates the quadratic extension field of the base field.
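The limb counts follow a simple rounding rule; a quick Rust sketch (illustrative only) for checking the values in a parameters file by hand:

```rust
fn limbs_for(bit_count: u32) -> u32 {
    (bit_count + 31) / 32 // 32-bit limbs, rounded up
}

fn main() {
    assert_eq!(limbs_for(255), 8);  // limb_p for BLS12-381
    assert_eq!(limbs_for(381), 12); // limb_q for BLS12-381
}
```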
Here's an example for BLS12-381.
```
{
"curve_name" : "bls12_381",
"modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"modulus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"bit_count_p" : 255,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"modulus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"bit_count_q" : 381,
"limb_q" : 12,
"weierstrass_b" : 4,
"weierstrass_b_g2_re" : 4,
"weierstrass_b_g2_im" : 4,
"gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
"gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
"gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569,
"gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
"gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758,
"gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905,
"gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
"nonresidue" : -1
}
```
Save the parameters JSON file in ``curve_parameters``.
Save the parameters JSON file under the ``curve_parameters`` directory.
Then run the Python script ``new_curve_script.py `` from the main icicle folder:
Then run the Python script ``new_curve_script.py `` from the root folder:
```
python3 ./curve_parameters/new_curve_script_rust.py ./curve_parameters/bls12_381.json
python3 ./curve_parameters/new_curve_script.py ./curve_parameters/bls12_381.json
```
The script does the following:
- Creates a folder in ``icicle/curves`` with the curve name, which contains all of the files needed for the supported operations in CUDA.
- Adds the curve exported operations to ``icicle/curves/index.cu``.
- Adds the curve's exported operations to ``icicle/curves/index.cu``.
- Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve.
- Creates a test file with the curve name in ``src``.
Files from ``./icicle/curves/<curve_name>/supported_operations.cu`` should also be added individually to the ``add_library`` section of [``./icicle/CMakeLists.txt``][CMAKELISTS].
The new curve can then be tested by running the tests in ``tests_<curve_name>`` (e.g. ``tests_bls12_381``).
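To make the JSON schema concrete, here is a sketch of reading a parameters file from Rust; `serde_json` is an assumed extra dependency (the repo's own tooling is the Python script above), and the huge moduli are left out of the struct since they exceed native integer widths:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct CurveParams {
    curve_name: String,
    bit_count_p: u32,
    limb_p: u32,
    ntt_size: u32,
    bit_count_q: u32,
    limb_q: u32,
    weierstrass_b: i64,
    // modulus_p / modulus_q omitted: they need a big-integer type;
    // unknown JSON fields are ignored by serde's default behaviour.
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string("curve_parameters/bls12_381.json")?;
    let params: CurveParams = serde_json::from_str(&raw)?;
    println!("{}: limb_p={}, limb_q={}", params.curve_name, params.limb_p, params.limb_q);
    Ok(())
}
```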
## Docker
We provide a simple Docker container so you can run ICICLE without setting everything up locally.
```
docker build -t <name_of_your_choice> .
docker run --gpus all -it <name_of_your_choice> /bin/bash
```
## Google Colab
[Colab](https://colab.google/) is a hosted Jupyter Notebook service that requires no setup to use and provides free access to computing resources, including GPUs!
You can easily run ICICLE in Google Colab on a free GPU instance; this is a great option for those who want to get started with ICICLE instantly, without any local setup or GPU.
Follow this [guide][GOOGLE_COLAB_ICICLE] for more details.
## Contributions
Join our [Discord Server](https://discord.gg/Y4SkbDf2Ff) and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
### Development Contributions
If you are changing code, please make sure to change your [git hooks path][HOOKS_DOCS] to the repo's [hooks directory][HOOKS_PATH] by running the following command:
```sh
git config core.hooksPath ./scripts/hooks
```
In case `clang-format` is missing on your system, you can install it using the following command:
```sh
sudo apt install clang-format
```
This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
### Hall of Fame
- [Robik](https://github.com/robik75), for his on-going support and mentorship
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
## Help & Support
For help and support, talk to our devs in our Discord channel ["ICICLE"](https://discord.gg/EVVXTdt6DF)
## License
@@ -153,13 +244,22 @@ ICICLE is distributed under the terms of the MIT License.
See [LICENSE-MIT][LMIT] for details.
<!-- Begin Links -->
[BLS12-381]: ./icicle/curves/bls12_381.cuh
[BLS12-381]: ./icicle/curves/bls12_381/supported_operations.cu
[BLS12-377]: ./icicle/curves/bls12_377/supported_operations.cu
[BN254]: ./icicle/curves/bn254/supported_operations.cu
[BW6-761]: ./icicle/curves/bw6_671/supported_operations.cu
[NVCC]: https://docs.nvidia.com/cuda/#installation-guides
[CRV_TEMPLATE]: ./icicle/curves/curve_template.cuh
[CRV_CONFIG]: ./icicle/curves/curve_config.cuh
[CRV_TEMPLATE]: ./icicle/curves/curve_template/
[CRV_CONFIG]: ./icicle/curves/index.cu
[B_SCRIPT]: ./build.rs
[FDI]: https://github.com/ingonyama-zk/fast-danksharding
[LMIT]: ./LICENSE
[DISCORD]: https://discord.gg/Y4SkbDf2Ff
[googletest]: https://github.com/google/googletest/
[HOOKS_DOCS]: https://git-scm.com/docs/githooks
[HOOKS_PATH]: ./scripts/hooks/
[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
[GOOGLE_COLAB_ICICLE]: https://github.com/gkigiermo/rust-cuda-colab
[GRANT_PROGRAM]: https://docs.google.com/forms/d/e/1FAIpQLSc967TnNwxZZ4akejcSi4KOUmGrEc68ZZV-FHLfo8KnP1wbpg/viewform
<!-- End Links -->

benches/msm.rs (new file, 54 lines)

@@ -0,0 +1,54 @@
extern crate criterion;
use criterion::{criterion_group, criterion_main, Criterion};
use icicle::test_bls12_381::{commit_batch_bls12_381, generate_random_points_bls12_381, set_up_scalars_bls12_381};
use icicle::utils::*;
#[cfg(feature = "g2")]
use icicle::{commit_batch_g2, field::ExtensionField};
use rustacuda::prelude::*;
const LOG_MSM_SIZES: [usize; 1] = [12];
const BATCH_SIZES: [usize; 2] = [128, 256];
fn bench_msm(c: &mut Criterion) {
let mut group = c.benchmark_group("MSM");
for log_msm_size in LOG_MSM_SIZES {
for batch_size in BATCH_SIZES {
let msm_size = 1 << log_msm_size;
let (scalars, _, _) = set_up_scalars_bls12_381(msm_size, 0, false);
let batch_scalars = vec![scalars; batch_size].concat();
let mut d_scalars = DeviceBuffer::from_slice(&batch_scalars[..]).unwrap();
let points = generate_random_points_bls12_381(msm_size, get_rng(None));
let batch_points = vec![points; batch_size].concat();
let mut d_points = DeviceBuffer::from_slice(&batch_points[..]).unwrap();
#[cfg(feature = "g2")]
let g2_points = generate_random_points::<ExtensionField>(msm_size, get_rng(None));
#[cfg(feature = "g2")]
let g2_batch_points = vec![g2_points; batch_size].concat();
#[cfg(feature = "g2")]
let mut d_g2_points = DeviceBuffer::from_slice(&g2_batch_points[..]).unwrap();
group
.sample_size(30)
.bench_function(
&format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch_bls12_381(&mut d_points, &mut d_scalars, batch_size)),
);
#[cfg(feature = "g2")]
group
.sample_size(10)
.bench_function(
&format!("G2 MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size)),
);
}
}
}
criterion_group!(msm_benches, bench_msm);
criterion_main!(msm_benches);

benches/ntt.rs (new file, 85 lines)

@@ -0,0 +1,85 @@
extern crate criterion;
use criterion::{criterion_group, criterion_main, Criterion};
use icicle::test_bls12_381::*;
const LOG_NTT_SIZES: [usize; 3] = [20, 9, 10];
const BATCH_SIZES: [usize; 3] = [1, 512, 1024];
fn bench_ntt(c: &mut Criterion) {
let mut group = c.benchmark_group("NTT");
for log_ntt_size in LOG_NTT_SIZES {
for batch_size in BATCH_SIZES {
let ntt_size = 1 << log_ntt_size;
if ntt_size * batch_size > 1 << 25 {
continue;
}
let scalar_samples = 20;
let (_, mut d_evals, mut d_domain) = set_up_scalars_bls12_381(ntt_size * batch_size, log_ntt_size, true);
group
.sample_size(scalar_samples)
.bench_function(
&format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| evaluate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size)),
);
group
.sample_size(scalar_samples)
.bench_function(
&format!("Scalar iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_scalars_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size)),
);
group
.sample_size(scalar_samples)
.bench_function(
&format!("Scalar inplace NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, false, 0)),
);
group
.sample_size(scalar_samples)
.bench_function(
&format!("Scalar inplace iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| ntt_inplace_batch_bls12_381(&mut d_evals, &mut d_domain, batch_size, true, 0)),
);
drop(d_evals);
drop(d_domain);
if ntt_size * batch_size > 1 << 18 {
continue;
}
let point_samples = 10;
let (_, mut d_points_evals, mut d_domain) =
set_up_points_bls12_381(ntt_size * batch_size, log_ntt_size, true);
group
.sample_size(point_samples)
.bench_function(
&format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size)),
);
group
.sample_size(point_samples)
.bench_function(
&format!("EC iNTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| evaluate_points_batch_bls12_381(&mut d_points_evals, &mut d_domain, batch_size)),
);
drop(d_points_evals);
drop(d_domain);
}
}
}
criterion_group!(ntt_benches, bench_ntt);
criterion_main!(ntt_benches);


@@ -1,34 +0,0 @@
[package]
name = "bls12-377"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
[dependencies]
icicle-core = { path = "../icicle-core" }
hex = "*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-377 = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"
rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
[dev-dependencies]
"criterion" = "0.4.0"
[features]
g2 = []


@@ -1,34 +0,0 @@
use std::env;
fn main() {
//TODO: check cargo features selected
//TODO: can conflict/duplicate with make ?
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=./icicle");
let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
let mut arch = String::from("-arch=");
arch.push_str(&arch_type);
let mut stream = String::from("-default-stream=");
stream.push_str(&stream_type);
let mut nvcc = cc::Build::new();
println!("Compiling icicle library using arch: {}", &arch);
if cfg!(feature = "g2") {
nvcc.define("G2_DEFINED", None);
}
nvcc.cuda(true);
nvcc.define("FEATURE_BLS12_377", None);
nvcc.debug(false);
nvcc.flag(&arch);
nvcc.flag(&stream);
nvcc.files([
"../icicle-cuda/curves/index.cu",
]);
nvcc.compile("ingo_icicle"); //TODO: extension??
}


@@ -1,4 +0,0 @@
pub trait Field<const NUM_LIMBS: usize> {
const MODOLUS: [u32;NUM_LIMBS];
const LIMBS: usize = NUM_LIMBS;
}


@@ -1,3 +0,0 @@
pub mod field;
pub mod scalar;
pub mod point;


@@ -1,106 +0,0 @@
use std::ffi::c_uint;
use ark_ec::AffineCurve;
use ark_ff::{BigInteger256, PrimeField};
use std::mem::transmute;
use ark_ff::Field;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use super::scalar::{get_fixed_limbs, self};
#[derive(Debug, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointT<BF: scalar::ScalarTrait> {
pub x: BF,
pub y: BF,
pub z: BF,
}
impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
fn default() -> Self {
PointT::zero()
}
}
impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
pub fn zero() -> Self {
PointT {
x: BF::zero(),
y: BF::one(),
z: BF::zero(),
}
}
pub fn infinity() -> Self {
Self::zero()
}
}
#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointAffineNoInfinityT<BF> {
pub x: BF,
pub y: BF,
}
impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
fn default() -> Self {
PointAffineNoInfinityT {
x: BF::zero(),
y: BF::zero(),
}
}
}
impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
///From u32 limbs x,y
pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
PointAffineNoInfinityT {
x: BF::from_limbs(x),
y: BF::from_limbs(y)
}
}
pub fn limbs(&self) -> Vec<u32> {
[self.x.limbs(), self.y.limbs()].concat()
}
pub fn to_projective(&self) -> PointT<BF> {
PointT {
x: self.x,
y: self.y,
z: BF::one(),
}
}
}
impl<BF: Copy + scalar::ScalarTrait> PointT<BF> {
pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
PointT {
x: BF::from_limbs(x),
y: BF::from_limbs(y),
z: BF::from_limbs(z)
}
}
pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
let l = value.len();
assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
PointT {
x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
}
}
pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
PointAffineNoInfinityT {
x: self.x,
y: self.y,
}
}
}


@@ -1,102 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination};
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use super::field::{Field, self};
pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
match val.len() {
n if n < NUM_LIMBS => {
let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
padded[..val.len()].copy_from_slice(&val);
padded
}
n if n == NUM_LIMBS => val.try_into().unwrap(),
_ => panic!("slice has too many elements"),
}
}
pub trait ScalarTrait{
fn base_limbs() -> usize;
fn zero() -> Self;
fn from_limbs(value: &[u32]) -> Self;
fn one() -> Self;
fn to_bytes_le(&self) -> Vec<u8>;
fn limbs(&self) -> &[u32];
}
#[derive(Debug, PartialEq, Clone, Copy)]
#[repr(C)]
pub struct ScalarT<M, const NUM_LIMBS: usize> {
pub(crate) phantom: PhantomData<M>,
pub(crate) value : [u32; NUM_LIMBS]
}
impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
where
M: Field<NUM_LIMBS>,
{
fn base_limbs() -> usize {
return NUM_LIMBS;
}
fn zero() -> Self {
ScalarT {
value: [0u32; NUM_LIMBS],
phantom: PhantomData,
}
}
fn from_limbs(value: &[u32]) -> Self {
Self {
value: get_fixed_limbs(value),
phantom: PhantomData,
}
}
fn one() -> Self {
let mut s = [0u32; NUM_LIMBS];
s[0] = 1;
ScalarT { value: s, phantom: PhantomData }
}
fn to_bytes_le(&self) -> Vec<u8> {
self.value
.iter()
.map(|s| s.to_le_bytes().to_vec())
.flatten()
.collect::<Vec<_>>()
}
fn limbs(&self) -> &[u32] {
&self.value
}
}
impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
Self::from_limbs(value)
}
pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
let mut value = value.to_vec();
value.reverse();
Self::from_limbs_le(&value)
}
// Additional Functions
pub fn add(&self, other:ScalarT<M, NUM_LIMBS>) -> ScalarT<M,NUM_LIMBS>{ // overload +
return ScalarT{value: [self.value[0] + other.value[0];NUM_LIMBS], phantom: PhantomData };
}
}


@@ -1,62 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
use std::marker::PhantomData;
use std::convert::TryInto;
use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
use crate::basic_structs::scalar::ScalarT;
use crate::basic_structs::field::Field;
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct ScalarField;
impl Field<8> for ScalarField {
const MODOLUS: [u32; 8] = [0x0;8];
}
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct BaseField;
impl Field<12> for BaseField {
const MODOLUS: [u32; 12] = [0x0;12];
}
pub type Scalar = ScalarT<ScalarField,8>;
impl Default for Scalar {
fn default() -> Self {
Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Scalar{}
pub type Base = ScalarT<BaseField,12>;
impl Default for Base {
fn default() -> Self {
Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Base{}
pub type Point = PointT<Base>;
pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
extern "C" {
fn eq(point1: *const Point, point2: *const Point) -> c_uint;
}
impl PartialEq for Point {
fn eq(&self, other: &Self) -> bool {
unsafe { eq(self, other) != 0 }
}
}


@@ -1,798 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
use crate::basic_structs::scalar::ScalarTrait;
use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bls12_377::{Fq as Fq_BLS12_377, Fr as Fr_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
extern "C" {
fn msm_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
count: usize,
device_id: usize,
) -> c_uint;
fn msm_batch_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
batch_size: usize,
msm_size: usize,
device_id: usize,
) -> c_uint;
fn commit_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
device_id: usize,
) -> c_uint;
fn commit_batch_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
batch_size: usize,
device_id: usize,
) -> c_uint;
fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ntt_batch_cuda(
inout: *mut Scalar,
arr_size: usize,
n: usize,
inverse: bool,
) -> c_int;
fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
fn interpolate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn interpolate_points_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_points_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn reverse_order_scalars_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_scalars_batch_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_cuda(
d_arr: DevicePointer<Point>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_batch_cuda(
d_arr: DevicePointer<Point>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn vec_mod_mult_point(
inout: *mut Point,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn vec_mod_mult_scalar(
inout: *mut Scalar,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn matrix_vec_mod_mult(
matrix_flattened: *const Scalar,
input: *const Scalar,
output: *mut Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
}
pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = Point::zero();
unsafe {
msm_cuda(
&mut ret as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
scalars.len(),
device_id,
)
};
ret
}
pub fn msm_batch(
points: &[PointAffineNoInfinity],
scalars: &[Scalar],
batch_size: usize,
device_id: usize,
) -> Vec<Point> {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = vec![Point::zero(); batch_size];
unsafe {
msm_batch_cuda(
&mut ret[0] as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
batch_size,
count / batch_size,
device_id,
)
};
ret
}
pub fn commit(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
) -> DeviceBox<Point> {
let mut res = DeviceBox::new(&Point::zero()).unwrap();
unsafe {
commit_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len(),
0,
);
}
return res;
}
pub fn commit_batch(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
unsafe {
commit_batch_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len() / batch_size,
batch_size,
0,
);
}
return res;
}
/// Compute an in-place NTT on the input data.
fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
let ret_code = unsafe {
ntt_cuda(
values as *mut _ as *mut Scalar,
values.len(),
inverse,
device_id,
)
};
ret_code
}
pub fn ntt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, false);
}
pub fn intt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, true);
}
/// Compute an in-place NTT on the input data.
fn ntt_internal_batch(
values: &mut [Scalar],
device_id: usize,
batch_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ntt_batch_cuda(
values as *mut _ as *mut Scalar,
values.len(),
batch_size,
inverse,
)
}
}
pub fn ntt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
ntt_internal_batch(values, 0, batch_size, false);
}
pub fn intt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
ntt_internal_batch(values, 0, batch_size, true);
}
/// Compute an in-place ECNTT on the input data.
fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
unsafe {
ecntt_cuda(
values as *mut _ as *mut Point,
values.len(),
inverse,
device_id,
)
}
}
pub fn ecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, false, device_id);
}
/// Compute an in-place iECNTT on the input data.
pub fn iecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, true, device_id);
}
/// Compute an in-place ECNTT on the input data.
fn ecntt_internal_batch(
values: &mut [Point],
device_id: usize,
batch_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ecntt_batch_cuda(
values as *mut _ as *mut Point,
values.len(),
batch_size,
inverse,
)
}
}
pub fn ecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
ecntt_internal_batch(values, 0, batch_size, false);
}
/// Compute an in-place iECNTT on the input data.
pub fn iecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
ecntt_internal_batch(values, 0, batch_size, true);
}
pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
unsafe {
DeviceBuffer::from_raw_parts(build_domain_cuda(
domain_size,
logn,
inverse,
0
), domain_size)
}
}
pub fn reverse_order_scalars(
d_scalars: &mut DeviceBuffer<Scalar>,
) {
unsafe { reverse_order_scalars_cuda(
d_scalars.as_device_ptr(),
d_scalars.len(),
0
); }
}
pub fn reverse_order_scalars_batch(
d_scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) {
unsafe { reverse_order_scalars_batch_cuda(
d_scalars.as_device_ptr(),
d_scalars.len() / batch_size,
batch_size,
0
); }
}
pub fn reverse_order_points(
d_points: &mut DeviceBuffer<Point>,
) {
unsafe { reverse_order_points_cuda(
d_points.as_device_ptr(),
d_points.len(),
0
); }
}
pub fn reverse_order_points_batch(
d_points: &mut DeviceBuffer<Point>,
batch_size: usize,
) {
unsafe { reverse_order_points_batch_cuda(
d_points.as_device_ptr(),
d_points.len() / batch_size,
batch_size,
0
); }
}
pub fn interpolate_scalars(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_scalars_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_scalars_batch(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_scalars_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
pub fn interpolate_points(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_points_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_points_batch(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_points_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
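// Example (illustrative sketch): interpolation expects its input in the order produced
// by the `reverse_order_*` kernels, mirroring the flow used in the interpolation tests.
pub fn example_interpolate_scalars(
d_evals: &mut DeviceBuffer<Scalar>,
d_domain_inv: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
reverse_order_scalars(d_evals);
interpolate_scalars(d_evals, d_domain_inv)
}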
pub fn evaluate_scalars(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_scalars_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_points(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_points_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
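// Example (illustrative sketch): evaluations on a size-n subgroup plus evaluations on
// its coset together cover a size-2n domain, which is exactly what the coset tests
// below check against a direct evaluation on the larger domain.
pub fn example_subgroup_plus_coset(
d_coeffs: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
d_coset_powers: &mut DeviceBuffer<Scalar>,
) -> (DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
let d_evals = evaluate_scalars(d_coeffs, d_domain);
let d_evals_coset = evaluate_scalars_on_coset(d_coeffs, d_domain, d_coset_powers);
(d_evals, d_evals_coset)
}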
pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_point(
a as *mut _ as *mut Point,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_scalar(
a as *mut _ as *mut Scalar,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
// Multiply a flattened square matrix by a vector:
// `a` - row-major flattened matrix with `b.len() * b.len()` elements;
// `b` - the vector to multiply `a` by;
pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
let mut c = vec![Scalar::zero(); b.len()];
unsafe {
matrix_vec_mod_mult(
a as *const _ as *const Scalar,
b as *const _ as *const Scalar,
c.as_mut_slice() as *mut _ as *mut Scalar,
b.len(),
device_id,
);
}
c
}
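// Example (illustrative sketch): the flattened matrix must hold `b.len() * b.len()`
// entries in row-major order; multiplying by the identity returns the vector itself.
pub fn example_identity_matrix_vec(v: &[Scalar]) -> Vec<Scalar> {
let n = v.len();
let mut identity = vec![Scalar::zero(); n * n];
for i in 0..n {
identity[i * n + i] = Scalar::one();
}
mult_matrix_by_vec(&identity, v, 0)
}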
pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
unsafe { buf_cpy.copy_from(buf) };
return buf_cpy;
}
pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
let rng: Box<dyn RngCore> = match seed {
Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
None => Box::new(rand::thread_rng()),
};
rng
}
fn set_up_device() {
// Set up the context, load the module, and create a stream to run kernels in.
rustacuda::init(CudaFlags::empty()).unwrap();
let device = Device::get_device(0).unwrap();
let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
}
pub fn generate_random_points(
count: usize,
mut rng: Box<dyn RngCore>,
) -> Vec<PointAffineNoInfinity> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BLS12_377::rand(&mut rng)).to_xy_strip_z())
.collect()
}
pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BLS12_377::rand(&mut rng)))
.collect()
}
pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
(0..count)
.map(|_| Scalar::from_ark(Fr_BLS12_377::rand(&mut rng).into_repr()))
.collect()
}
pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated set-ups yield equal points
let vector = generate_random_points_proj(test_size, get_rng(seed));
let d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
(vector, d_vector, d_domain)
}
pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated set-ups yield equal scalars
let vector = generate_random_scalars(test_size, get_rng(seed));
let d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
(vector, d_vector, d_domain)
}

@@ -1,4 +0,0 @@
pub mod test_bls12_377;
pub mod basic_structs;
pub mod from_cuda;
pub mod curve_structs;

@@ -1,816 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
pub use crate::basic_structs::scalar::ScalarTrait;
pub use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bls12_377::{Fq as Fq_BLS12_377, Fr as Fr_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
impl Scalar {
pub fn to_biginteger256(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn to_ark(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_biginteger256(ark: BigInteger256) -> Self {
Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
}
pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
unsafe { transmute(*self) }
}
pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
}
pub fn to_ark_transmute(&self) -> Fr_BLS12_377 {
unsafe { std::mem::transmute(*self) }
}
pub fn from_ark_transmute(v: &Fr_BLS12_377) -> Scalar {
unsafe { std::mem::transmute_copy(v) }
}
pub fn to_ark_mod_p(&self) -> Fr_BLS12_377 {
Fr_BLS12_377::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
}
pub fn to_ark_repr(&self) -> Fr_BLS12_377 {
Fr_BLS12_377::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
}
pub fn from_ark(v: BigInteger256) -> Scalar {
Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
}
}
impl Base {
pub fn to_ark(&self) -> BigInteger384 {
BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_ark(ark: BigInteger384) -> Self {
Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
}
}
impl Point {
pub fn to_ark(&self) -> G1Projective_BLS12_377 {
self.to_ark_affine().into_projective()
}
pub fn to_ark_affine(&self) -> G1Affine_BLS12_377 {
//TODO: generic conversion
use ark_ff::Field;
use std::ops::Mul;
let proj_x_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.x.to_bytes_le());
let proj_y_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.y.to_bytes_le());
let proj_z_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.z.to_bytes_le());
let inverse_z = proj_z_field.inverse().unwrap();
let aff_x = proj_x_field.mul(inverse_z);
let aff_y = proj_y_field.mul(inverse_z);
G1Affine_BLS12_377::new(aff_x, aff_y, false)
}
pub fn from_ark(ark: G1Projective_BLS12_377) -> Point {
use ark_ff::Field;
let z_inv = ark.z.inverse().unwrap();
let z_invsq = z_inv * z_inv;
let z_invq3 = z_invsq * z_inv;
Point {
x: Base::from_ark((ark.x * z_invsq).into_repr()),
y: Base::from_ark((ark.y * z_invq3).into_repr()),
z: Base::one(),
}
}
}
impl PointAffineNoInfinity {
pub fn to_ark(&self) -> G1Affine_BLS12_377 {
G1Affine_BLS12_377::new(Fq_BLS12_377::new(self.x.to_ark()), Fq_BLS12_377::new(self.y.to_ark()), false)
}
pub fn to_ark_repr(&self) -> G1Affine_BLS12_377 {
G1Affine_BLS12_377::new(
Fq_BLS12_377::from_repr(self.x.to_ark()).unwrap(),
Fq_BLS12_377::from_repr(self.y.to_ark()).unwrap(),
false,
)
}
pub fn from_ark(p: &G1Affine_BLS12_377) -> Self {
PointAffineNoInfinity {
x: Base::from_ark(p.x.into_repr()),
y: Base::from_ark(p.y.into_repr()),
}
}
}
impl Point {
pub fn to_affine(&self) -> PointAffineNoInfinity {
let ark_affine = self.to_ark_affine();
PointAffineNoInfinity {
x: Base::from_ark(ark_affine.x.into_repr()),
y: Base::from_ark(ark_affine.y.into_repr()),
}
}
}
#[cfg(test)]
pub(crate) mod tests_bls12_377 {
use std::ops::Add;
use ark_bls12_377::{Fr, G1Affine, G1Projective};
use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
use ark_ff::{FftField, Field, Zero, PrimeField};
use ark_std::UniformRand;
use rustacuda::prelude::{DeviceBuffer, CopyDestination};
use crate::curve_structs::{Point, Scalar, Base};
use crate::basic_structs::scalar::ScalarTrait;
use crate::from_cuda::{generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars, commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch, ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch, set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch, interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars, evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset, mult_matrix_by_vec, mult_sc_vec, multp_vec,evaluate_scalars_batch, evaluate_points_batch, evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch};
fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
let mut rng = ark_std::rand::thread_rng();
let mut points_ga: Vec<G1Projective> = Vec::new();
for _ in 0..nof_elements {
let aff = G1Projective::rand(&mut rng);
points_ga.push(aff);
}
points_ga
}
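// Naive O(n^2) reference ECNTT: result[k] = sum_l rou^(k*l) * points[l]; the inverse
// transform uses the inverse root of unity and additionally scales each output by 1/n.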
fn ecntt_arc_naive(
points: &Vec<G1Projective>,
size: usize,
inverse: bool,
) -> Vec<G1Projective> {
let mut result: Vec<G1Projective> = Vec::new();
for _ in 0..size {
result.push(G1Projective::zero());
}
let rou = if !inverse {
Fr::get_root_of_unity(size).unwrap()
} else {
Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap()
};
for k in 0..size {
for l in 0..size {
let pow: [u64; 1] = [(l * k).try_into().unwrap()];
let mul_rou = Fr::pow(&rou, &pow);
result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
}
}
if inverse {
let size2 = size as u64;
for k in 0..size {
let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
result[k] = result[k].into_affine().mul(multfactor);
}
}
return result;
}
fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
let mut eq = true;
for i in 0..points.len() {
if points2[i].ne(&points[i]) {
eq = false;
break;
}
}
return eq;
}
fn test_naive_ark_ecntt(size: usize) {
let points = random_points_ark_proj(size);
let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
assert!(!check_eq(&result2, &result1));
assert!(check_eq(&result2, &points));
}
#[test]
fn test_msm() {
let test_sizes = [6, 9];
for pow2 in test_sizes {
let count = 1 << pow2;
let seed = None; // set Some to provide seed
let points = generate_random_points(count, get_rng(seed));
let scalars = generate_random_scalars(count, get_rng(seed));
let msm_result = msm(&points, &scalars, 0);
let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
assert_eq!(msm_result.to_ark(), msm_result_ark);
assert_eq!(
msm_result.to_ark_affine(),
Point::from_ark(msm_result_ark).to_ark_affine()
);
}
}
#[test]
fn test_batch_msm() {
for batch_pow2 in [2, 4] {
for pow2 in [4, 6] {
let msm_size = 1 << pow2;
let batch_size = 1 << batch_pow2;
let seed = None; // set Some to provide seed
let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
let expected: Vec<_> = point_r_ark
.chunks(msm_size)
.zip(scalars_r_ark.chunks(msm_size))
.map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
.collect();
let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
assert_eq!(result, expected);
}
}
}
#[test]
fn test_commit() {
let test_size = 1 << 8;
let seed = Some(0);
let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
let mut points = generate_random_points(test_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm(&points, &scalars, 0);
let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
let mut h_commit_result = Point::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
assert_eq!(msm_result, h_commit_result);
assert_ne!(msm_result, Point::zero());
assert_ne!(h_commit_result, Point::zero());
}
#[test]
fn test_batch_commit() {
let batch_size = 4;
let test_size = 1 << 12;
let seed = Some(0);
let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
let points = generate_random_points(test_size * batch_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm_batch(&points, &scalars, batch_size, 0);
let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
assert_eq!(msm_result, h_commit_result);
for h in h_commit_result {
assert_ne!(h, Point::zero());
}
}
#[test]
fn test_ntt() {
//NTT
let seed = None; //some value to fix the rng
let test_size = 1 << 3;
let scalars = generate_random_scalars(test_size, get_rng(seed));
let mut ntt_result = scalars.clone();
ntt(&mut ntt_result, 0);
assert_ne!(ntt_result, scalars);
let mut intt_result = ntt_result.clone();
intt(&mut intt_result, 0);
assert_eq!(intt_result, scalars);
//ECNTT
let points_proj = generate_random_points_proj(test_size, get_rng(seed));
test_naive_ark_ecntt(test_size);
assert!(points_proj[0].to_ark().into_affine().is_on_curve());
//naive ark
let points_proj_ark = points_proj
.iter()
.map(|p| p.to_ark())
.collect::<Vec<G1Projective>>();
let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
assert_eq!(points_proj_ark, iecntt_result_naive);
//ingo gpu
let mut ecntt_result = points_proj.to_vec();
ecntt(&mut ecntt_result, 0);
assert_ne!(ecntt_result, points_proj);
let mut iecntt_result = ecntt_result.clone();
iecntt(&mut iecntt_result, 0);
assert_eq!(
iecntt_result_naive,
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
assert_eq!(
iecntt_result
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>(),
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
}
#[test]
fn test_ntt_batch() {
//NTT
let seed = None; //some value to fix the rng
let test_size = 1 << 5;
let batches = 4;
let scalars_batch: Vec<Scalar> =
generate_random_scalars(test_size * batches, get_rng(seed));
let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
for i in 0..batches {
scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result = scalars_batch.clone();
// do batch ntt
ntt_batch(&mut ntt_result, test_size, 0);
let mut ntt_result_vec_of_vec = Vec::new();
// do ntt for every chunk
for i in 0..batches {
ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
ntt(&mut ntt_result_vec_of_vec[i], 0);
}
// check that the ntt of each vec of scalars is equal to the ntt of the corresponding batch
for i in 0..batches {
assert_eq!(
ntt_result_vec_of_vec[i],
ntt_result[i * test_size..(i + 1) * test_size]
);
}
// check that ntt output is different from input
assert_ne!(ntt_result, scalars_batch);
let mut intt_result = ntt_result.clone();
// do batch intt
intt_batch(&mut intt_result, test_size, 0);
let mut intt_result_vec_of_vec = Vec::new();
// do intt for every chunk
for i in 0..batches {
intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
intt(&mut intt_result_vec_of_vec[i], 0);
}
// check that the intt of each vec of scalars is equal to the intt of the specific batch
for i in 0..batches {
assert_eq!(
intt_result_vec_of_vec[i],
intt_result[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result, scalars_batch);
// //ECNTT
let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
for i in 0..batches {
points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result_points = points_proj.clone();
// do batch ecntt
ecntt_batch(&mut ntt_result_points, test_size, 0);
let mut ntt_result_points_vec_of_vec = Vec::new();
for i in 0..batches {
ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
}
for i in 0..batches {
assert_eq!(
ntt_result_points_vec_of_vec[i],
ntt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_ne!(ntt_result_points, points_proj);
let mut intt_result_points = ntt_result_points.clone();
// do batch iecntt
iecntt_batch(&mut intt_result_points, test_size, 0);
let mut intt_result_points_vec_of_vec = Vec::new();
// do iecntt for every chunk
for i in 0..batches {
intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
iecntt(&mut intt_result_points_vec_of_vec[i], 0);
}
// check that the iecntt of each vec of points is equal to the iecntt of the corresponding batch
for i in 0..batches {
assert_eq!(
intt_result_points_vec_of_vec[i],
intt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result_points, points_proj);
}
#[test]
fn test_scalar_interpolation() {
let log_test_size = 7;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
reverse_order_scalars(&mut d_evals);
let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
intt(&mut evals_mut, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_scalar_batch_interpolation() {
let batch_size = 4;
let log_test_size = 10;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
reverse_order_scalars_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
intt_batch(&mut evals_mut, test_size, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_point_interpolation() {
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
reverse_order_points(&mut d_evals);
let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
iecntt(&mut evals_mut[..], 0);
let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_point_batch_interpolation() {
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
reverse_order_points_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
iecntt_batch(&mut evals_mut[..], test_size, 0);
let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_scalar_evaluation() {
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
for i in coeff_size.. (1 << log_test_domain_size) {
assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
}
}
#[test]
fn test_scalar_batch_evaluation() {
let batch_size = 6;
let log_test_domain_size = 8;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
for i in coeff_size..domain_size {
assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
}
}
}
#[test]
fn test_point_evaluation() {
let log_test_domain_size = 7;
let coeff_size = 1 << 7;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
for i in coeff_size..(1 << log_test_domain_size) {
assert_eq!(Point::zero(), h_coeffs_domain[i]);
}
for i in 0..coeff_size {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation() {
let batch_size = 4;
let log_test_domain_size = 6;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 5;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
for i in coeff_size..domain_size {
assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
}
for i in j * domain_size..(j * domain_size + coeff_size) {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
}
#[test]
fn test_scalar_evaluation_on_trivial_coset() {
// checks that the evaluations on the subgroup are the same as on the coset generated by 1
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_coeffs: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_coeffs[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_coeffs, h_evals_coset);
}
#[test]
fn test_scalar_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
}
#[test]
fn test_scalar_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
}
#[test]
fn test_point_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
for i in 0..test_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 2;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
for i in 0..test_size * batch_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
// testing matrix multiplication by comparing the result of FFT with the naive multiplication by the DFT matrix
#[test]
fn test_matrix_multiplication() {
let seed = None; // some value to fix the rng
let test_size = 1 << 5;
let rou = Fr::get_root_of_unity(test_size).unwrap();
let matrix_flattened: Vec<Scalar> = (0..test_size).map(
|row_num| { (0..test_size).map(
|col_num| {
let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
}).collect::<Vec<Scalar>>()
}).flatten().collect::<Vec<_>>();
let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
let mut ntt_result = vector.clone();
ntt(&mut ntt_result, 0);
// we don't use the same roots of unity as arkworks, so the results are permutations
// of one another and the only guaranteed fixed scalars are the following ones:
assert_eq!(result[0], ntt_result[0]);
assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_scalar_mul() {
let mut inout = [Scalar::one(), Scalar::one(), Scalar::zero()];
let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
mult_sc_vec(&mut inout, &expected, 0);
assert_eq!(inout, expected);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_point_mul() {
let dummy_one = Point {
x: Base::one(),
y: Base::one(),
z: Base::one(),
};
let mut inout = [dummy_one, dummy_one, Point::zero()];
let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
let expected = [dummy_one, Point::zero(), Point::zero()];
multp_vec(&mut inout, &scalars, 0);
assert_eq!(inout, expected);
}
}

@@ -1,34 +0,0 @@
[package]
name = "bls12-381"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
[dependencies]
icicle-core = { path = "../icicle-core" }
hex = "*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"
rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
[dev-dependencies]
"criterion" = "0.4.0"
[features]
g2 = []

@@ -1,36 +0,0 @@
use std::env;
fn main() {
//TODO: check cargo features selected
//TODO: can conflict/duplicate with make ?
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=./icicle");
let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
let mut arch = String::from("-arch=");
arch.push_str(&arch_type);
let mut stream = String::from("-default-stream=");
stream.push_str(&stream_type);
let mut nvcc = cc::Build::new();
println!("Compiling icicle library using arch: {}", &arch);
if cfg!(feature = "g2") {
nvcc.define("G2_DEFINED", None);
}
nvcc.cuda(true);
nvcc.define("FEATURE_BLS12_381", None);
nvcc.debug(false);
nvcc.flag(&arch);
nvcc.flag(&stream);
nvcc.shared_flag(false);
// nvcc.static_flag(true);
nvcc.files([
"../icicle-cuda/curves/index.cu",
]);
nvcc.compile("ingo_icicle"); //TODO: extension??
}

@@ -1,4 +0,0 @@
pub trait Field<const NUM_LIMBS: usize> {
const MODULUS: [u32; NUM_LIMBS];
const LIMBS: usize = NUM_LIMBS;
}

@@ -1,3 +0,0 @@
pub mod field;
pub mod scalar;
pub mod point;

@@ -1,106 +0,0 @@
use std::ffi::c_uint;
use ark_ec::AffineCurve;
use ark_ff::{BigInteger256, PrimeField};
use std::mem::transmute;
use ark_ff::Field;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use super::scalar::{get_fixed_limbs, self};
#[derive(Debug, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointT<BF: scalar::ScalarTrait> {
pub x: BF,
pub y: BF,
pub z: BF,
}
impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
fn default() -> Self {
PointT::zero()
}
}
impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
pub fn zero() -> Self {
PointT {
x: BF::zero(),
y: BF::one(),
z: BF::zero(),
}
}
pub fn infinity() -> Self {
Self::zero()
}
}
#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointAffineNoInfinityT<BF> {
pub x: BF,
pub y: BF,
}
impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
fn default() -> Self {
PointAffineNoInfinityT {
x: BF::zero(),
y: BF::zero(),
}
}
}
impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
///From u32 limbs x,y
pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
PointAffineNoInfinityT {
x: BF::from_limbs(x),
y: BF::from_limbs(y)
}
}
pub fn limbs(&self) -> Vec<u32> {
[self.x.limbs(), self.y.limbs()].concat()
}
pub fn to_projective(&self) -> PointT<BF> {
PointT {
x: self.x,
y: self.y,
z: BF::one(),
}
}
}
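// Example (illustrative sketch): lifting an affine point to projective sets z = 1, and
// `to_xy_strip_z` (below) undoes it, so the two conversions round-trip for any point
// not at infinity.
pub fn example_affine_projective_round_trip<BF>(p: PointAffineNoInfinityT<BF>)
where
BF: Copy + PartialEq + std::fmt::Debug + scalar::ScalarTrait,
{
let projective = p.to_projective();
assert_eq!(projective.to_xy_strip_z(), p);
}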
impl<BF: Copy + scalar::ScalarTrait> PointT<BF> {
pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
PointT {
x: BF::from_limbs(x),
y: BF::from_limbs(y),
z: BF::from_limbs(z)
}
}
pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
let l = value.len();
assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
PointT {
x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
}
}
pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
PointAffineNoInfinityT {
x: self.x,
y: self.y,
}
}
}

@@ -1,102 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination};
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use super::field::{Field, self};
pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
match val.len() {
n if n < NUM_LIMBS => {
let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
padded[..val.len()].copy_from_slice(&val);
padded
}
n if n == NUM_LIMBS => val.try_into().unwrap(),
_ => panic!("slice has too many elements"),
}
}
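// Example (illustrative sketch): shorter slices are zero-padded up to NUM_LIMBS,
// exact-length slices pass through unchanged, and longer slices panic.
pub fn example_get_fixed_limbs() {
let padded: [u32; 4] = get_fixed_limbs(&[1, 2]);
assert_eq!(padded, [1, 2, 0, 0]);
}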
pub trait ScalarTrait{
fn base_limbs() -> usize;
fn zero() -> Self;
fn from_limbs(value: &[u32]) -> Self;
fn one() -> Self;
fn to_bytes_le(&self) -> Vec<u8>;
fn limbs(&self) -> &[u32];
}
#[derive(Debug, PartialEq, Clone, Copy)]
#[repr(C)]
pub struct ScalarT<M, const NUM_LIMBS: usize> {
pub(crate) phantom: PhantomData<M>,
pub(crate) value : [u32; NUM_LIMBS]
}
impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
where
M: Field<NUM_LIMBS>,
{
fn base_limbs() -> usize {
return NUM_LIMBS;
}
fn zero() -> Self {
ScalarT {
value: [0u32; NUM_LIMBS],
phantom: PhantomData,
}
}
fn from_limbs(value: &[u32]) -> Self {
Self {
value: get_fixed_limbs(value),
phantom: PhantomData,
}
}
fn one() -> Self {
let mut s = [0u32; NUM_LIMBS];
s[0] = 1;
ScalarT { value: s, phantom: PhantomData }
}
fn to_bytes_le(&self) -> Vec<u8> {
self.value
.iter()
.map(|s| s.to_le_bytes().to_vec())
.flatten()
.collect::<Vec<_>>()
}
fn limbs(&self) -> &[u32] {
&self.value
}
}
impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
Self::from_limbs(value)
}
pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
let mut value = value.to_vec();
value.reverse();
Self::from_limbs_le(&value)
}
// Additional functions
// Limb-wise addition with carry propagation; no modular reduction is performed.
pub fn add(&self, other: ScalarT<M, NUM_LIMBS>) -> ScalarT<M, NUM_LIMBS> { // overload +
let mut value = [0u32; NUM_LIMBS];
let mut carry = 0u64;
for (i, limb) in value.iter_mut().enumerate() {
let sum = self.value[i] as u64 + other.value[i] as u64 + carry;
*limb = sum as u32;
carry = sum >> 32;
}
ScalarT { value, phantom: PhantomData }
}
}

@@ -1,62 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
use std::marker::PhantomData;
use std::convert::TryInto;
use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
use crate::basic_structs::scalar::ScalarT;
use crate::basic_structs::field::Field;
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct ScalarField;
impl Field<8> for ScalarField {
const MODULUS: [u32; 8] = [0x0; 8];
}
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct BaseField;
impl Field<12> for BaseField {
const MODULUS: [u32; 12] = [0x0; 12];
}
pub type Scalar = ScalarT<ScalarField,8>;
impl Default for Scalar {
fn default() -> Self {
Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Scalar{}
pub type Base = ScalarT<BaseField,12>;
impl Default for Base {
fn default() -> Self {
Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Base{}
pub type Point = PointT<Base>;
pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
extern "C" {
fn eq(point1: *const Point, point2: *const Point) -> c_uint;
}
impl PartialEq for Point {
fn eq(&self, other: &Self) -> bool {
unsafe { eq(self, other) != 0 }
}
}

@@ -1,798 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
use crate::basic_structs::scalar::ScalarTrait;
use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bls12_381::{Fq as Fq_BLS12_381, Fr as Fr_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
extern "C" {
fn msm_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
count: usize,
device_id: usize,
) -> c_uint;
fn msm_batch_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
batch_size: usize,
msm_size: usize,
device_id: usize,
) -> c_uint;
fn commit_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
device_id: usize,
) -> c_uint;
fn commit_batch_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
batch_size: usize,
device_id: usize,
) -> c_uint;
fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ntt_batch_cuda(
inout: *mut Scalar,
arr_size: usize,
n: usize,
inverse: bool,
) -> c_int;
fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
fn interpolate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn interpolate_points_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_points_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn reverse_order_scalars_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_scalars_batch_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_cuda(
d_arr: DevicePointer<Point>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_batch_cuda(
d_arr: DevicePointer<Point>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn vec_mod_mult_point(
inout: *mut Point,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn vec_mod_mult_scalar(
inout: *mut Scalar,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn matrix_vec_mod_mult(
matrix_flattened: *const Scalar,
input: *const Scalar,
output: *mut Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
}
pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = Point::zero();
unsafe {
msm_cuda(
&mut ret as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
scalars.len(),
device_id,
)
};
ret
}
pub fn msm_batch(
points: &[PointAffineNoInfinity],
scalars: &[Scalar],
batch_size: usize,
device_id: usize,
) -> Vec<Point> {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = vec![Point::zero(); batch_size];
unsafe {
msm_batch_cuda(
&mut ret[0] as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
batch_size,
count / batch_size,
device_id,
)
};
ret
}
pub fn commit(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
) -> DeviceBox<Point> {
let mut res = DeviceBox::new(&Point::zero()).unwrap();
unsafe {
commit_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len(),
0,
);
}
return res;
}
pub fn commit_batch(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
unsafe {
commit_batch_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len() / batch_size,
batch_size,
0,
);
}
return res;
}
/// Compute an in-place NTT on the input data.
fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
let ret_code = unsafe {
ntt_cuda(
values as *mut _ as *mut Scalar,
values.len(),
inverse,
device_id,
)
};
ret_code
}
pub fn ntt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, false);
}
pub fn intt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, true);
}
/// Compute in-place batched NTTs on the input data: `values` holds contiguous
/// vectors of `ntt_size` scalars each.
fn ntt_internal_batch(
values: &mut [Scalar],
_device_id: usize, // currently unused: `ntt_batch_cuda` takes no device id
ntt_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ntt_batch_cuda(
values as *mut _ as *mut Scalar,
values.len(),
ntt_size,
inverse,
)
}
}
pub fn ntt_batch(values: &mut [Scalar], ntt_size: usize, device_id: usize) {
ntt_internal_batch(values, device_id, ntt_size, false);
}
pub fn intt_batch(values: &mut [Scalar], ntt_size: usize, device_id: usize) {
ntt_internal_batch(values, device_id, ntt_size, true);
}
/// Compute an in-place ECNTT on the input data.
fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
unsafe {
ecntt_cuda(
values as *mut _ as *mut Point,
values.len(),
inverse,
device_id,
)
}
}
pub fn ecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, false, device_id);
}
/// Compute an in-place iECNTT on the input data.
pub fn iecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, true, device_id);
}
/// Compute in-place batched ECNTTs on the input data: `values` holds contiguous
/// vectors of `ntt_size` points each.
fn ecntt_internal_batch(
values: &mut [Point],
_device_id: usize, // currently unused: `ecntt_batch_cuda` takes no device id
ntt_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ecntt_batch_cuda(
values as *mut _ as *mut Point,
values.len(),
ntt_size,
inverse,
)
}
}
pub fn ecntt_batch(values: &mut [Point], ntt_size: usize, device_id: usize) {
ecntt_internal_batch(values, device_id, ntt_size, false);
}
/// Compute an in-place batched iECNTT on the input data.
pub fn iecntt_batch(values: &mut [Point], ntt_size: usize, device_id: usize) {
ecntt_internal_batch(values, device_id, ntt_size, true);
}
pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
unsafe {
DeviceBuffer::from_raw_parts(build_domain_cuda(
domain_size,
logn,
inverse,
0
), domain_size)
}
}
pub fn reverse_order_scalars(
d_scalars: &mut DeviceBuffer<Scalar>,
) {
unsafe { reverse_order_scalars_cuda(
d_scalars.as_device_ptr(),
d_scalars.len(),
0
); }
}
pub fn reverse_order_scalars_batch(
d_scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) {
unsafe { reverse_order_scalars_batch_cuda(
d_scalars.as_device_ptr(),
d_scalars.len() / batch_size,
batch_size,
0
); }
}
pub fn reverse_order_points(
d_points: &mut DeviceBuffer<Point>,
) {
unsafe { reverse_order_points_cuda(
d_points.as_device_ptr(),
d_points.len(),
0
); }
}
pub fn reverse_order_points_batch(
d_points: &mut DeviceBuffer<Point>,
batch_size: usize,
) {
unsafe { reverse_order_points_batch_cuda(
d_points.as_device_ptr(),
d_points.len() / batch_size,
batch_size,
0
); }
}
pub fn interpolate_scalars(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_scalars_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_scalars_batch(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_scalars_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
pub fn interpolate_points(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_points_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_points_batch(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_points_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
pub fn evaluate_scalars(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_scalars_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_points(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_points_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_point(
a as *mut _ as *mut Point,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_scalar(
a as *mut _ as *mut Scalar,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
// Multiply a flattened matrix by a vector:
// `a` - row-major flattened matrix;
// `b` - vector to multiply `a` by;
pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
let mut c = vec![Scalar::zero(); b.len()];
unsafe {
matrix_vec_mod_mult(
a as *const _ as *const Scalar,
b as *const _ as *const Scalar,
c.as_mut_slice() as *mut _ as *mut Scalar,
b.len(),
device_id,
);
}
c
}
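// Illustrative sketch (not from the original source): with `a` laid out as a
// row-major n x n matrix, each output element is the modular dot product of a
// row of `a` with `b` (test_matrix_multiplication below uses the DFT matrix).
// For the 2x2 identity matrix the input vector comes back unchanged:
// let a = vec![Scalar::one(), Scalar::zero(), Scalar::zero(), Scalar::one()];
// let b = generate_random_scalars(2, get_rng(None));
// assert_eq!(mult_matrix_by_vec(&a, &b, 0), b);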
pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
unsafe { buf_cpy.copy_from(buf).unwrap() };
return buf_cpy;
}
pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
let rng: Box<dyn RngCore> = match seed {
Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
None => Box::new(rand::thread_rng()),
};
rng
}
fn set_up_device() {
// Set up the context, load the module, and create a stream to run kernels in.
rustacuda::init(CudaFlags::empty()).unwrap();
let device = Device::get_device(0).unwrap();
let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
}
pub fn generate_random_points(
count: usize,
mut rng: Box<dyn RngCore>,
) -> Vec<PointAffineNoInfinity> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BLS12_381::rand(&mut rng)).to_xy_strip_z())
.collect()
}
pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BLS12_381::rand(&mut rng)))
.collect()
}
pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
(0..count)
.map(|_| Scalar::from_ark(Fr_BLS12_381::rand(&mut rng).into_repr()))
.collect()
}
pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated calls yield equal scalars
let vector = generate_random_points_proj(test_size, get_rng(seed));
let vector_mut = vector.clone();
let d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
(vector_mut, d_vector, d_domain)
}
pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated calls yield equal scalars
let vector_mut = generate_random_scalars(test_size, get_rng(seed));
let d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
(vector_mut, d_vector, d_domain)
}


@@ -1,4 +0,0 @@
pub mod test_bls12_381;
pub mod basic_structs;
pub mod from_cuda;
pub mod curve_structs;


@@ -1,816 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
pub use crate::basic_structs::scalar::ScalarTrait;
pub use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bls12_381::{Fq as Fq_BLS12_381, Fr as Fr_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
impl Scalar {
pub fn to_biginteger254(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn to_ark(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_biginteger256(ark: BigInteger256) -> Self {
Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
}
pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
unsafe { transmute(*self) }
}
pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
}
pub fn to_ark_transmute(&self) -> Fr_BLS12_381 {
unsafe { std::mem::transmute(*self) }
}
pub fn from_ark_transmute(v: &Fr_BLS12_381) -> Scalar {
unsafe { std::mem::transmute_copy(v) }
}
pub fn to_ark_mod_p(&self) -> Fr_BLS12_381 {
Fr_BLS12_381::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
}
pub fn to_ark_repr(&self) -> Fr_BLS12_381 {
Fr_BLS12_381::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
}
pub fn from_ark(v: BigInteger256) -> Scalar {
Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
}
}
impl Base {
pub fn to_ark(&self) -> BigInteger384 {
BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_ark(ark: BigInteger384) -> Self {
Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
}
}
impl Point {
pub fn to_ark(&self) -> G1Projective_BLS12_381 {
self.to_ark_affine().into_projective()
}
pub fn to_ark_affine(&self) -> G1Affine_BLS12_381 {
//TODO: generic conversion
use ark_ff::Field;
use std::ops::Mul;
let proj_x_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.x.to_bytes_le());
let proj_y_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.y.to_bytes_le());
let proj_z_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.z.to_bytes_le());
let inverse_z = proj_z_field.inverse().unwrap();
let aff_x = proj_x_field.mul(inverse_z);
let aff_y = proj_y_field.mul(inverse_z);
G1Affine_BLS12_381::new(aff_x, aff_y, false)
}
pub fn from_ark(ark: G1Projective_BLS12_381) -> Point {
use ark_ff::Field;
let z_inv = ark.z.inverse().unwrap();
let z_invsq = z_inv * z_inv;
let z_invq3 = z_invsq * z_inv;
Point {
x: Base::from_ark((ark.x * z_invsq).into_repr()),
y: Base::from_ark((ark.y * z_invq3).into_repr()),
z: Base::one(),
}
}
}
impl PointAffineNoInfinity {
pub fn to_ark(&self) -> G1Affine_BLS12_381 {
G1Affine_BLS12_381::new(Fq_BLS12_381::new(self.x.to_ark()), Fq_BLS12_381::new(self.y.to_ark()), false)
}
pub fn to_ark_repr(&self) -> G1Affine_BLS12_381 {
G1Affine_BLS12_381::new(
Fq_BLS12_381::from_repr(self.x.to_ark()).unwrap(),
Fq_BLS12_381::from_repr(self.y.to_ark()).unwrap(),
false,
)
}
pub fn from_ark(p: &G1Affine_BLS12_381) -> Self {
PointAffineNoInfinity {
x: Base::from_ark(p.x.into_repr()),
y: Base::from_ark(p.y.into_repr()),
}
}
}
impl Point {
pub fn to_affine(&self) -> PointAffineNoInfinity {
let ark_affine = self.to_ark_affine();
PointAffineNoInfinity {
x: Base::from_ark(ark_affine.x.into_repr()),
y: Base::from_ark(ark_affine.y.into_repr()),
}
}
}
#[cfg(test)]
pub(crate) mod tests_bls12_381 {
use std::ops::Add;
use ark_bls12_381::{Fr, G1Affine, G1Projective};
use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
use ark_ff::{FftField, Field, Zero, PrimeField};
use ark_std::UniformRand;
use rustacuda::prelude::{DeviceBuffer, CopyDestination};
use crate::curve_structs::{Point, Scalar, Base};
use crate::basic_structs::scalar::ScalarTrait;
use crate::from_cuda::{generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars, commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch, ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch, set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch, interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars, evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset, mult_matrix_by_vec, mult_sc_vec, multp_vec,evaluate_scalars_batch, evaluate_points_batch, evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch};
fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
let mut rng = ark_std::rand::thread_rng();
(0..nof_elements).map(|_| G1Projective::rand(&mut rng)).collect()
}
fn ecntt_arc_naive(
points: &Vec<G1Projective>,
size: usize,
inverse: bool,
) -> Vec<G1Projective> {
let mut result = vec![G1Projective::zero(); size];
let rou = if inverse {
Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap()
} else {
Fr::get_root_of_unity(size).unwrap()
};
for k in 0..size {
for l in 0..size {
let pow: [u64; 1] = [(l * k).try_into().unwrap()];
let mul_rou = Fr::pow(&rou, &pow);
result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
}
}
if inverse {
let size2 = size as u64;
for k in 0..size {
let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
result[k] = result[k].into_affine().mul(multfactor);
}
}
return result;
}
fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
let mut eq = true;
for i in 0..points.len() {
if points2[i].ne(&points[i]) {
eq = false;
break;
}
}
return eq;
}
fn test_naive_ark_ecntt(size: usize) {
let points = random_points_ark_proj(size);
let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
assert!(!check_eq(&result2, &result1));
assert!(check_eq(&result2, &points));
}
#[test]
fn test_msm() {
let test_sizes = [6, 9];
for pow2 in test_sizes {
let count = 1 << pow2;
let seed = None; // set to Some(value) to seed the rng
let points = generate_random_points(count, get_rng(seed));
let scalars = generate_random_scalars(count, get_rng(seed));
let msm_result = msm(&points, &scalars, 0);
let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
assert_eq!(msm_result.to_ark(), msm_result_ark);
assert_eq!(
msm_result.to_ark_affine(),
Point::from_ark(msm_result_ark).to_ark_affine()
);
}
}
#[test]
fn test_batch_msm() {
for batch_pow2 in [2, 4] {
for pow2 in [4, 6] {
let msm_size = 1 << pow2;
let batch_size = 1 << batch_pow2;
let seed = None; // set to Some(value) to seed the rng
let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
let expected: Vec<_> = point_r_ark
.chunks(msm_size)
.zip(scalars_r_ark.chunks(msm_size))
.map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
.collect();
let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
assert_eq!(result, expected);
}
}
}
#[test]
fn test_commit() {
let test_size = 1 << 8;
let seed = Some(0);
let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
let mut points = generate_random_points(test_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm(&points, &scalars, 0);
let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
let mut h_commit_result = Point::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
assert_eq!(msm_result, h_commit_result);
assert_ne!(msm_result, Point::zero());
assert_ne!(h_commit_result, Point::zero());
}
#[test]
fn test_batch_commit() {
let batch_size = 4;
let test_size = 1 << 12;
let seed = Some(0);
let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
let points = generate_random_points(test_size * batch_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm_batch(&points, &scalars, batch_size, 0);
let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
assert_eq!(msm_result, h_commit_result);
for h in h_commit_result {
assert_ne!(h, Point::zero());
}
}
#[test]
fn test_ntt() {
//NTT
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 3;
let scalars = generate_random_scalars(test_size, get_rng(seed));
let mut ntt_result = scalars.clone();
ntt(&mut ntt_result, 0);
assert_ne!(ntt_result, scalars);
let mut intt_result = ntt_result.clone();
intt(&mut intt_result, 0);
assert_eq!(intt_result, scalars);
//ECNTT
let points_proj = generate_random_points_proj(test_size, get_rng(seed));
test_naive_ark_ecntt(test_size);
assert!(points_proj[0].to_ark().into_affine().is_on_curve());
//naive ark
let points_proj_ark = points_proj
.iter()
.map(|p| p.to_ark())
.collect::<Vec<G1Projective>>();
let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
assert_eq!(points_proj_ark, iecntt_result_naive);
//ingo gpu
let mut ecntt_result = points_proj.to_vec();
ecntt(&mut ecntt_result, 0);
assert_ne!(ecntt_result, points_proj);
let mut iecntt_result = ecntt_result.clone();
iecntt(&mut iecntt_result, 0);
assert_eq!(
iecntt_result_naive,
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
assert_eq!(
iecntt_result
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>(),
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
}
#[test]
fn test_ntt_batch() {
//NTT
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 5;
let batches = 4;
let scalars_batch: Vec<Scalar> =
generate_random_scalars(test_size * batches, get_rng(seed));
let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
for i in 0..batches {
scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result = scalars_batch.clone();
// do batch ntt
ntt_batch(&mut ntt_result, test_size, 0);
let mut ntt_result_vec_of_vec = Vec::new();
// do ntt for every chunk
for i in 0..batches {
ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
ntt(&mut ntt_result_vec_of_vec[i], 0);
}
// check that the ntt of each chunk equals the corresponding slice of the batched ntt result
for i in 0..batches {
assert_eq!(
ntt_result_vec_of_vec[i],
ntt_result[i * test_size..(i + 1) * test_size]
);
}
// check that ntt output is different from input
assert_ne!(ntt_result, scalars_batch);
let mut intt_result = ntt_result.clone();
// do batch intt
intt_batch(&mut intt_result, test_size, 0);
let mut intt_result_vec_of_vec = Vec::new();
// do intt for every chunk
for i in 0..batches {
intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
intt(&mut intt_result_vec_of_vec[i], 0);
}
// check that the intt of each chunk equals the corresponding slice of the batched intt result
for i in 0..batches {
assert_eq!(
intt_result_vec_of_vec[i],
intt_result[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result, scalars_batch);
// //ECNTT
let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
for i in 0..batches {
points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result_points = points_proj.clone();
// do batch ecntt
ecntt_batch(&mut ntt_result_points, test_size, 0);
let mut ntt_result_points_vec_of_vec = Vec::new();
for i in 0..batches {
ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
}
for i in 0..batches {
assert_eq!(
ntt_result_points_vec_of_vec[i],
ntt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_ne!(ntt_result_points, points_proj);
let mut intt_result_points = ntt_result_points.clone();
// do batch iecntt
iecntt_batch(&mut intt_result_points, test_size, 0);
let mut intt_result_points_vec_of_vec = Vec::new();
// do iecntt for every chunk
for i in 0..batches {
intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
iecntt(&mut intt_result_points_vec_of_vec[i], 0);
}
// check that the iecntt of each chunk equals the corresponding slice of the batched iecntt result
for i in 0..batches {
assert_eq!(
intt_result_points_vec_of_vec[i],
intt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result_points, points_proj);
}
#[test]
fn test_scalar_interpolation() {
let log_test_size = 7;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
reverse_order_scalars(&mut d_evals);
let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
intt(&mut evals_mut, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_scalar_batch_interpolation() {
let batch_size = 4;
let log_test_size = 10;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
reverse_order_scalars_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
intt_batch(&mut evals_mut, test_size, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_point_interpolation() {
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
reverse_order_points(&mut d_evals);
let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
iecntt(&mut evals_mut[..], 0);
let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_point_batch_interpolation() {
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
reverse_order_points_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
iecntt_batch(&mut evals_mut[..], test_size, 0);
let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_scalar_evaluation() {
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
for i in coeff_size..(1 << log_test_domain_size) {
assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
}
}
#[test]
fn test_scalar_batch_evaluation() {
let batch_size = 6;
let log_test_domain_size = 8;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
for i in coeff_size..domain_size {
assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
}
}
}
#[test]
fn test_point_evaluation() {
let log_test_domain_size = 7;
let coeff_size = 1 << 7;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
for i in coeff_size..(1 << log_test_domain_size) {
assert_eq!(Point::zero(), h_coeffs_domain[i]);
}
for i in 0..coeff_size {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation() {
let batch_size = 4;
let log_test_domain_size = 6;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 5;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
for i in coeff_size..domain_size {
assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
}
for i in j * domain_size..(j * domain_size + coeff_size) {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
}
#[test]
fn test_scalar_evaluation_on_trivial_coset() {
// checks that the evaluations on the subgroup are the same as on the trivial coset generated by 1
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_coeffs: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_coeffs[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_coeffs, h_evals_coset);
}
#[test]
fn test_scalar_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
}
#[test]
fn test_scalar_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
}
#[test]
fn test_point_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
for i in 0..test_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 2;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
for i in 0..test_size * batch_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
// testing matrix multiplication by comparing the FFT result with naive multiplication by the DFT matrix
#[test]
fn test_matrix_multiplication() {
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 5;
let rou = Fr::get_root_of_unity(test_size).unwrap();
let matrix_flattened: Vec<Scalar> = (0..test_size).map(
|row_num| { (0..test_size).map(
|col_num| {
let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
}).collect::<Vec<Scalar>>()
}).flatten().collect::<Vec<_>>();
let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
let mut ntt_result = vector.clone();
ntt(&mut ntt_result, 0);
// we don't use the same roots of unity as arkworks, so the results are permutations
// of one another and the only guaranteed fixed scalars are the following ones:
assert_eq!(result[0], ntt_result[0]);
assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_scalar_mul() {
let mut inout = [Scalar::one(), Scalar::one(), Scalar::zero()];
let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
mult_sc_vec(&mut inout, &expected, 0);
assert_eq!(inout, expected);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_point_mul() {
let dummy_one = Point {
x: Base::one(),
y: Base::one(),
z: Base::one(),
};
let mut inout = [dummy_one, dummy_one, Point::zero()];
let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
let expected = [dummy_one, Point::zero(), Point::zero()];
multp_vec(&mut inout, &scalars, 0);
assert_eq!(inout, expected);
}
}


@@ -1,34 +0,0 @@
[package]
name = "bn254"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
[dependencies]
icicle-core = { path = "../icicle-core" }
hex = "*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bn254 = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0"
serde_cbor = "0.11.2"
rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
[dev-dependencies]
"criterion" = "0.4.0"
[features]
g2 = []


@@ -1,36 +0,0 @@
use std::env;
fn main() {
//TODO: check cargo features selected
//TODO: can conflict/duplicate with make ?
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=./icicle");
let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
let mut arch = String::from("-arch=");
arch.push_str(&arch_type);
let mut stream = String::from("-default-stream=");
stream.push_str(&stream_type);
let mut nvcc = cc::Build::new();
println!("Compiling icicle library using arch: {}", &arch);
if cfg!(feature = "g2") {
nvcc.define("G2_DEFINED", None);
}
nvcc.cuda(true);
nvcc.define("FEATURE_BN254", None);
nvcc.debug(false);
nvcc.flag(&arch);
nvcc.flag(&stream);
nvcc.shared_flag(false);
// nvcc.static_flag(true);
nvcc.files([
"../icicle-cuda/curves/index.cu",
]);
nvcc.compile("ingo_icicle"); //TODO: extension??
}


@@ -1,4 +0,0 @@
pub trait Field<const NUM_LIMBS: usize> {
const MODOLUS: [u32;NUM_LIMBS];
const LIMBS: usize = NUM_LIMBS;
}
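// Example (as done in curve_structs below): a concrete field fixes its limb
// count and modulus at compile time, e.g.
// pub struct ScalarField;
// impl Field<8> for ScalarField { const MODOLUS: [u32; 8] = [0x0; 8]; }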


@@ -1,3 +0,0 @@
pub mod field;
pub mod scalar;
pub mod point;


@@ -1,108 +0,0 @@
use std::ffi::c_uint;
use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger256, PrimeField};
use std::mem::transmute;
use ark_ff::Field;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use super::scalar::{get_fixed_limbs, self};
#[derive(Debug, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointT<BF: scalar::ScalarTrait> {
pub x: BF,
pub y: BF,
pub z: BF,
}
impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
fn default() -> Self {
PointT::zero()
}
}
impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
pub fn zero() -> Self {
PointT {
x: BF::zero(),
y: BF::one(),
z: BF::zero(),
}
}
pub fn infinity() -> Self {
Self::zero()
}
}
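// (0 : 1 : 0) in projective coordinates is the point at infinity, i.e. the
// group identity, which is why `infinity` simply aliases `zero`.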
#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
#[repr(C)]
pub struct PointAffineNoInfinityT<BF> {
pub x: BF,
pub y: BF,
}
impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
fn default() -> Self {
PointAffineNoInfinityT {
x: BF::zero(),
y: BF::zero(),
}
}
}
impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
/// From u32 limbs x, y
pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
PointAffineNoInfinityT {
x: BF::from_limbs(x),
y: BF::from_limbs(y)
}
}
pub fn limbs(&self) -> Vec<u32> {
[self.x.limbs(), self.y.limbs()].concat()
}
pub fn to_projective(&self) -> PointT<BF> {
PointT {
x: self.x,
y: self.y,
z: BF::one(),
}
}
}
impl<BF: Copy + scalar::ScalarTrait> PointT<BF> {
pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
PointT {
x: BF::from_limbs(x),
y: BF::from_limbs(y),
z: BF::from_limbs(z)
}
}
pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
let l = value.len();
assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
PointT {
x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
}
}
pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
PointAffineNoInfinityT {
x: self.x,
y: self.y,
}
}
}


@@ -1,102 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_core::DeviceCopy;
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination};
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use super::field::{Field, self};
pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
match val.len() {
n if n < NUM_LIMBS => {
let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
padded[..val.len()].copy_from_slice(&val);
padded
}
n if n == NUM_LIMBS => val.try_into().unwrap(),
_ => panic!("slice has too many elements"),
}
}
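// Example: shorter slices are zero-padded in the high limbs, e.g.
// assert_eq!(get_fixed_limbs::<4>(&[1, 2]), [1, 2, 0, 0]);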
pub trait ScalarTrait{
fn base_limbs() -> usize;
fn zero() -> Self;
fn from_limbs(value: &[u32]) -> Self;
fn one() -> Self;
fn to_bytes_le(&self) -> Vec<u8>;
fn limbs(&self) -> &[u32];
}
#[derive(Debug, PartialEq, Clone, Copy)]
#[repr(C)]
pub struct ScalarT<M, const NUM_LIMBS: usize> {
pub(crate) phantom: PhantomData<M>,
pub(crate) value : [u32; NUM_LIMBS]
}
impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
where
M: Field<NUM_LIMBS>,
{
fn base_limbs() -> usize {
return NUM_LIMBS;
}
fn zero() -> Self {
ScalarT {
value: [0u32; NUM_LIMBS],
phantom: PhantomData,
}
}
fn from_limbs(value: &[u32]) -> Self {
Self {
value: get_fixed_limbs(value),
phantom: PhantomData,
}
}
fn one() -> Self {
let mut s = [0u32; NUM_LIMBS];
s[0] = 1;
ScalarT { value: s, phantom: PhantomData }
}
fn to_bytes_le(&self) -> Vec<u8> {
self.value
.iter()
.map(|s| s.to_le_bytes().to_vec())
.flatten()
.collect::<Vec<_>>()
}
fn limbs(&self) -> &[u32] {
&self.value
}
}
impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
Self::from_limbs(value)
}
pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
let mut value = value.to_vec();
value.reverse();
Self::from_limbs_le(&value)
}
// Additional functions
// Limb-wise addition with carry propagation; note this is a plain
// multi-precision add with no modular reduction.
pub fn add(&self, other: ScalarT<M, NUM_LIMBS>) -> ScalarT<M, NUM_LIMBS> { // overload +
let mut value = [0u32; NUM_LIMBS];
let mut carry = 0u64;
for i in 0..NUM_LIMBS {
let sum = self.value[i] as u64 + other.value[i] as u64 + carry;
value[i] = sum as u32;
carry = sum >> 32;
}
return ScalarT { value, phantom: PhantomData };
}
}
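// Example (illustrative, for a concrete field F implementing Field<4>):
// `from_limbs_be` reverses the order of the u32 limbs (it does not swap
// bytes within each limb), so
// ScalarT::<F, 4>::from_limbs_be(&[4, 3, 2, 1]).limbs() == &[1, 2, 3, 4].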


@@ -1,62 +0,0 @@
use std::ffi::{c_int, c_uint};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda_derive::DeviceCopy;
use std::mem::transmute;
use rustacuda::prelude::*;
use rustacuda_core::DevicePointer;
use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
use std::marker::PhantomData;
use std::convert::TryInto;
use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
use crate::basic_structs::scalar::ScalarT;
use crate::basic_structs::field::Field;
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct ScalarField;
impl Field<8> for ScalarField {
const MODOLUS: [u32; 8] = [0x0;8];
}
#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
#[repr(C)]
pub struct BaseField;
impl Field<8> for BaseField {
const MODOLUS: [u32; 8] = [0x0;8];
}
pub type Scalar = ScalarT<ScalarField,8>;
impl Default for Scalar {
fn default() -> Self {
Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Scalar{}
pub type Base = ScalarT<BaseField,8>;
impl Default for Base {
fn default() -> Self {
Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
}
}
unsafe impl DeviceCopy for Base{}
pub type Point = PointT<Base>;
pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
extern "C" {
fn eq(point1: *const Point, point2: *const Point) -> c_uint;
}
impl PartialEq for Point {
fn eq(&self, other: &Self) -> bool {
unsafe { eq(self, other) != 0 }
}
}


@@ -1,797 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
use crate::basic_structs::scalar::ScalarTrait;
use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
extern "C" {
fn msm_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
count: usize,
device_id: usize,
) -> c_uint;
fn msm_batch_cuda(
out: *mut Point,
points: *const PointAffineNoInfinity,
scalars: *const Scalar,
batch_size: usize,
msm_size: usize,
device_id: usize,
) -> c_uint;
fn commit_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
device_id: usize,
) -> c_uint;
fn commit_batch_cuda(
d_out: DevicePointer<Point>,
d_scalars: DevicePointer<Scalar>,
d_points: DevicePointer<PointAffineNoInfinity>,
count: usize,
batch_size: usize,
device_id: usize,
) -> c_uint;
fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
fn ntt_batch_cuda(
inout: *mut Scalar,
arr_size: usize,
n: usize,
inverse: bool,
) -> c_int;
fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
fn interpolate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_evaluations: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn interpolate_points_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn interpolate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_evaluations: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_points_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
device_id: usize
) -> c_int;
fn evaluate_points_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_scalars_on_coset_batch_cuda(
d_out: DevicePointer<Scalar>,
d_coefficients: DevicePointer<Scalar>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn evaluate_points_on_coset_batch_cuda(
d_out: DevicePointer<Point>,
d_coefficients: DevicePointer<Point>,
d_domain: DevicePointer<Scalar>,
domain_size: usize,
n: usize,
batch_size: usize,
coset_powers: DevicePointer<Scalar>,
device_id: usize
) -> c_int;
fn reverse_order_scalars_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_scalars_batch_cuda(
d_arr: DevicePointer<Scalar>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_cuda(
d_arr: DevicePointer<Point>,
n: usize,
device_id: usize
) -> c_int;
fn reverse_order_points_batch_cuda(
d_arr: DevicePointer<Point>,
n: usize,
batch_size: usize,
device_id: usize
) -> c_int;
fn vec_mod_mult_point(
inout: *mut Point,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn vec_mod_mult_scalar(
inout: *mut Scalar,
scalars: *const Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
fn matrix_vec_mod_mult(
matrix_flattened: *const Scalar,
input: *const Scalar,
output: *mut Scalar,
n_elements: usize,
device_id: usize,
) -> c_int;
}
pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = Point::zero();
unsafe {
msm_cuda(
&mut ret as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
scalars.len(),
device_id,
)
};
ret
}
pub fn msm_batch(
points: &[PointAffineNoInfinity],
scalars: &[Scalar],
batch_size: usize,
device_id: usize,
) -> Vec<Point> {
let count = points.len();
if count != scalars.len() {
todo!("variable length")
}
let mut ret = vec![Point::zero(); batch_size];
unsafe {
msm_batch_cuda(
&mut ret[0] as *mut _ as *mut Point,
points as *const _ as *const PointAffineNoInfinity,
scalars as *const _ as *const Scalar,
batch_size,
count / batch_size,
device_id,
)
};
ret
}
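// Usage sketch (mirroring test_batch_msm in the bls12-381 crate): `points`
// and `scalars` hold `batch_size` concatenated, equally-sized MSM inputs, and
// one result point is returned per batch:
// let points = generate_random_points(msm_size * batch_size, get_rng(None));
// let scalars = generate_random_scalars(msm_size * batch_size, get_rng(None));
// let results = msm_batch(&points, &scalars, batch_size, 0); // results.len() == batch_size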
pub fn commit(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
) -> DeviceBox<Point> {
let mut res = DeviceBox::new(&Point::zero()).unwrap();
unsafe {
commit_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len(),
0,
);
}
return res;
}
pub fn commit_batch(
points: &mut DeviceBuffer<PointAffineNoInfinity>,
scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
unsafe {
commit_batch_cuda(
res.as_device_ptr(),
scalars.as_device_ptr(),
points.as_device_ptr(),
scalars.len() / batch_size,
batch_size,
0,
);
}
return res;
}
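// Note (based on test_commit in the bls12-381 crate): `commit` and
// `commit_batch` compute the same MSMs as `msm`/`msm_batch`, but take
// device-resident buffers and leave the results on the device.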
/// Compute an in-place NTT on the input data.
fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
let ret_code = unsafe {
ntt_cuda(
values as *mut _ as *mut Scalar,
values.len(),
inverse,
device_id,
)
};
ret_code
}
pub fn ntt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, false);
}
pub fn intt(values: &mut [Scalar], device_id: usize) {
ntt_internal(values, device_id, true);
}
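// Round-trip sketch (mirroring the bls12-381 tests): `intt` inverts `ntt`,
// so applying both in sequence reproduces the original input:
// let mut values = generate_random_scalars(1 << 3, get_rng(None));
// let original = values.clone();
// ntt(&mut values, 0);
// intt(&mut values, 0);
// assert_eq!(values, original);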
/// Compute an in-place batched NTT on the input data.
/// Note: as the bls12-381 tests suggest, `batch_size` here is the size of each
/// individual NTT, so the number of batches is `values.len() / batch_size`.
fn ntt_internal_batch(
values: &mut [Scalar],
device_id: usize,
batch_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ntt_batch_cuda(
values as *mut _ as *mut Scalar,
values.len(),
batch_size,
inverse,
)
}
}
pub fn ntt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
ntt_internal_batch(values, 0, batch_size, false);
}
pub fn intt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
ntt_internal_batch(values, 0, batch_size, true);
}
/// Compute an in-place ECNTT on the input data.
fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
unsafe {
ecntt_cuda(
values as *mut _ as *mut Point,
values.len(),
inverse,
device_id,
)
}
}
pub fn ecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, false, device_id);
}
/// Compute an in-place iECNTT on the input data.
pub fn iecntt(values: &mut [Point], device_id: usize) {
ecntt_internal(values, true, device_id);
}
/// Compute an in-place batched ECNTT on the input data.
fn ecntt_internal_batch(
values: &mut [Point],
device_id: usize,
batch_size: usize,
inverse: bool,
) -> i32 {
unsafe {
ecntt_batch_cuda(
values as *mut _ as *mut Point,
values.len(),
batch_size,
inverse,
)
}
}
pub fn ecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
ecntt_internal_batch(values, 0, batch_size, false);
}
/// Compute an in-place batched iECNTT on the input data.
pub fn iecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
ecntt_internal_batch(values, 0, batch_size, true);
}
pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
unsafe {
DeviceBuffer::from_raw_parts(build_domain_cuda(
domain_size,
logn,
inverse,
0
), domain_size)
}
}
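// Usage sketch (an assumption based on the call sites in this crate): the
// returned device buffer holds an evaluation domain of `domain_size` twiddle
// factors, forward or inverse depending on `inverse`:
// let d_domain = build_domain(1 << 10, 10, false); // size-1024 forward domain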
pub fn reverse_order_scalars(
d_scalars: &mut DeviceBuffer<Scalar>,
) {
unsafe { reverse_order_scalars_cuda(
d_scalars.as_device_ptr(),
d_scalars.len(),
0
); }
}
pub fn reverse_order_scalars_batch(
d_scalars: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) {
unsafe { reverse_order_scalars_batch_cuda(
d_scalars.as_device_ptr(),
d_scalars.len() / batch_size,
batch_size,
0
); }
}
pub fn reverse_order_points(
d_points: &mut DeviceBuffer<Point>,
) {
unsafe { reverse_order_points_cuda(
d_points.as_device_ptr(),
d_points.len(),
0
); }
}
pub fn reverse_order_points_batch(
d_points: &mut DeviceBuffer<Point>,
batch_size: usize,
) {
unsafe { reverse_order_points_batch_cuda(
d_points.as_device_ptr(),
d_points.len() / batch_size,
batch_size,
0
); }
}
pub fn interpolate_scalars(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_scalars_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_scalars_batch(
d_evaluations: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_scalars_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
pub fn interpolate_points(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe { interpolate_points_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
0
) };
return res;
}
pub fn interpolate_points_batch(
d_evaluations: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe { interpolate_points_batch_cuda(
res.as_device_ptr(),
d_evaluations.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
batch_size,
0
) };
return res;
}
pub fn evaluate_scalars(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_scalars_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_points(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
0
);
}
return res;
}
pub fn evaluate_points_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_scalars_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_scalars_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Scalar>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Scalar> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_scalars_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
unsafe {
evaluate_points_on_coset_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len(),
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn evaluate_points_on_coset_batch(
d_coefficients: &mut DeviceBuffer<Point>,
d_domain: &mut DeviceBuffer<Scalar>,
batch_size: usize,
coset_powers: &mut DeviceBuffer<Scalar>,
) -> DeviceBuffer<Point> {
let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
unsafe {
evaluate_points_on_coset_batch_cuda(
res.as_device_ptr(),
d_coefficients.as_device_ptr(),
d_domain.as_device_ptr(),
d_domain.len(),
d_coefficients.len() / batch_size,
batch_size,
coset_powers.as_device_ptr(),
0
);
}
return res;
}
pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_point(
a as *mut _ as *mut Point,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
assert_eq!(a.len(), b.len());
unsafe {
vec_mod_mult_scalar(
a as *mut _ as *mut Scalar,
b as *const _ as *const Scalar,
a.len(),
device_id,
);
}
}
// Multiply a flattened matrix by a vector:
// `a` - row-major flattened matrix;
// `b` - vector to multiply `a` by;
pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
let mut c = vec![Scalar::zero(); b.len()];
unsafe {
matrix_vec_mod_mult(
a as *const _ as *const Scalar,
b as *const _ as *const Scalar,
c.as_mut_slice() as *mut _ as *mut Scalar,
b.len(),
device_id,
);
}
c
}
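// Hypothetical usage sketch (assumes a CUDA context is already set up):
// let matrix = vec![Scalar::one(); 4];             // 2x2 all-ones matrix, row-major
// let vector = vec![Scalar::one(), Scalar::one()];
// let product = mult_matrix_by_vec(&matrix, &vector, 0);
// assert_eq!(product.len(), vector.len());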
pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
unsafe { buf_cpy.copy_from(buf) };
return buf_cpy;
}
pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
let rng: Box<dyn RngCore> = match seed {
Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
None => Box::new(rand::thread_rng()),
};
rng
}
fn set_up_device() {
// Set up the context, load the module, and create a stream to run kernels in.
rustacuda::init(CudaFlags::empty()).unwrap();
let device = Device::get_device(0).unwrap();
let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
}
pub fn generate_random_points(
count: usize,
mut rng: Box<dyn RngCore>,
) -> Vec<PointAffineNoInfinity> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z())
.collect()
}
pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
(0..count)
.map(|_| Point::from_ark(G1Projective_BN254::rand(&mut rng)))
.collect()
}
pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
(0..count)
.map(|_| Scalar::from_ark(Fr_BN254::rand(&mut rng).into_repr()))
.collect()
}
pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated set-ups yield identical points
let vector_mut = generate_random_points_proj(test_size, get_rng(seed));
let d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
(vector_mut, d_vector, d_domain)
}
pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
set_up_device();
let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
let seed = Some(0); // fix the rng so repeated set-ups yield identical scalars
let vector_mut = generate_random_scalars(test_size, get_rng(seed));
let d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
(vector_mut, d_vector, d_domain)
}


@@ -1,4 +0,0 @@
pub mod test_bn254;
pub mod basic_structs;
pub mod from_cuda;
pub mod curve_structs;


@@ -1,816 +0,0 @@
use std::ffi::{c_int, c_uint};
use ark_std::UniformRand;
use rand::{rngs::StdRng, RngCore, SeedableRng};
use rustacuda::CudaFlags;
use rustacuda::memory::DeviceBox;
use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
use rustacuda_core::DevicePointer;
use std::mem::transmute;
pub use crate::basic_structs::scalar::ScalarTrait;
pub use crate::curve_structs::*;
use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
use std::marker::PhantomData;
use std::convert::TryInto;
use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
use ark_ec::AffineCurve;
use ark_ff::{BigInteger384, BigInteger256, PrimeField};
use rustacuda::memory::{CopyDestination, DeviceCopy};
impl Scalar {
pub fn to_biginteger254(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn to_ark(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_biginteger256(ark: BigInteger256) -> Self {
Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
}
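// The transmute-based conversions below assume Scalar's 8 u32 limbs and ark's
// BigInteger256 ([u64; 4]) share the same 256-bit little-endian layout.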
pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
unsafe { transmute(*self) }
}
pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
}
pub fn to_ark_transmute(&self) -> Fr_BN254 {
unsafe { std::mem::transmute(*self) }
}
pub fn from_ark_transmute(v: &Fr_BN254) -> Scalar {
unsafe { std::mem::transmute_copy(v) }
}
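// to_ark_mod_p reinterprets the limbs as ark's internal (Montgomery) representation,
// while to_ark_repr treats them as a canonical integer and converts; in ark 0.3,
// Fr::new takes the internal form and Fr::from_repr the canonical one.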
pub fn to_ark_mod_p(&self) -> Fr_BN254 {
Fr_BN254::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
}
pub fn to_ark_repr(&self) -> Fr_BN254 {
Fr_BN254::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
}
pub fn from_ark(v: BigInteger256) -> Scalar {
Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
}
}
impl Base {
pub fn to_ark(&self) -> BigInteger256 {
BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
}
pub fn from_ark(ark: BigInteger256) -> Self {
Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
}
}
impl Point {
pub fn to_ark(&self) -> G1Projective_BN254 {
self.to_ark_affine().into_projective()
}
pub fn to_ark_affine(&self) -> G1Affine_BN254 {
//TODO: generic conversion
use ark_ff::Field;
use std::ops::Mul;
let proj_x_field = Fq_BN254::from_le_bytes_mod_order(&self.x.to_bytes_le());
let proj_y_field = Fq_BN254::from_le_bytes_mod_order(&self.y.to_bytes_le());
let proj_z_field = Fq_BN254::from_le_bytes_mod_order(&self.z.to_bytes_le());
let inverse_z = proj_z_field.inverse().unwrap();
let aff_x = proj_x_field.mul(inverse_z);
let aff_y = proj_y_field.mul(inverse_z);
G1Affine_BN254::new(aff_x, aff_y, false)
}
pub fn from_ark(ark: G1Projective_BN254) -> Point {
use ark_ff::Field;
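// ark's projective points are in Jacobian coordinates (X/Z^2, Y/Z^3);
// normalize to affine here and store the result with z = 1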
let z_inv = ark.z.inverse().unwrap();
let z_invsq = z_inv * z_inv;
let z_inv3 = z_invsq * z_inv;
Point {
x: Base::from_ark((ark.x * z_invsq).into_repr()),
y: Base::from_ark((ark.y * z_inv3).into_repr()),
z: Base::one(),
}
}
}
impl PointAffineNoInfinity {
pub fn to_ark(&self) -> G1Affine_BN254 {
G1Affine_BN254::new(Fq_BN254::new(self.x.to_ark()), Fq_BN254::new(self.y.to_ark()), false)
}
pub fn to_ark_repr(&self) -> G1Affine_BN254 {
G1Affine_BN254::new(
Fq_BN254::from_repr(self.x.to_ark()).unwrap(),
Fq_BN254::from_repr(self.y.to_ark()).unwrap(),
false,
)
}
pub fn from_ark(p: &G1Affine_BN254) -> Self {
PointAffineNoInfinity {
x: Base::from_ark(p.x.into_repr()),
y: Base::from_ark(p.y.into_repr()),
}
}
}
impl Point {
pub fn to_affine(&self) -> PointAffineNoInfinity {
let ark_affine = self.to_ark_affine();
PointAffineNoInfinity {
x: Base::from_ark(ark_affine.x.into_repr()),
y: Base::from_ark(ark_affine.y.into_repr()),
}
}
}
#[cfg(test)]
pub(crate) mod tests_bn254 {
use std::ops::Add;
use ark_bn254::{Fr, G1Affine, G1Projective};
use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
use ark_ff::{FftField, Field, Zero, PrimeField};
use ark_std::UniformRand;
use rustacuda::prelude::{DeviceBuffer, CopyDestination};
use crate::curve_structs::{Point, Scalar, Base};
use crate::basic_structs::scalar::ScalarTrait;
use crate::from_cuda::{
generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars,
commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch,
ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch,
set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch,
interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars,
evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset,
mult_matrix_by_vec, mult_sc_vec, multp_vec, evaluate_scalars_batch, evaluate_points_batch,
evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch,
};
fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
let mut rng = ark_std::rand::thread_rng();
let mut points_ga: Vec<G1Projective> = Vec::new();
for _ in 0..nof_elements {
let aff = G1Projective::rand(&mut rng);
points_ga.push(aff);
}
points_ga
}
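// Naive O(n^2) reference DFT over EC points: result[k] = sum_l rou^(k*l) * points[l],
// with the inverse additionally scaling every output by n^{-1}.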
fn ecntt_arc_naive(
points: &Vec<G1Projective>,
size: usize,
inverse: bool,
) -> Vec<G1Projective> {
let mut result: Vec<G1Projective> = Vec::new();
for _ in 0..size {
result.push(G1Projective::zero());
}
let rou = if !inverse {
Fr::get_root_of_unity(size).unwrap()
} else {
Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap()
};
for k in 0..size {
for l in 0..size {
let pow: [u64; 1] = [(l * k).try_into().unwrap()];
let mul_rou = Fr::pow(&rou, &pow);
result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
}
}
if inverse {
let size2 = size as u64;
for k in 0..size {
let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
result[k] = result[k].into_affine().mul(multfactor);
}
}
return result;
}
fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
let mut eq = true;
for i in 0..points.len() {
if points2[i].ne(&points[i]) {
eq = false;
break;
}
}
return eq;
}
fn test_naive_ark_ecntt(size: usize) {
let points = random_points_ark_proj(size);
let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
assert!(!check_eq(&result2, &result1));
assert!(check_eq(&result2, &points));
}
#[test]
fn test_msm() {
let test_sizes = [6, 9];
for pow2 in test_sizes {
let count = 1 << pow2;
let seed = None; // set Some to provide seed
let points = generate_random_points(count, get_rng(seed));
let scalars = generate_random_scalars(count, get_rng(seed));
let msm_result = msm(&points, &scalars, 0);
let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
assert_eq!(msm_result.to_ark(), msm_result_ark);
assert_eq!(
msm_result.to_ark_affine(),
Point::from_ark(msm_result_ark).to_ark_affine()
);
}
}
#[test]
fn test_batch_msm() {
for batch_pow2 in [2, 4] {
for pow2 in [4, 6] {
let msm_size = 1 << pow2;
let batch_size = 1 << batch_pow2;
let seed = None; // set Some to provide seed
let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
let expected: Vec<_> = point_r_ark
.chunks(msm_size)
.zip(scalars_r_ark.chunks(msm_size))
.map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
.collect();
let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
assert_eq!(result, expected);
}
}
}
#[test]
fn test_commit() {
let test_size = 1 << 8;
let seed = Some(0);
let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
let mut points = generate_random_points(test_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm(&points, &scalars, 0);
let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
let mut h_commit_result = Point::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
assert_eq!(msm_result, h_commit_result);
assert_ne!(msm_result, Point::zero());
assert_ne!(h_commit_result, Point::zero());
}
#[test]
fn test_batch_commit() {
let batch_size = 4;
let test_size = 1 << 12;
let seed = Some(0);
let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
let points = generate_random_points(test_size * batch_size, get_rng(seed));
let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
let msm_result = msm_batch(&points, &scalars, batch_size, 0);
let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
assert_eq!(msm_result, h_commit_result);
for h in h_commit_result {
assert_ne!(h, Point::zero());
}
}
#[test]
fn test_ntt() {
//NTT
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 3;
let scalars = generate_random_scalars(test_size, get_rng(seed));
let mut ntt_result = scalars.clone();
ntt(&mut ntt_result, 0);
assert_ne!(ntt_result, scalars);
let mut intt_result = ntt_result.clone();
intt(&mut intt_result, 0);
assert_eq!(intt_result, scalars);
//ECNTT
let points_proj = generate_random_points_proj(test_size, get_rng(seed));
test_naive_ark_ecntt(test_size);
assert!(points_proj[0].to_ark().into_affine().is_on_curve());
//naive ark
let points_proj_ark = points_proj
.iter()
.map(|p| p.to_ark())
.collect::<Vec<G1Projective>>();
let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
assert_eq!(points_proj_ark, iecntt_result_naive);
//ingo gpu
let mut ecntt_result = points_proj.to_vec();
ecntt(&mut ecntt_result, 0);
assert_ne!(ecntt_result, points_proj);
let mut iecntt_result = ecntt_result.clone();
iecntt(&mut iecntt_result, 0);
assert_eq!(
iecntt_result_naive,
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
assert_eq!(
iecntt_result
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>(),
points_proj
.iter()
.map(|p| p.to_ark_affine())
.collect::<Vec<G1Affine>>()
);
}
#[test]
fn test_ntt_batch() {
//NTT
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 5;
let batches = 4;
let scalars_batch: Vec<Scalar> =
generate_random_scalars(test_size * batches, get_rng(seed));
let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
for i in 0..batches {
scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result = scalars_batch.clone();
// do batch ntt
ntt_batch(&mut ntt_result, test_size, 0);
let mut ntt_result_vec_of_vec = Vec::new();
// do ntt for every chunk
for i in 0..batches {
ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
ntt(&mut ntt_result_vec_of_vec[i], 0);
}
// check that the ntt of each chunk equals the corresponding slice of the batch ntt
for i in 0..batches {
assert_eq!(
ntt_result_vec_of_vec[i],
ntt_result[i * test_size..(i + 1) * test_size]
);
}
// check that ntt output is different from input
assert_ne!(ntt_result, scalars_batch);
let mut intt_result = ntt_result.clone();
// do batch intt
intt_batch(&mut intt_result, test_size, 0);
let mut intt_result_vec_of_vec = Vec::new();
// do intt for every chunk
for i in 0..batches {
intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
intt(&mut intt_result_vec_of_vec[i], 0);
}
// check that the intt of each vec of scalars is equal to the intt of the specific batch
for i in 0..batches {
assert_eq!(
intt_result_vec_of_vec[i],
intt_result[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result, scalars_batch);
// //ECNTT
let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
for i in 0..batches {
points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
}
let mut ntt_result_points = points_proj.clone();
// do batch ecntt
ecntt_batch(&mut ntt_result_points, test_size, 0);
let mut ntt_result_points_vec_of_vec = Vec::new();
for i in 0..batches {
ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
}
for i in 0..batches {
assert_eq!(
ntt_result_points_vec_of_vec[i],
ntt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_ne!(ntt_result_points, points_proj);
let mut intt_result_points = ntt_result_points.clone();
// do batch ecintt
iecntt_batch(&mut intt_result_points, test_size, 0);
let mut intt_result_points_vec_of_vec = Vec::new();
// do ecintt for every chunk
for i in 0..batches {
intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
iecntt(&mut intt_result_points_vec_of_vec[i], 0);
}
// check that the iecntt of each chunk equals the corresponding slice of the batch iecntt
for i in 0..batches {
assert_eq!(
intt_result_points_vec_of_vec[i],
intt_result_points[i * test_size..(i + 1) * test_size]
);
}
assert_eq!(intt_result_points, points_proj);
}
#[test]
fn test_scalar_interpolation() {
let log_test_size = 7;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
reverse_order_scalars(&mut d_evals);
let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
intt(&mut evals_mut, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_scalar_batch_interpolation() {
let batch_size = 4;
let log_test_size = 10;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
reverse_order_scalars_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
intt_batch(&mut evals_mut, test_size, 0);
let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, evals_mut);
}
#[test]
fn test_point_interpolation() {
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
reverse_order_points(&mut d_evals);
let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
iecntt(&mut evals_mut[..], 0);
let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_point_batch_interpolation() {
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
reverse_order_points_batch(&mut d_evals, batch_size);
let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
iecntt_batch(&mut evals_mut[..], test_size, 0);
let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
assert_eq!(h_coeffs, *evals_mut);
for h in h_coeffs.iter() {
assert_ne!(*h, Point::zero());
}
}
#[test]
fn test_scalar_evaluation() {
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
for i in coeff_size..(1 << log_test_domain_size) {
assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
}
}
#[test]
fn test_scalar_batch_evaluation() {
let batch_size = 6;
let log_test_domain_size = 8;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 6;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
for i in coeff_size..domain_size {
assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
}
}
}
#[test]
fn test_point_evaluation() {
let log_test_domain_size = 7;
let coeff_size = 1 << 7;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
for i in coeff_size..(1 << log_test_domain_size) {
assert_eq!(Point::zero(), h_coeffs_domain[i]);
}
for i in 0..coeff_size {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation() {
let batch_size = 4;
let log_test_domain_size = 6;
let domain_size = 1 << log_test_domain_size;
let coeff_size = 1 << 5;
let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
for j in 0..batch_size {
assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
for i in coeff_size..domain_size {
assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
}
for i in j * domain_size..(j * domain_size + coeff_size) {
assert_ne!(h_coeffs_domain[i], Point::zero());
}
}
}
#[test]
fn test_scalar_evaluation_on_trivial_coset() {
// checks that evaluations on the subgroup match those on the trivial coset generated by 1
let log_test_domain_size = 8;
let coeff_size = 1 << 6;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals, h_evals_coset);
}
#[test]
fn test_scalar_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
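// powers of a (2n)-th root of unity: multiplying the size-n domain by these
// shifts it onto the nontrivial coset inside the size-2n subgroup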
let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
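// the size-2n evaluation returns the subgroup values in its first half and the
// coset values in its second half, which is exactly what the two asserts check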
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
}
#[test]
fn test_scalar_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 4;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
}
#[test]
fn test_point_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let log_test_size = 8;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
assert_eq!(h_evals[..], h_evals_large[..test_size]);
assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
for i in 0..test_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
#[test]
fn test_point_batch_evaluation_on_coset() {
// checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup
let batch_size = 2;
let log_test_size = 6;
let test_size = 1 << log_test_size;
let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals.copy_to(&mut h_evals[..]).unwrap();
let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
for i in 0..batch_size {
assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
}
for i in 0..test_size * batch_size {
assert_ne!(h_evals[i], Point::zero());
assert_ne!(h_evals_coset[i], Point::zero());
assert_ne!(h_evals_large[2 * i], Point::zero());
assert_ne!(h_evals_large[2 * i + 1], Point::zero());
}
}
// testing matrix multiplication by comparing the result of FFT with the naive multiplication by the DFT matrix
#[test]
fn test_matrix_multiplication() {
let seed = None; // set to Some(value) to fix the rng
let test_size = 1 << 5;
let rou = Fr::get_root_of_unity(test_size).unwrap();
let matrix_flattened: Vec<Scalar> = (0..test_size).map(
|row_num| { (0..test_size).map(
|col_num| {
let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
}).collect::<Vec<Scalar>>()
}).flatten().collect::<Vec<_>>();
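// `matrix_flattened` is the DFT matrix in row-major order: entry (i, j) equals rou^(i*j)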
let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
let mut ntt_result = vector.clone();
ntt(&mut ntt_result, 0);
// we don't use the same roots of unity as arkworks, so the results are permutations
// of one another and the only guaranteed fixed scalars are the following ones:
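// (index 0 evaluates the polynomial at 1 and index n/2 at rou^(n/2) = -1,
// both of which are independent of the choice of primitive root)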
assert_eq!(result[0], ntt_result[0]);
assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_scalar_mul() {
let mut inout = [Scalar::one(), Scalar::one(), Scalar::zero()];
let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
mult_sc_vec(&mut inout, &expected, 0);
assert_eq!(inout, expected);
}
#[test]
#[allow(non_snake_case)]
fn test_vec_point_mul() {
let dummy_one = Point {
x: Base::one(),
y: Base::one(),
z: Base::one(),
};
let mut inout = [dummy_one, dummy_one, Point::zero()];
let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
let expected = [dummy_one, Point::zero(), Point::zero()];
multp_vec(&mut inout, &scalars, 0);
assert_eq!(inout, expected);
}
}

build.rs Normal file

@@ -0,0 +1,53 @@
use std::env::var;
use cmake::Config;
fn main() {
let cargo_dir = var("CARGO_MANIFEST_DIR").unwrap();
let profile = var("PROFILE").unwrap();
let target_output_dir = format!("{}/target/{}", cargo_dir, profile);
let build_output_dir = format!("{}/build", target_output_dir);
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=./icicle");
println!("cargo:rerun-if-changed=./target/{}", profile); // without this it ignores manual changes to build folder
let mut cmake = Config::new("./icicle");
cmake
.define("BUILD_TESTS", "OFF")
.out_dir(&target_output_dir)
.build_target("icicle");
let target_profile: &str = if profile == "release" { "Release" } else { "Debug" };
cmake.define("CMAKE_BUILD_TYPE", target_profile);
if cfg!(feature = "g2") {
cmake.define("G2_DEFINED", "");
}
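// The optional `g2` cargo feature forwards the G2_DEFINED flag to CMake so the
// CUDA library is compiled with G2 arithmetic enabled.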
cmake.build();
if cfg!(unix) {
if let Ok(cuda_path) = var("CUDA_HOME") {
println!("cargo:rustc-link-search=native={}/lib64", cuda_path);
} else {
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
} else if cfg!(windows) {
let build_output_dir_cmake = format!("{}/{}", build_output_dir, target_profile);
println!("cargo:rustc-link-search={}", &build_output_dir_cmake);
}
println!("cargo:rustc-link-search={}", &build_output_dir);
println!("cargo:rustc-link-search={}", &target_output_dir);
println!("cargo:rustc-link-lib=ingo_icicle");
println!("cargo:rustc-link-lib=dylib=cuda");
println!("cargo:rustc-link-lib=dylib=cudart");
if cfg!(unix) {
println!("cargo:rustc-link-lib=dylib=stdc++");
}
}


@@ -1,13 +1,21 @@
{
"curve_name" : "bls12_377",
"modolus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041,
"modulus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041,
"bit_count_p" : 253,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
"ntt_size" : 47,
"modulus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
"bit_count_q" : 377,
"limb_q" : 12,
"root_of_unity" : 8065159656716812877374967518403273466521432693661810619979959746626482506078,
"weierstrass_b" : 1,
"gen_x" : 81937999373150964239938255573465948239988671502647976594219695644855304257327692006745978603320413799295628339695,
"gen_y" : 241266749859715473739788878240585681733927191168601896383759122102112907357779751001206799952863815012735208165030
"weierstrass_b_g2_re" : 0,
"weierstrass_b_g2_im" : 155198655607781456406391640216936120121836107652948796323930557600032281009004493664981332883744016074664192874906,
"g1_gen_x" : 81937999373150964239938255573465948239988671502647976594219695644855304257327692006745978603320413799295628339695,
"g1_gen_y" : 241266749859715473739788878240585681733927191168601896383759122102112907357779751001206799952863815012735208165030,
"g2_gen_x_re" : 233578398248691099356572568220835526895379068987715365179118596935057653620464273615301663571204657964920925606294,
"g2_gen_x_im" : 140913150380207355837477652521042157274541796891053068589147167627541651775299824604154852141315666357241556069118,
"g2_gen_y_re" : 63160294768292073209381361943935198908131692476676907196754037919244929611450776219210369229519898517858833747423,
"g2_gen_y_im" : 149157405641012693445398062341192467754805999074082136895788947234480009303640899064710353187729182149407503257491,
"nonresidue" : -5
}


@@ -1,13 +1,21 @@
{
"curve_name" : "bls12_381",
"modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"modulus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"bit_count_p" : 255,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"modulus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"bit_count_q" : 381,
"limb_q" : 12,
"root_of_unity" : 937917089079007706106976984802249742464848817460758522850752807661925904159,
"weierstrass_b" : 4,
"gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
"gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
"weierstrass_b_g2_re":4,
"weierstrass_b_g2_im":4,
"g1_gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
"g1_gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569,
"g2_gen_x_re" : 352701069587466618187139116011060144890029952792775240219908644239793785735715026873347600343865175952761926303160,
"g2_gen_x_im" : 3059144344244213709971259814753781636986470325476647558659373206291635324768958432433509563104347017837885763365758,
"g2_gen_y_re" : 1985150602287291935568054521177171638300868978215655730859378665066344726373823718423869104263333984641494340347905,
"g2_gen_y_im" : 927553665492332455747201965776037880757740193453592970025027978793976877002675564980949289727957565575433344219582,
"nonresidue" : -1
}


@@ -1,13 +1,21 @@
{
"curve_name" : "bn254",
"modolus_p" : 21888242871839275222246405745257275088548364400416034343698204186575808495617,
"modulus_p" : 21888242871839275222246405745257275088548364400416034343698204186575808495617,
"bit_count_p" : 254,
"limb_p" : 8,
"ntt_size" : 16,
"modolus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583,
"ntt_size" : 28,
"modulus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583,
"bit_count_q" : 254,
"limb_q" : 8,
"root_of_unity": 19103219067921713944291392827692070036145651957329286315305642004821462161904,
"weierstrass_b" : 3,
"gen_x" : 1,
"gen_y" : 2
"weierstrass_b_g2_re" : 19485874751759354771024239261021720505790618469301721065564631296452457478373,
"weierstrass_b_g2_im" : 266929791119991161246907387137283842545076965332900288569378510910307636690,
"g1_gen_x" : 1,
"g1_gen_y" : 2,
"g2_gen_x_re" : 10857046999023057135944570762232829481370756359578518086990519993285655852781,
"g2_gen_x_im" : 11559732032986387107991004021392285783925812861821192530917403151452391805634,
"g2_gen_y_re" : 8495653923123431417604973247489272438418190587263600148770280649306958101930,
"g2_gen_y_im" : 4082367875863433681332203403145435568316851327593401208105741076214120093531,
"nonresidue" : -1
}


@@ -0,0 +1,21 @@
{
"curve_name" : "bw6_761",
"modulus_p" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
"bit_count_p" : 377,
"limb_p" : 12,
"ntt_size" : 46,
"modulus_q" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068299,
"bit_count_q" : 761,
"limb_q" : 24,
"root_of_unity" : 32863578547254505029601261939868325669770508939375122462904745766352256812585773382134936404344547323199885654433,
"weierstrass_b" : 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068298,
"weierstrass_b_g2_re" : 4,
"weierstrass_b_g2_im" : 0,
"g1_gen_x" : 6238772257594679368032145693622812838779005809760824733138787810501188623461307351759238099287535516224314149266511977132140828635950940021790489507611754366317801811090811367945064510304504157188661901055903167026722666149426237,
"g1_gen_y" : 2101735126520897423911504562215834951148127555913367997162789335052900271653517958562461315794228241561913734371411178226936527683203879553093934185950470971848972085321797958124416462268292467002957525517188485984766314758624099,
"g2_gen_x_re" : 6445332910596979336035888152774071626898886139774101364933948236926875073754470830732273879639675437155036544153105017729592600560631678554299562762294743927912429096636156401171909259073181112518725201388196280039960074422214428,
"g2_gen_x_im" : 1,
"g2_gen_y_re" : 562923658089539719386922163444547387757586534741080263946953401595155211934630598999300396317104182598044793758153214972605680357108252243146746187917218885078195819486220416605630144001533548163105316661692978285266378674355041,
"g2_gen_y_im" : 1,
"nonresidue" : -1
}


@@ -1,30 +1,15 @@
import json
import math
import os
from sympy.ntheory import isprime, primitive_root
import subprocess
import random
from string import Template
import sys
data = None
with open(sys.argv[1]) as json_file:
data = json.load(json_file)
curve_name = data["curve_name"]
modolus_p = data["modolus_p"]
bit_count_p = data["bit_count_p"]
limb_p = data["limb_p"]
ntt_size = data["ntt_size"]
modolus_q = data["modolus_q"]
bit_count_q = data["bit_count_q"]
limb_q = data["limb_q"]
weierstrass_b = data["weierstrass_b"]
gen_x = data["gen_x"]
gen_y = data["gen_y"]
argv_list = ['thisfile', 'curve_json', 'command']
new_curve_args = dict(zip(argv_list, sys.argv[:len(argv_list)] + [""]*(len(argv_list) - len(sys.argv))))
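# pad sys.argv with empty strings so that missing optional arguments
# (e.g. the '-update' command) read as "" instead of raising an error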
def to_hex(val, length):
x = str(hex(val))[2:]
def to_hex(val: int, length):
x = hex(val)[2:]
if len(x) % 8 != 0:
x = "0" * (8-len(x) % 8) + x
if len(x) != length:
@@ -32,172 +17,330 @@ def to_hex(val, length):
n = 8
chunks = [x[i:i+n] for i in range(0, len(x), n)][::-1]
s = ""
for c in chunks:
s += "0x" + c + ", "
return s
for c in chunks[:length // n]:
s += f'0x{c}, '
return s[:-2]
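# to_hex renders `val` as comma-separated 32-bit limbs, least-significant limb
# first, matching the layout of the generated `storage<>` initializers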
def get_root_of_unity(order: int) -> int:
assert (modolus_p - 1) % order == 0
return pow(5, (modolus_p - 1) // order, modolus_p)
def compute_values(modulus, modulus_bit_count, limbs):
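# Precomputes the per-field constants used by the CUDA templates:
# m = floor(2^(2*bit_count) / p) (a Barrett-style reduction constant) and the
# Montgomery constants R = 2^(32*limbs) mod p and R^-1 mod p.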
limb_size = 8*limbs
bit_size = 4*limb_size
modulus_ = to_hex(modulus,limb_size)
modulus_2 = to_hex(modulus*2,limb_size)
modulus_4 = to_hex(modulus*4,limb_size)
modulus_wide = to_hex(modulus,limb_size*2)
modulus_squared = to_hex(modulus*modulus,limb_size*2)
modulus_squared_2 = to_hex(modulus*modulus*2,limb_size*2)
modulus_squared_4 = to_hex(modulus*modulus*4,limb_size*2)
m_raw = int(math.floor(int(pow(2,2*modulus_bit_count) // modulus)))
m = to_hex(m_raw,limb_size)
one = to_hex(1,limb_size)
zero = to_hex(0,limb_size)
montgomery_r = to_hex(pow(2,bit_size,modulus),limb_size)
montgomery_r_inv = to_hex(pow(2,-bit_size,modulus),limb_size)
def create_field_parameters_struct(modulus, modulus_bits_count,limbs,ntt,size,name):
s = " struct "+name+"{\n"
s += " static constexpr unsigned limbs_count = " + str(limbs)+";\n"
s += " static constexpr storage<limbs_count> modulus = {"+to_hex(modulus,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<limbs_count> modulus_2 = {"+to_hex(modulus*2,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<limbs_count> modulus_4 = {"+to_hex(modulus*4,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_wide = {"+to_hex(modulus,8*limbs*2)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared = {"+to_hex(modulus*modulus,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared_2 = {"+to_hex(modulus*modulus*2,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared_4 = {"+to_hex(modulus*modulus*2*2,8*limbs)[:-2]+"};\n"
s += " static constexpr unsigned modulus_bits_count = "+str(modulus_bits_count)+";\n"
m = int(math.floor(int(pow(2,2*modulus_bits_count) // modulus)))
s += " static constexpr storage<limbs_count> m = {"+ to_hex(m,8*limbs)[:-2] +"};\n"
s += " static constexpr storage<limbs_count> one = {"+ to_hex(1,8*limbs)[:-2] +"};\n"
s += " static constexpr storage<limbs_count> zero = {"+ to_hex(0,8*limbs)[:-2] +"};\n"
return (
modulus_,
modulus_2,
modulus_4,
modulus_wide,
modulus_squared,
modulus_squared_2,
modulus_squared_4,
m,
one,
zero,
montgomery_r,
montgomery_r_inv
)
if ntt:
def get_fq_params(modulus, modulus_bit_count, limbs, nonresidue):
(
modulus,
modulus_2,
modulus_4,
modulus_wide,
modulus_squared,
modulus_squared_2,
modulus_squared_4,
m,
one,
zero,
montgomery_r,
montgomery_r_inv
) = compute_values(modulus, modulus_bit_count, limbs)
limb_size = 8*limbs
nonresidue_is_negative = str(nonresidue < 0).lower()
nonresidue = abs(nonresidue)
return {
'fq_modulus': modulus,
'fq_modulus_2': modulus_2,
'fq_modulus_4': modulus_4,
'fq_modulus_wide': modulus_wide,
'fq_modulus_squared': modulus_squared,
'fq_modulus_squared_2': modulus_squared_2,
'fq_modulus_squared_4': modulus_squared_4,
'fq_m': m,
'fq_one': one,
'fq_zero': zero,
'fq_montgomery_r': montgomery_r,
'fq_montgomery_r_inv': montgomery_r_inv,
'nonresidue': nonresidue,
'nonresidue_is_negative': nonresidue_is_negative
}
def get_fp_params(modulus, modulus_bit_count, limbs, root_of_unity, size=0):
(
modulus_,
modulus_2,
modulus_4,
modulus_wide,
modulus_squared,
modulus_squared_2,
modulus_squared_4,
m,
one,
zero,
montgomery_r,
montgomery_r_inv
) = compute_values(modulus, modulus_bit_count, limbs)
limb_size = 8*limbs
if size > 0:
omega = ''
omega_inv = ''
inv = ''
omegas = []
omegas_inv = []
for k in range(size):
omega = get_root_of_unity(int(pow(2,k+1)))
s += " static constexpr storage<limbs_count> omega"+str(k+1)+"= {"+ to_hex(omega,8*limbs)[:-2]+"};\n"
if k == 0:
om = root_of_unity
else:
om = pow(om, 2, modulus)
omegas.append(om)
omegas_inv.append(pow(om, -1, modulus))
omegas.reverse()
omegas_inv.reverse()
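# after the reversal, omegas[k] is a 2^(k+1)-th root of unity: repeated
# squaring of the maximal-order root halves the order at each step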
for k in range(size):
omega = get_root_of_unity(int(pow(2,k+1)))
s += " static constexpr storage<limbs_count> omega_inv"+str(k+1)+"= {"+ to_hex(pow(omega, -1, modulus),8*limbs)[:-2]+"};\n"
for k in range(size):
s += " static constexpr storage<limbs_count> inv"+str(k+1)+"= {"+ to_hex(pow(int(pow(2,k+1)), -1, modulus),8*limbs)[:-2]+"};\n"
s+=" };\n"
return s
omega += "\n {"+ to_hex(omegas[k],limb_size)+"}," if k>0 else " {"+ to_hex(omegas[k],limb_size)+"},"
omega_inv += "\n {"+ to_hex(omegas_inv[k],limb_size)+"}," if k>0 else " {"+ to_hex(omegas_inv[k],limb_size)+"},"
inv += "\n {"+ to_hex(pow(int(pow(2,k+1)), -1, modulus),limb_size)+"}," if k>0 else " {"+ to_hex(pow(int(pow(2,k+1)), -1, modulus),limb_size)+"},"
return {
'fp_modulus': modulus_,
'fp_modulus_2': modulus_2,
'fp_modulus_4': modulus_4,
'fp_modulus_wide': modulus_wide,
'fp_modulus_squared': modulus_squared,
'fp_modulus_squared_2': modulus_squared_2,
'fp_modulus_squared_4': modulus_squared_4,
'fp_m': m,
'fp_one': one,
'fp_zero': zero,
'fp_montgomery_r': montgomery_r,
'fp_montgomery_r_inv': montgomery_r_inv,
'omega': omega[:-1],
'omega_inv': omega_inv[:-1],
'inv': inv[:-1],
}
def create_gen():
s = " struct group_generator {\n"
s += " static constexpr storage<fq_config::limbs_count> generator_x = {"+to_hex(gen_x,8*limb_q)[:-2]+ "};\n"
s += " static constexpr storage<fq_config::limbs_count> generator_y = {"+to_hex(gen_y,8*limb_q)[:-2]+ "};\n"
s+=" };\n"
return s
def get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b):
file_content = ""
file_content += "#pragma once\n#include \"../../utils/storage.cuh\"\n"
file_content += "namespace PARAMS_"+curve_name.upper()+"{\n"
file_content += create_field_parameters_struct(modolus_p,bit_count_p,limb_p,True,ntt_size,"fp_config")
file_content += create_field_parameters_struct(modolus_q,bit_count_q,limb_q,False,0,"fq_config")
file_content += " static constexpr unsigned weierstrass_b = " + str(weierstrass_b)+ ";\n"
file_content += create_gen()
file_content+="}\n"
return file_content
def get_generators(g1_gen_x, g1_gen_y, g2_gen_x_re, g2_gen_x_im, g2_gen_y_re, g2_gen_y_im, size):
return {
'fq_gen_x': to_hex(g1_gen_x, size),
'fq_gen_y': to_hex(g1_gen_y, size),
'fq_gen_x_re': to_hex(g2_gen_x_re, size),
'fq_gen_x_im': to_hex(g2_gen_x_im, size),
'fq_gen_y_re': to_hex(g2_gen_y_re, size),
'fq_gen_y_im': to_hex(g2_gen_y_im, size)
}
def get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, size):
return {
'weier_b': to_hex(weierstrass_b, size),
'weier_b_g2_re': to_hex(weierstrass_b_g2_re, size),
'weier_b_g2_im': to_hex(weierstrass_b_g2_im, size),
}
def get_params(config):
global ntt_size
curve_name = config["curve_name"]
modulus_p = config["modulus_p"]
bit_count_p = config["bit_count_p"]
limb_p = config["limb_p"]
ntt_size = config["ntt_size"]
modulus_q = config["modulus_q"]
bit_count_q = config["bit_count_q"]
limb_q = config["limb_q"]
root_of_unity = config["root_of_unity"]
nonresidue = config["nonresidue"]
if root_of_unity == modulus_p:
sys.exit("Invalid root_of_unity value; please update in curve parameters")
weierstrass_b = config["weierstrass_b"]
weierstrass_b_g2_re = config["weierstrass_b_g2_re"]
weierstrass_b_g2_im = config["weierstrass_b_g2_im"]
g1_gen_x = config["g1_gen_x"]
g1_gen_y = config["g1_gen_y"]
g2_generator_x_re = config["g2_gen_x_re"]
g2_generator_x_im = config["g2_gen_x_im"]
g2_generator_y_re = config["g2_gen_y_re"]
g2_generator_y_im = config["g2_gen_y_im"]
params = {
'curve_name_U': curve_name.upper(),
'fp_num_limbs': limb_p,
'fq_num_limbs': limb_q,
'fp_modulus_bit_count': bit_count_p,
'fq_modulus_bit_count': bit_count_q,
'num_omegas': ntt_size
}
fp_params = get_fp_params(modulus_p, bit_count_p, limb_p, root_of_unity, ntt_size)
fq_params = get_fq_params(modulus_q, bit_count_q, limb_q, nonresidue)
generators = get_generators(g1_gen_x, g1_gen_y, g2_generator_x_re, g2_generator_x_im, g2_generator_y_re, g2_generator_y_im, 8*limb_q)
weier_params = get_weier_params(weierstrass_b, weierstrass_b_g2_re, weierstrass_b_g2_im, 8*limb_q)
return {
**params,
**fp_params,
**fq_params,
**generators,
**weier_params
}
config = None
with open(new_curve_args['curve_json']) as json_file:
config = json.load(json_file)
curve_name_lower = config["curve_name"].lower()
curve_name_upper = config["curve_name"].upper()
limb_q = config["limb_q"]
limb_p = config["limb_p"]
# Create Cuda interface
newpath = "./icicle-cuda/curves/"+curve_name
newpath = f'./icicle/curves/{curve_name_lower}'
if not os.path.exists(newpath):
os.makedirs(newpath)
fc = get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b)
text_file = open("./icicle-cuda/curves/"+curve_name+"/params.cuh", "w")
n = text_file.write(fc)
text_file.close()
with open("./icicle/curves/curve_template/params.cuh.tmpl", "r") as params_file:
params_file_template = Template(params_file.read())
params = get_params(config)
params_content = params_file_template.safe_substitute(params)
with open(f'./icicle/curves/{curve_name_lower}/params.cuh', 'w') as f:
f.write(params_content)
with open("./icicle-cuda/curves/curve_template/lde.cu", "r") as lde_file:
content = lde_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle-cuda/curves/"+curve_name+"/lde.cu", "w")
n = text_file.write(content)
text_file.close()
with open("./icicle-cuda/curves/curve_template/msm.cu", "r") as msm_file:
content = msm_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle-cuda/curves/"+curve_name+"/msm.cu", "w")
n = text_file.write(content)
text_file.close()
if new_curve_args['command'] != '-update':
with open("./icicle/curves/curve_template/lde.cu.tmpl", "r") as lde_file:
template_content = Template(lde_file.read())
lde_content = template_content.safe_substitute(
CURVE_NAME_U=curve_name_upper,
CURVE_NAME_L=curve_name_lower
)
with open(f'./icicle/curves/{curve_name_lower}/lde.cu', 'w') as f:
f.write(lde_content)
with open("./icicle/curves/curve_template/msm.cu.tmpl", "r") as msm_file:
template_content = Template(msm_file.read())
msm_content = template_content.safe_substitute(
CURVE_NAME_U=curve_name_upper,
CURVE_NAME_L=curve_name_lower
)
with open(f'./icicle/curves/{curve_name_lower}/msm.cu', 'w') as f:
f.write(msm_content)
with open("./icicle-cuda/curves/curve_template/ve_mod_mult.cu", "r") as ve_mod_mult_file:
content = ve_mod_mult_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle-cuda/curves/"+curve_name+"/ve_mod_mult.cu", "w")
n = text_file.write(content)
text_file.close()
with open("./icicle/curves/curve_template/ve_mod_mult.cu.tmpl", "r") as ve_mod_mult_file:
template_content = Template(ve_mod_mult_file.read())
ve_mod_mult_content = template_content.safe_substitute(
CURVE_NAME_U=curve_name_upper,
CURVE_NAME_L=curve_name_lower
)
with open(f'./icicle/curves/{curve_name_lower}/ve_mod_mult.cu', 'w') as f:
f.write(ve_mod_mult_content)
namespace = '#include "params.cuh"\n'+'''namespace CURVE_NAME_U {
typedef Field<PARAMS_CURVE_NAME_U::fp_config> scalar_field_t;\
typedef scalar_field_t scalar_t;\
typedef Field<PARAMS_CURVE_NAME_U::fq_config> point_field_t;
typedef Projective<point_field_t, scalar_field_t, PARAMS_CURVE_NAME_U::group_generator, PARAMS_CURVE_NAME_U::weierstrass_b> projective_t;
typedef Affine<point_field_t> affine_t;
}'''
with open(f'./icicle/curves/curve_template/curve_config.cuh.tmpl', 'r') as cc:
template_content = Template(cc.read())
cc_content = template_content.safe_substitute(
CURVE_NAME_U=curve_name_upper,
)
with open(f'./icicle/curves/{curve_name_lower}/curve_config.cuh', 'w') as f:
f.write(cc_content)
with open('./icicle-cuda/curves/'+curve_name+'/curve_config.cuh', 'w') as f:
f.write(namespace.replace("CURVE_NAME_U",curve_name.upper()))
eq = '''
#include <cuda.h>\n
#include "curve_config.cuh"\n
#include "../../primitives/projective.cuh"\n
extern "C" bool eq_CURVE_NAME_L(CURVE_NAME_U::projective_t *point1, CURVE_NAME_U::projective_t *point2)
{
return (*point1 == *point2);
}'''
with open('./icicle-cuda/curves/'+curve_name+'/projective.cu', 'w') as f:
f.write(eq.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
supported_operations = '''
#include "projective.cu"
#include "lde.cu"
#include "msm.cu"
#include "ve_mod_mult.cu"
'''
with open('./icicle-cuda/curves/'+curve_name+'/supported_operations.cu', 'w') as f:
f.write(supported_operations.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
with open('./icicle-cuda/curves/index.cu', 'a') as f:
f.write('\n#include "'+curve_name.lower()+'/supported_operations.cu"')
with open(f'./icicle/curves/curve_template/projective.cu.tmpl', 'r') as proj:
template_content = Template(proj.read())
proj_content = template_content.safe_substitute(
CURVE_NAME_U=curve_name_upper,
CURVE_NAME_L=curve_name_lower
)
with open(f'./icicle/curves/{curve_name_lower}/projective.cu', 'w') as f:
f.write(proj_content)
# Create Rust interface and tests
with open(f'./icicle/curves/curve_template/supported_operations.cu.tmpl', 'r') as supp_ops:
template_content = Template(supp_ops.read())
supp_ops_content = template_content.safe_substitute()
with open(f'./icicle/curves/{curve_name_lower}/supported_operations.cu', 'w') as f:
f.write(supp_ops_content)
if limb_p == limb_q:
with open("./src/curve_templates/curve_same_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
text_file = open("./src/curves/"+curve_name+".rs", "w")
n = text_file.write(content)
text_file.close()
else:
with open("./src/curve_templates/curve_different_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
content = content.replace("_limbs_q",str(limb_q * 8 * 4))
content = content.replace("limbs_q",str(limb_q))
text_file = open("./src/curves/"+curve_name+".rs", "w")
with open('./icicle/curves/index.cu', 'r+') as f:
index_text = f.read()
if index_text.find(curve_name_lower) == -1:
f.write(f'\n#include "{curve_name_lower}/supported_operations.cu"')
# Create Rust interface and tests
if limb_p == limb_q:
with open("./src/curve_templates/curve_same_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name_upper)
content = content.replace("CURVE_NAME_L",curve_name_lower)
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
text_file = open("./src/curves/"+curve_name_lower+".rs", "w")
n = text_file.write(content)
text_file.close()
else:
with open("./src/curve_templates/curve_different_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name_upper)
content = content.replace("CURVE_NAME_L",curve_name_lower)
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
content = content.replace("_limbs_q",str(limb_q * 8 * 4))
content = content.replace("limbs_q",str(limb_q))
text_file = open("./src/curves/"+curve_name_lower+".rs", "w")
n = text_file.write(content)
text_file.close()
with open("./src/curve_templates/test.rs", "r") as test_file:
content = test_file.read()
content = content.replace("CURVE_NAME_U",curve_name_upper)
content = content.replace("CURVE_NAME_L",curve_name_lower)
text_file = open("./src/test_"+curve_name_lower+".rs", "w")
n = text_file.write(content)
text_file.close()
with open('./src/curves/mod.rs', 'r+') as f:
mod_text = f.read()
if mod_text.find(curve_name_lower) == -1:
f.write('\npub mod ' + curve_name_lower + ';')
with open("./src/curve_templates/test.rs", "r") as test_file:
content = test_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./src/test_"+curve_name+".rs", "w")
n = text_file.write(content)
text_file.close()
with open('./src/curves/mod.rs', 'a') as f:
f.write('\n pub mod ' + curve_name + ';')
with open('./src/lib.rs', 'a') as f:
f.write('\npub mod ' + curve_name + ';')
with open('./src/lib.rs', 'r+') as f:
lib_text = f.read()
if lib_text.find(curve_name_lower) == -1:
f.write('\npub mod ' + curve_name_lower + ';')

examples/ntt/main.rs Normal file
@@ -0,0 +1,156 @@
use std::time::Instant;
use icicle::{curves::bls12_381::ScalarField_BLS12_381, test_bls12_381::*};
use rustacuda::prelude::DeviceBuffer;
const LOG_NTT_SIZES: [usize; 3] = [20, 10, 9];
const BATCH_SIZES: [usize; 3] = [1, 1 << 9, 1 << 10];
const MAX_POINTS_LOG2: usize = 18;
const MAX_SCALARS_LOG2: usize = 26;
fn bench_lde() {
for log_ntt_size in LOG_NTT_SIZES {
for batch_size in BATCH_SIZES {
let ntt_size = 1 << log_ntt_size;
fn ntt_scalars_batch_bls12_381(
d_inout: &mut DeviceBuffer<ScalarField_BLS12_381>,
d_twiddles: &mut DeviceBuffer<ScalarField_BLS12_381>,
batch_size: usize,
) -> i32 {
ntt_inplace_batch_bls12_381(d_inout, d_twiddles, batch_size, false, 0);
0
}
fn intt_scalars_batch_bls12_381(
d_inout: &mut DeviceBuffer<ScalarField_BLS12_381>,
d_twiddles: &mut DeviceBuffer<ScalarField_BLS12_381>,
batch_size: usize,
) -> i32 {
ntt_inplace_batch_bls12_381(d_inout, d_twiddles, batch_size, true, 0);
0
}
// out-of-place benchmarks (results are written to a fresh buffer)
bench_ntt_template(
MAX_SCALARS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_scalars_bls12_381,
evaluate_scalars_batch_bls12_381,
"NTT",
false,
100,
);
bench_ntt_template(
MAX_SCALARS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_scalars_bls12_381,
interpolate_scalars_batch_bls12_381,
"iNTT",
true,
100,
);
bench_ntt_template(
MAX_POINTS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_points_bls12_381,
evaluate_points_batch_bls12_381,
"EC NTT",
false,
20,
);
bench_ntt_template(
MAX_POINTS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_points_bls12_381,
interpolate_points_batch_bls12_381,
"EC iNTT",
true,
20,
);
// in-place benchmarks
bench_ntt_template(
MAX_SCALARS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_scalars_bls12_381,
ntt_scalars_batch_bls12_381,
"NTT inplace",
false,
100,
);
bench_ntt_template(
MAX_SCALARS_LOG2,
ntt_size,
batch_size,
log_ntt_size,
set_up_scalars_bls12_381,
intt_scalars_batch_bls12_381,
"iNTT inplace",
true,
100,
);
}
}
}
fn bench_ntt_template<E, S, R>(
log_max_size: usize,
ntt_size: usize,
batch_size: usize,
log_ntt_size: usize,
set_data: fn(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<E>, DeviceBuffer<E>, DeviceBuffer<S>),
bench_fn: fn(d_evaluations: &mut DeviceBuffer<E>, d_domain: &mut DeviceBuffer<S>, batch_size: usize) -> R,
id: &str,
inverse: bool,
samples: usize,
) -> Option<(Vec<E>, R)> {
let count = ntt_size * batch_size;
let bench_id = format!("{} of size 2^{} in batch {}", id, log_ntt_size, batch_size);
if count > 1 << log_max_size {
println!("Bench size exceeded: {}", bench_id);
return None;
}
println!("{}", bench_id);
let (input, mut d_evals, mut d_domain) = set_data(ntt_size * batch_size, log_ntt_size, inverse);
let first = bench_fn(&mut d_evals, &mut d_domain, batch_size);
let start = Instant::now();
for _ in 0..samples {
bench_fn(&mut d_evals, &mut d_domain, batch_size);
}
let elapsed = start.elapsed();
println!(
"{} {:0?} us x {} = {:?}",
bench_id,
elapsed.as_micros() as f32 / (samples as f32),
samples,
elapsed
);
Some((input, first))
}
fn main() {
bench_lde();
}

go.mod Normal file
@@ -0,0 +1,17 @@
module github.com/ingonyama-zk/icicle
go 1.20
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
require (
github.com/consensys/bavard v0.1.13
github.com/stretchr/testify v1.8.3
rsc.io/tmplfunc v0.0.3 // indirect
)

go.sum Normal file
@@ -0,0 +1,20 @@
github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ=
github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU=
rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA=

goicicle/Makefile Normal file
@@ -0,0 +1,34 @@
CUDA_ROOT_DIR = /usr/local/cuda
NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
CFLAGS = -Xcompiler -fPIC -std=c++17
LDFLAGS = -shared
FEATURES = -DG2_DEFINED
TARGET_BN254 = libbn254.so
TARGET_BW6761 = libbw6761.so
TARGET_BLS12_381 = libbls12_381.so
TARGET_BLS12_377 = libbls12_377.so
VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761
SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu
SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu
SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu
SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu
all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
$(TARGET_BN254):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@
$(TARGET_BW6761):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761)) -o $@
$(TARGET_BLS12_381):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@
$(TARGET_BLS12_377):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@
clean:
rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)

goicicle/README.md Normal file
@@ -0,0 +1,67 @@
# ICICLE CUDA to Golang Binding Guide
This guide provides instructions on how to compile CUDA code using the provided Makefile, and then how to use the resulting shared libraries to bind Golang to ICICLE's CUDA code.
## Prerequisites
To compile the CUDA files, you will need:
- CUDA toolkit installed. The Makefile assumes CUDA is installed in `/usr/local/cuda`. If CUDA is installed in a different location, please adjust the `CUDA_ROOT_DIR` variable accordingly.
- A compatible GPU and corresponding driver installed on your machine.
## Structure of the Makefile
The Makefile is designed to compile the CUDA sources for four curves: BN254, BLS12_381, BLS12_377 and BW6_761. The source files are located in the `icicle/curves/` directory.
## Compiling CUDA Code
1. Navigate to the directory containing the Makefile in your terminal.
2. To compile all curve libraries, use the `make all` command. This will create four shared libraries: `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so`.
3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so`, `libbls12_377.so` or `libbw6761.so` to compile those curves instead.
The resulting `.so` files are the compiled shared libraries for each curve.
## Golang Binding
The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6761.so`) can be imported in your Go project to leverage the GPU-accelerated functionality provided by ICICLE.
2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6761
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
}
```
Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
## Cleaning up
If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so` files.
## Common issues
### Cannot find shared library
In some cases you may encounter the following error, despite exporting the correct `LD_LIBRARY_PATH`.
```
/usr/local/go/pkg/tool/linux_amd64/link: running gcc failed: exit status 1
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
collect2: error: ld returned 1 exit status
```
This is normally fixed by exporting the path to the directory that contains the shared libraries: `export CGO_LDFLAGS="-L/<path_to_shared_lib>/"`
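For a quick end-to-end sanity check that the headers and shared libraries are wired up, a minimal cgo smoke test along the following lines can help. This is only a sketch: the include and library paths are placeholders you must adapt to your checkout.

```go
package main

/*
#cgo CFLAGS: -I/path/to/icicle/headers -I/usr/local/cuda/include
#cgo LDFLAGS: -L/path/to/shared/libs -lbls12_377
#include "projective.h"
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// In these bindings the BLS12-377 scalar field is 8 uint32 limbs.
	var limbs [8]uint32
	// random_scalar_bls12_377 is declared in projective.h; if this call
	// returns, the shared library was found and linked correctly.
	C.random_scalar_bls12_377((*C.BLS12_377_scalar_t)(unsafe.Pointer(&limbs[0])))
	fmt.Println("libbls12_377.so linked; sampled limbs:", limbs)
}
```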

@@ -0,0 +1,328 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"unsafe"
"encoding/binary"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
const SCALAR_SIZE = 8
const BASE_SIZE = 12
type G1ScalarField struct {
S [SCALAR_SIZE]uint32
}
type G1BaseField struct {
S [BASE_SIZE]uint32
}
/*
* BaseField Constructors
*/
func (f *G1BaseField) SetZero() *G1BaseField {
var S [BASE_SIZE]uint32
f.S = S
return f
}
func (f *G1BaseField) SetOne() *G1BaseField {
var S [BASE_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
out := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bls12_377(out, in)
return p
}
func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
copy(f.S[:], limbs[:])
return f
}
/*
* BaseField methods
*/
func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
return f.S
}
func (f *G1BaseField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* ScalarField constructors
*/
func (p *G1ScalarField) Random() *G1ScalarField {
outC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(p))
C.random_scalar_bls12_377(outC)
return p
}
func (f *G1ScalarField) SetZero() *G1ScalarField {
var S [SCALAR_SIZE]uint32
f.S = S
return f
}
func (f *G1ScalarField) SetOne() *G1ScalarField {
var S [SCALAR_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
for i, v := range a.S {
if b.S[i] != v {
return false
}
}
return true
}
/*
* ScalarField methods
*/
func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
return f.S
}
func (f *G1ScalarField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* G1ProjectivePoint
*/
type G1ProjectivePoint struct {
X, Y, Z G1BaseField
}
func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
var yOne G1BaseField
yOne.SetOne()
var xZero G1BaseField
xZero.SetZero()
var zZero G1BaseField
zZero.SetZero()
f.X = xZero
f.Y = yOne
f.Z = zZero
return f
}
func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
// Cast *G1ProjectivePoint to *C.BLS12_377_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_377_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_bls12_377(pC, pCompareC))
}
func (p *G1ProjectivePoint) IsOnCurve() bool {
point := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
res := C.projective_is_on_curve_bls12_377(point)
return bool(res)
}
func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
C.random_projective_bls12_377(outC)
return p
}
func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
return &G1PointAffine{
X: p.X,
Y: p.Y,
}
}
func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
var _x G1BaseField
var _y G1BaseField
var _z G1BaseField
_x.FromLimbs(GetFixedLimbs(x))
_y.FromLimbs(GetFixedLimbs(y))
_z.FromLimbs(GetFixedLimbs(z))
p.X = _x
p.Y = _y
p.Z = _z
return p
}
/*
* G1PointAffine
*/
type G1PointAffine struct {
X, Y G1BaseField
}
func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
in := (*C.BLS12_377_projective_t)(unsafe.Pointer(projective))
out := (*C.BLS12_377_affine_t)(unsafe.Pointer(p))
C.projective_to_affine_bls12_377(out, in)
return p
}
func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
var Z G1BaseField
Z.SetOne()
return &G1ProjectivePoint{
X: p.X,
Y: p.Y,
Z: Z,
}
}
func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
var _x G1BaseField
var _y G1BaseField
_x.FromLimbs(GetFixedLimbs(X))
_y.FromLimbs(GetFixedLimbs(Y))
p.X = _x
p.Y = _y
return p
}
/*
* Multiplication
*/
func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
pointsC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&a[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_point_bls12_377(pointsC, scalarsC, nElementsC, deviceIdC)
}
func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_scalar_bls12_377(aC, bC, nElementsC, deviceIdC)
}
// Multiply a flattened matrix by a vector of scalars:
//
// `a` - row-major flattened matrix;
// `b` - vector to multiply `a` by;
func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
c := make([]G1ScalarField, len(b))
for i := range c {
var p G1ScalarField
p.SetZero()
c[i] = p
}
aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
cC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&c[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.matrix_vec_mod_mult_bls12_377(aC, bC, cC, nElementsC, deviceIdC)
}
/*
* Utils
*/
func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
if len(*slice) <= BASE_SIZE {
limbs := [BASE_SIZE]uint32{}
copy(limbs[:len(*slice)], *slice)
return limbs
}
panic("slice has too many elements")
}
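As a usage illustration for the G1 bindings above, here is a minimal round-trip sketch. The import path is an assumption; adjust it to wherever the `bls12377` package lives in your module.

```go
package main

import (
	"fmt"

	// Hypothetical import path; adjust to your module layout.
	"github.com/ingonyama-zk/icicle/goicicle/curves/bls12377"
)

func main() {
	// Random projective point -> affine -> back to projective.
	var p bls12377.G1ProjectivePoint
	p.Random()
	var a bls12377.G1PointAffine
	a.FromProjective(&p)
	q := a.ToProjective()
	fmt.Println("on curve:", q.IsOnCurve(), "round-trip equal:", p.Eq(q))

	// Element-wise modular multiplication of two scalar vectors; the
	// result is written in place into s1 (the first argument is inout).
	s1 := make([]bls12377.G1ScalarField, 4)
	s2 := make([]bls12377.G1ScalarField, 4)
	for i := range s1 {
		s1[i].Random()
		s2[i].Random()
	}
	bls12377.MultiplyScalar(s1, s2, 0)
}
```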

@@ -0,0 +1,198 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"encoding/binary"
"testing"
"github.com/stretchr/testify/assert"
)
func TestNewFieldBLS12_377One(t *testing.T) {
var oneField G1BaseField
oneField.SetOne()
rawOneField := [BASE_SIZE]uint32{0x1}
assert.Equal(t, oneField.S, rawOneField)
}
func TestNewFieldBLS12_377Zero(t *testing.T) {
var zeroField G1BaseField
zeroField.SetZero()
rawZeroField := [BASE_SIZE]uint32{}
assert.Equal(t, zeroField.S, rawZeroField)
}
func TestFieldBLS12_377ToBytesLe(t *testing.T) {
var p G1ProjectivePoint
p.Random()
expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
for i, v := range p.X.S {
binary.LittleEndian.PutUint32(expected[i*4:], v)
}
assert.Equal(t, p.X.ToBytesLe(), expected)
assert.Equal(t, len(p.X.ToBytesLe()), BASE_SIZE*4) // 12 limbs * 4 bytes = 48
}
func TestNewPointBLS12_377Zero(t *testing.T) {
var pointZero G1ProjectivePoint
pointZero.SetZero()
var baseOne G1BaseField
baseOne.SetOne()
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, pointZero.X, zeroSanity)
assert.Equal(t, pointZero.Y, baseOne)
assert.Equal(t, pointZero.Z, zeroSanity)
}
func TestFromProjectiveToAffine(t *testing.T) {
var projective G1ProjectivePoint
var affine G1PointAffine
projective.Random()
affine.FromProjective(&projective)
var projective2 G1ProjectivePoint
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestBLS12_377Eq(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
var p2 G1ProjectivePoint
p2.Random()
assert.Equal(t, p1.Eq(&p1), true)
assert.Equal(t, p1.Eq(&p2), false)
}
func TestBLS12_377StripZ(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
p2ZLess := p1.StripZ()
assert.IsType(t, G1PointAffine{}, *p2ZLess)
assert.Equal(t, p1.X, p2ZLess.X)
assert.Equal(t, p1.Y, p2ZLess.Y)
}
func TestPointBLS12_377fromLimbs(t *testing.T) {
var p G1ProjectivePoint
p.Random()
x := p.X.Limbs()
y := p.Y.Limbs()
z := p.Z.Limbs()
xSlice := x[:]
ySlice := y[:]
zSlice := z[:]
var pFromLimbs G1ProjectivePoint
pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
assert.Equal(t, pFromLimbs, p)
}
func TestNewPointAffineNoInfinityBLS12_377Zero(t *testing.T) {
var zeroP G1PointAffine
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, zeroP.X, zeroSanity)
assert.Equal(t, zeroP.Y, zeroSanity)
}
func TestPointAffineNoInfinityBLS12_377FromLimbs(t *testing.T) {
// Initialize your test values
x := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
y := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
xSlice := x[:]
ySlice := y[:]
// Execute your function
var result G1PointAffine
result.FromLimbs(&xSlice, &ySlice)
var xBase G1BaseField
var yBase G1BaseField
xBase.FromLimbs(x)
yBase.FromLimbs(y)
// Define your expected result
expected := G1PointAffine{
X: xBase,
Y: yBase,
}
// Test if result is as expected
assert.Equal(t, expected, result)
}
func TestGetFixedLimbs(t *testing.T) {
t.Run("case of valid input of length less than 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7}
expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of valid input of length 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of empty input", func(t *testing.T) {
slice := []uint32{}
expected := [BASE_SIZE]uint32{}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of input length greater than 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
defer func() {
if r := recover(); r == nil {
t.Errorf("the code did not panic")
}
}()
GetFixedLimbs(&slice)
})
}

@@ -0,0 +1,102 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"encoding/binary"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
// G2 extension field
type G2Element [6]uint64
type ExtentionField struct {
A0, A1 G2Element
}
type G2PointAffine struct {
X, Y ExtentionField
}
type G2Point struct {
X, Y, Z ExtentionField
}
func (p *G2Point) Random() *G2Point {
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
C.random_g2_projective_bls12_377(outC)
return p
}
func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
out := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(affine))
C.g2_projective_from_affine_bls12_377(out, in)
return p
}
func (p *G2Point) Eq(pCompare *G2Point) bool {
// Cast *G2Point to *C.BLS12_377_g2_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_g2_bls12_377(pC, pCompareC))
}
func (f *G2Element) ToBytesLe() []byte {
var bytes []byte
for _, val := range f {
buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
binary.LittleEndian.PutUint64(buf, val)
bytes = append(bytes, buf...)
}
return bytes
}
func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
out := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective))
C.g2_projective_to_affine_bls12_377(out, in)
return p
}
func (p *G2Point) IsOnCurve() bool {
// Reinterpret the Go struct as the corresponding C struct (no copy is made)
point := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
res := C.g2_projective_is_on_curve_bls12_377(point)
return bool(res)
}

@@ -0,0 +1,79 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestG2Eqg2(t *testing.T) {
var point G2Point
point.Random()
assert.True(t, point.Eq(&point))
}
func TestG2FromProjectiveToAffine(t *testing.T) {
var projective G2Point
projective.Random()
var affine G2PointAffine
affine.FromProjective(&projective)
var projective2 G2Point
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestG2Eqg2NotEqual(t *testing.T) {
var point G2Point
point.Random()
var point2 G2Point
point2.Random()
assert.False(t, point.Eq(&point2))
}
func TestG2ToBytes(t *testing.T) {
element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
bytes := element.ToBytesLe()
// G2Element is [6]uint64, so the two unset limbs contribute 16 trailing zero bytes
expected := append([]byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1}, make([]byte, 16)...)
assert.Equal(t, bytes, expected)
}
func TestG2ShouldConvertToProjective(t *testing.T) {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var pointProjective G2Point
pointProjective.Random()
var pointAffine G2PointAffine
pointAffine.FromProjective(&pointProjective)
var proj G2Point
proj.FromAffine(&pointAffine)
assert.True(t, proj.IsOnCurve())
assert.True(t, pointProjective.Eq(&proj))
}

@@ -0,0 +1,98 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdbool.h>
// msm.h
#ifndef _BLS12_377_MSM_H
#define _BLS12_377_MSM_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_377 projective and affine structs
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
typedef cudaStream_t CudaStream_t;
int msm_cuda_bls12_377(
BLS12_377_projective_t* out, BLS12_377_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
int msm_batch_cuda_bls12_377(
BLS12_377_projective_t* out,
BLS12_377_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id);
int msm_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* out,
BLS12_377_g2_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t count,
size_t device_id);
int msm_batch_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* out,
BLS12_377_g2_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_g2_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_g2_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id,
cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_MSM_H */

@@ -0,0 +1,195 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ntt.h
#ifndef _BLS12_377_NTT_H
#define _BLS12_377_NTT_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_377 projective and affine structs
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
int ntt_cuda_bls12_377(BLS12_377_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
int ntt_batch_cuda_bls12_377(
BLS12_377_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
int ecntt_cuda_bls12_377(BLS12_377_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
int ecntt_batch_cuda_bls12_377(
BLS12_377_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
BLS12_377_scalar_t*
build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
int interpolate_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned device_id,
size_t stream);
int interpolate_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_points_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
size_t device_id,
size_t stream);
int interpolate_points_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_scalars_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int interpolate_scalars_batch_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned device_id,
size_t stream);
int evaluate_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_points_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
size_t device_id,
size_t stream);
int evaluate_points_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_scalars_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_377_scalar_t* coset_powers,
unsigned device_id,
size_t stream);
int evaluate_scalars_on_coset_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int reverse_order_scalars_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int reverse_order_points_cuda_bls12_377(BLS12_377_projective_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_points_batch_cuda_bls12_377(
BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int add_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
int sub_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
int to_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
int from_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
// points g1
int to_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
// points g2
int to_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_NTT_H */

@@ -0,0 +1,50 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// projective.h
#ifndef _BLS12_377_PROJECTIVE_H
#define _BLS12_377_PROJECTIVE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
bool projective_is_on_curve_bls12_377(BLS12_377_projective_t* point1);
int random_scalar_bls12_377(BLS12_377_scalar_t* out);
int random_projective_bls12_377(BLS12_377_projective_t* out);
BLS12_377_projective_t* projective_zero_bls12_377();
int projective_to_affine_bls12_377(BLS12_377_affine_t* out, BLS12_377_projective_t* point1);
int projective_from_affine_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* point1);
int random_g2_projective_bls12_377(BLS12_377_g2_projective_t* out);
int g2_projective_to_affine_bls12_377(BLS12_377_g2_affine_t* out, BLS12_377_g2_projective_t* point1);
int g2_projective_from_affine_bls12_377(BLS12_377_g2_projective_t* out, BLS12_377_g2_affine_t* point1);
bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t* point1);
bool eq_bls12_377(BLS12_377_projective_t* point1, BLS12_377_projective_t* point2);
bool eq_g2_bls12_377(BLS12_377_g2_projective_t* point1, BLS12_377_g2_projective_t* point2);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_PROJECTIVE_H */

@@ -0,0 +1,49 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ve_mod_mult.h
#ifndef _BLS12_377_VEC_MULT_H
#define _BLS12_377_VEC_MULT_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
int32_t vec_mod_mult_point_bls12_377(
BLS12_377_projective_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_scalar_bls12_377(
BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_device_scalar_bls12_377(
BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t matrix_vec_mod_mult_bls12_377(
BLS12_377_scalar_t* matrix_flattened,
BLS12_377_scalar_t* input,
BLS12_377_scalar_t* output,
size_t n_elements,
size_t device_id);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_VEC_MULT_H */

@@ -0,0 +1,209 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"errors"
"fmt"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "msm.h"
import "C"
func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(out))
ret := C.msm_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(out))
ret := C.msm_g2_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_g2_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G1ProjectivePoint, batchSize)
for i := 0; i < len(out); i++ {
var p G1ProjectivePoint
p.SetZero()
out[i] = p
}
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G2Point, batchSize)
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_g2_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_377_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_g2_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_377_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
ret := C.commit_batch_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
// Call the batched G2 commit entry point (argument order per msm.h) on the default stream.
ret := C.commit_batch_g2_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0, nil)
if ret != 0 {
return -1
}
return 0
}
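A minimal host-side sketch of the single-MSM entry point above follows; the import path is assumed, and real inputs would come from your prover rather than from Random().

```go
package main

import (
	"fmt"
	"log"

	// Hypothetical import path; adjust to your module layout.
	"github.com/ingonyama-zk/icicle/goicicle/curves/bls12377"
)

func main() {
	const size = 1 << 10

	// Matching point and scalar vectors on the host.
	points := make([]bls12377.G1PointAffine, size)
	scalars := make([]bls12377.G1ScalarField, size)
	for i := 0; i < size; i++ {
		var p bls12377.G1ProjectivePoint
		p.Random()
		points[i].FromProjective(&p)
		scalars[i].Random()
	}

	// Single MSM on device 0; the result is written into out.
	out := new(bls12377.G1ProjectivePoint)
	if _, err := bls12377.Msm(out, points, scalars, 0); err != nil {
		log.Fatal(err)
	}
	fmt.Println("MSM result on curve:", out.IsOnCurve())
}
```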

@@ -0,0 +1,360 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"math"
"testing"
"time"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
"github.com/stretchr/testify/assert"
)
func GeneratePoints(count int) []G1PointAffine {
// Declare a slice of affine points
var points []G1PointAffine
// populate the slice
for i := 0; i < 10; i++ {
var pointProjective G1ProjectivePoint
pointProjective.Random()
var pointAffine G1PointAffine
pointAffine.FromProjective(&pointProjective)
points = append(points, pointAffine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func GeneratePointsProj(count int) []G1ProjectivePoint {
// Declare a slice of projective points
var points []G1ProjectivePoint
// Use a loop to populate the slice
for i := 0; i < count; i++ {
var p G1ProjectivePoint
p.Random()
points = append(points, p)
}
return points
}
func GenerateScalars(count int, skewed bool) []G1ScalarField {
// Declare a slice of scalars
var scalars []G1ScalarField
var rand G1ScalarField
var zero G1ScalarField
var one G1ScalarField
var randLarge G1ScalarField
zero.SetZero()
one.SetOne()
randLarge.Random()
if skewed && count > 1_200_000 {
for i := 0; i < count-1_200_000; i++ {
rand.Random()
scalars = append(scalars, rand)
}
for i := 0; i < 600_000; i++ {
scalars = append(scalars, randLarge)
}
for i := 0; i < 400_000; i++ {
scalars = append(scalars, zero)
}
for i := 0; i < 200_000; i++ {
scalars = append(scalars, one)
}
} else {
for i := 0; i < count; i++ {
rand.Random()
scalars = append(scalars, rand)
}
}
return scalars[:count]
}
func TestMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G1ProjectivePoint)
startTime := time.Now()
_, e := Msm(out, points, scalars, 0) // non mont
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func TestCommitMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1<<v - 1
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
// G1ProjectivePoint is three 12-limb base-field elements (144 bytes) and
// G1PointAffine is two (96 bytes); use unsafe.Sizeof rather than hard-coded sizes.
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(G1ProjectivePoint{})))
pointsBytes := count * int(unsafe.Sizeof(G1PointAffine{}))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := Commit(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G1ProjectivePoint, 1)
goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, int(unsafe.Sizeof(G1ProjectivePoint{})))
assert.Equal(t, e, 0, "error should be 0")
assert.True(t, outHost[0].IsOnCurve())
}
}
func BenchmarkCommit(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(G1ProjectivePoint{})))
pointsBytes := msmSize * int(unsafe.Sizeof(G1PointAffine{}))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := msmSize * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
e := Commit(out_d, scalars_d, points_d, msmSize, 10)
if e != 0 {
panic("Error occurred")
}
}
})
}
}
func TestBatchMSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GeneratePoints(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}
func BenchmarkMSM(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G1ProjectivePoint)
_, e := Msm(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
// G2
func GenerateG2Points(count int) []G2PointAffine {
// Declare a slice of G2 affine points
var points []G2PointAffine
// populate the slice
for i := 0; i < 10; i++ {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var p G2Point
p.Random()
var affine G2PointAffine
affine.FromProjective(&p)
points = append(points, affine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func TestMsmG2BLS12_377(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func BenchmarkMsmG2BLS12_377(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GenerateG2Points(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
func TestCommitG2MSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
var sizeCheckG2PointAffine G2PointAffine
inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
var sizeCheckG2Point G2Point
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := CommitG2(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G2Point, 1)
goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
assert.Equal(t, e, 0, "error should be 0")
assert.Equal(t, len(outHost), 1)
result := outHost[0]
assert.True(t, result.IsOnCurve())
}
}
func TestBatchG2MSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GenerateG2Points(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}

@@ -0,0 +1,222 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"errors"
"fmt"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "ntt.h"
import "C"
const (
NONE = 0
DIF = 1
DIT = 2
)
func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
ret := C.ntt_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
return uint64(ret)
}
func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
isInverseC := C.bool(isInverse)
batchSizeC := C.uint32_t(batchSize)
deviceIdC := C.size_t(deviceId)
ret := C.ntt_batch_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
ret := C.ecntt_cuda_bls12_377(valuesC, n, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
batchSizeC := C.uint32_t(batchSize)
ret := C.ecntt_batch_cuda_bls12_377(valuesC, n, batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
domain_size := C.uint32_t(d_size)
logn := C.uint32_t(log_d_size)
is_inverse := C.bool(inverse)
dp := C.build_domain_cuda_bls12_377(domain_size, logn, is_inverse, 0, 0)
if dp == nil {
err = errors.New("nullptr returned from generating twiddles")
return unsafe.Pointer(nil), err
}
return unsafe.Pointer(dp), nil
}
// Reverses d_scalars in-place
func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.int(len)
if success := C.reverse_order_scalars_cuda_bls12_377(scalarsC, lenC, 0, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
size_d := size * 32
dp, err := goicicle.CudaMalloc(size_d)
if err != nil {
return nil
}
d_out := (*C.BLS12_377_scalar_t)(dp)
scalarsC := (*C.BLS12_377_scalar_t)(scalars)
twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
cosetPowersC := (*C.BLS12_377_scalar_t)(cosetPowers)
sizeC := C.uint(size)
var ret C.int
if isCoset {
ret = C.interpolate_scalars_on_coset_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
} else {
ret = C.interpolate_scalars_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
}
return unsafe.Pointer(d_out)
}
func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
scalars_outC := (*C.BLS12_377_scalar_t)(scalars_out)
scalarsC := (*C.BLS12_377_scalar_t)(scalars)
twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
coset_powersC := (*C.BLS12_377_scalar_t)(coset_powers)
sizeC := C.uint(scalars_size)
twiddlesC_size := C.uint(twiddles_size)
var ret C.int
if isCoset {
ret = C.evaluate_scalars_on_coset_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
} else {
ret = C.evaluate_scalars_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
return -1
}
return 0
}
func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.add_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error adding scalar vectors")
return -1
}
return 0
}
func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.sub_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error subtracting scalar vectors")
return -1
}
return 0
}
func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.to_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.from_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_377_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_g2_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
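The helpers above compose into the usual interpolate/evaluate pipeline: bit-reverse the device-resident evaluations, interpolate them against an inverse-twiddle domain, then evaluate the resulting coefficients on a forward domain. The sketch below is illustrative rather than part of the generated API; it assumes the standard `math` package is imported alongside this file's imports, that the `goicicle` memory helpers behave as they do in the tests elsewhere in this repository, that scalars occupy 32 bytes each, and it does not free the device buffers for brevity.
func interpolateThenEvaluate(evals []G1ScalarField) ([]G1ScalarField, error) {
n := len(evals)
logn := int(math.Log2(float64(n)))
// Copy the evaluations to the device (32 bytes per scalar).
d_evals, err := goicicle.CudaMalloc(n * 32)
if err != nil {
return nil, err
}
goicicle.CudaMemCpyHtoD[G1ScalarField](d_evals, evals, n*32)
// The NTT kernels expect bit-reversed input order.
if _, err := ReverseScalars(d_evals, n); err != nil {
return nil, err
}
// Twiddles for interpolation (inverse domain) and evaluation (forward domain).
invTwiddles, err := GenerateTwiddles(n, logn, true)
if err != nil {
return nil, err
}
fwdTwiddles, err := GenerateTwiddles(n, logn, false)
if err != nil {
return nil, err
}
// Interpolate allocates and returns a device pointer to the coefficients.
d_coeffs := Interpolate(d_evals, invTwiddles, nil, n, false)
if d_coeffs == nil {
return nil, errors.New("interpolation failed")
}
d_out, err := goicicle.CudaMalloc(n * 32)
if err != nil {
return nil, err
}
if Evaluate(d_out, d_coeffs, fwdTwiddles, nil, n, n, false) != 0 {
return nil, errors.New("evaluation failed")
}
out := make([]G1ScalarField, n)
goicicle.CudaMemCpyDtoH[G1ScalarField](out, d_out, n*32)
return out, nil
}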

View File

@@ -0,0 +1,148 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"github.com/stretchr/testify/assert"
"reflect"
"testing"
)
func TestNttBLS12_377Batch(t *testing.T) {
count := 1 << 20
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
NttBatch(&nttResult, false, count, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBLS12_377CompareToGnarkDIF(t *testing.T) {
count := 1 << 2
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestINttBLS12_377CompareToGnarkDIT(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, true, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBLS12_377(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
inttResult := make([]G1ScalarField, len(nttResult))
copy(inttResult, nttResult)
assert.Equal(t, inttResult, nttResult)
Ntt(&inttResult, true, 0)
assert.Equal(t, inttResult, scalars)
}
func TestNttBatchBLS12_377(t *testing.T) {
count := 1 << 5
batches := 4
scalars := GenerateScalars(count*batches, false)
var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
for i := 0; i < batches; i++ {
start := i * count
end := (i + 1) * count
batch := make([]G1ScalarField, len(scalars[start:end]))
copy(batch, scalars[start:end])
scalarVecOfVec = append(scalarVecOfVec, batch)
}
nttBatchResult := make([]G1ScalarField, len(scalars))
copy(nttBatchResult, scalars)
NttBatch(&nttBatchResult, false, count, 0)
var nttResultVecOfVec [][]G1ScalarField
for i := 0; i < batches; i++ {
// Clone the slice
clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
copy(clone, scalarVecOfVec[i])
// Add it to the result vector of vectors
nttResultVecOfVec = append(nttResultVecOfVec, clone)
// Call the ntt_bls12_377 function
Ntt(&nttResultVecOfVec[i], false, 0)
}
assert.NotEqual(t, nttBatchResult, scalars)
// Check that the batched NTT result matches the NTT of each batch computed individually
for i := 0; i < batches; i++ {
if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
}
}
}
func BenchmarkNTT(b *testing.B) {
LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
for _, logNTTSize := range LOG_NTT_SIZES {
nttSize := 1 << logNTTSize
b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
scalars := GenerateScalars(nttSize, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
for n := 0; n < b.N; n++ {
Ntt(&nttResult, false, 0)
}
})
}
}

View File

@@ -0,0 +1,38 @@
package bls12377
import "encoding/binary"
// Converts [8]uint32 to [4]uint64, packing each pair of limbs little-endian
// (low word first) so that it round-trips with ConvertUint64ArrToUint32Arr4 below
func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
var arr64 [4]uint64
for i := 0; i < len(arr32); i += 2 {
arr64[i/2] = uint64(arr32[i]) | (uint64(arr32[i+1]) << 32)
}
return arr64
}
func ConvertUint64ArrToUint32Arr4(arr64 [4]uint64) [8]uint32 {
var arr32 [8]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
func ConvertUint64ArrToUint32Arr6(arr64 [6]uint64) [12]uint32 {
var arr32 [12]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
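Since the two directions are meant to be mutual inverses under the little-endian limb order used throughout these bindings, a round trip should reproduce the input. A test-style sketch (it would live in a `_test.go` file and assumes only the standard `testing` import):
func TestConvertRoundTrip(t *testing.T) {
arr32 := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
arr64 := ConvertUint32ArrToUint64Arr(arr32)
back := ConvertUint64ArrToUint32Arr4(arr64)
if back != arr32 {
t.Errorf("round trip failed: got %v, want %v", back, arr32)
}
}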

View File

@@ -0,0 +1,42 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "ve_mod_mult.h"
import "C"
import (
"fmt"
"unsafe"
)
func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
scalarVec1C := (*C.BLS12_377_scalar_t)(scalarVec1)
scalarVec2C := (*C.BLS12_377_scalar_t)(scalarVec2)
sizeC := C.size_t(size)
ret := C.vec_mod_mult_device_scalar_bls12_377(scalarVec1C, scalarVec2C, sizeC, 0)
if ret != 0 {
fmt.Print("error multiplying scalar vectors")
return -1
}
return 0
}
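A minimal usage sketch for the wrapper above, assuming host slices of equal length plus the `goicicle` memory helpers and an `errors` import (this file itself only imports `fmt` and `unsafe`); scalars are 32 bytes each. The multiplication is in place: the first device buffer receives the products.
func mulPairwiseOnDevice(a, b []G1ScalarField) ([]G1ScalarField, error) {
if len(a) != len(b) {
return nil, errors.New("a and b have different lengths")
}
bytes := len(a) * 32
a_d, err := goicicle.CudaMalloc(bytes)
if err != nil {
return nil, err
}
b_d, err := goicicle.CudaMalloc(bytes)
if err != nil {
return nil, err
}
goicicle.CudaMemCpyHtoD[G1ScalarField](a_d, a, bytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](b_d, b, bytes)
// a_d[i] <- a_d[i] * b_d[i] (mod p), computed on the device.
if VecScalarMulMod(a_d, b_d, len(a)) != 0 {
return nil, errors.New("vec_mod_mult_device_scalar failed")
}
out := make([]G1ScalarField, len(a))
goicicle.CudaMemCpyDtoH[G1ScalarField](out, a_d, bytes)
return out, nil
}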

View File

@@ -0,0 +1,328 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"unsafe"
"encoding/binary"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
const SCALAR_SIZE = 8
const BASE_SIZE = 12
type G1ScalarField struct {
S [SCALAR_SIZE]uint32
}
type G1BaseField struct {
S [BASE_SIZE]uint32
}
/*
* BaseField Constructors
*/
func (f *G1BaseField) SetZero() *G1BaseField {
var S [BASE_SIZE]uint32
f.S = S
return f
}
func (f *G1BaseField) SetOne() *G1BaseField {
var S [BASE_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
out := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_381_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bls12_381(out, in)
return p
}
func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
copy(f.S[:], limbs[:])
return f
}
/*
* BaseField methods
*/
func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
return f.S
}
func (f *G1BaseField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* ScalarField methods
*/
func (p *G1ScalarField) Random() *G1ScalarField {
outC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(p))
C.random_scalar_bls12_381(outC)
return p
}
func (f *G1ScalarField) SetZero() *G1ScalarField {
var S [SCALAR_SIZE]uint32
f.S = S
return f
}
func (f *G1ScalarField) SetOne() *G1ScalarField {
var S [SCALAR_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
for i, v := range a.S {
if b.S[i] != v {
return false
}
}
return true
}
/*
* ScalarField methods
*/
func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
return f.S
}
func (f *G1ScalarField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* PointBLS12_381
*/
type G1ProjectivePoint struct {
X, Y, Z G1BaseField
}
func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
var yOne G1BaseField
yOne.SetOne()
var xZero G1BaseField
xZero.SetZero()
var zZero G1BaseField
zZero.SetZero()
f.X = xZero
f.Y = yOne
f.Z = zZero
return f
}
func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
// Cast *G1ProjectivePoint to *C.BLS12_381_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_381_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_bls12_381(pC, pCompareC))
}
func (p *G1ProjectivePoint) IsOnCurve() bool {
point := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
res := C.projective_is_on_curve_bls12_381(point)
return bool(res)
}
func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
C.random_projective_bls12_381(outC)
return p
}
func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
return &G1PointAffine{
X: p.X,
Y: p.Y,
}
}
func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
var _x G1BaseField
var _y G1BaseField
var _z G1BaseField
_x.FromLimbs(GetFixedLimbs(x))
_y.FromLimbs(GetFixedLimbs(y))
_z.FromLimbs(GetFixedLimbs(z))
p.X = _x
p.Y = _y
p.Z = _z
return p
}
/*
* PointAffineNoInfinityBLS12_381
*/
type G1PointAffine struct {
X, Y G1BaseField
}
func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
in := (*C.BLS12_381_projective_t)(unsafe.Pointer(projective))
out := (*C.BLS12_381_affine_t)(unsafe.Pointer(p))
C.projective_to_affine_bls12_381(out, in)
return p
}
func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
var Z G1BaseField
Z.SetOne()
return &G1ProjectivePoint{
X: p.X,
Y: p.Y,
Z: Z,
}
}
func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
var _x G1BaseField
var _y G1BaseField
_x.FromLimbs(GetFixedLimbs(X))
_y.FromLimbs(GetFixedLimbs(Y))
p.X = _x
p.Y = _y
return p
}
/*
* Multiplication
*/
func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
pointsC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&a[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_point_bls12_381(pointsC, scalarsC, nElementsC, deviceIdC)
}
func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_scalar_bls12_381(aC, bC, nElementsC, deviceIdC)
}
// Multiply a flattened matrix `a` by the vector `b`, returning the result
// vector (one scalar per element of `b`):
func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) []G1ScalarField {
c := make([]G1ScalarField, len(b))
for i := range c {
var p G1ScalarField
p.SetZero()
c[i] = p
}
aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
cC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&c[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.matrix_vec_mod_mult_bls12_381(aC, bC, cC, nElementsC, deviceIdC)
return c
}
/*
* Utils
*/
func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
if len(*slice) <= BASE_SIZE {
limbs := [BASE_SIZE]uint32{}
copy(limbs[:len(*slice)], *slice)
return limbs
}
panic("slice has too many elements")
}

View File

@@ -0,0 +1,198 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"encoding/binary"
"testing"
"github.com/stretchr/testify/assert"
)
func TestNewFieldBLS12_381One(t *testing.T) {
var oneField G1BaseField
oneField.SetOne()
rawOneField := [12]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
assert.Equal(t, oneField.S, rawOneField)
}
func TestNewFieldBLS12_381Zero(t *testing.T) {
var zeroField G1BaseField
zeroField.SetZero()
rawZeroField := [12]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
assert.Equal(t, zeroField.S, rawZeroField)
}
func TestFieldBLS12_381ToBytesLe(t *testing.T) {
var p G1ProjectivePoint
p.Random()
expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
for i, v := range p.X.S {
binary.LittleEndian.PutUint32(expected[i*4:], v)
}
assert.Equal(t, p.X.ToBytesLe(), expected)
assert.Equal(t, len(p.X.ToBytesLe()), 48) // 12 limbs of 4 bytes each
}
func TestNewPointBLS12_381Zero(t *testing.T) {
var pointZero G1ProjectivePoint
pointZero.SetZero()
var baseOne G1BaseField
baseOne.SetOne()
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, pointZero.X, zeroSanity)
assert.Equal(t, pointZero.Y, baseOne)
assert.Equal(t, pointZero.Z, zeroSanity)
}
func TestFromProjectiveToAffine(t *testing.T) {
var projective G1ProjectivePoint
var affine G1PointAffine
projective.Random()
affine.FromProjective(&projective)
var projective2 G1ProjectivePoint
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestBLS12_381Eq(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
var p2 G1ProjectivePoint
p2.Random()
assert.Equal(t, p1.Eq(&p1), true)
assert.Equal(t, p1.Eq(&p2), false)
}
func TestBLS12_381StripZ(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
p2ZLess := p1.StripZ()
assert.IsType(t, G1PointAffine{}, *p2ZLess)
assert.Equal(t, p1.X, p2ZLess.X)
assert.Equal(t, p1.Y, p2ZLess.Y)
}
func TestPointBLS12_381fromLimbs(t *testing.T) {
var p G1ProjectivePoint
p.Random()
x := p.X.Limbs()
y := p.Y.Limbs()
z := p.Z.Limbs()
xSlice := x[:]
ySlice := y[:]
zSlice := z[:]
var pFromLimbs G1ProjectivePoint
pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
assert.Equal(t, pFromLimbs, p)
}
func TestNewPointAffineNoInfinityBLS12_381Zero(t *testing.T) {
var zeroP G1PointAffine
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, zeroP.X, zeroSanity)
assert.Equal(t, zeroP.Y, zeroSanity)
}
func TestPointAffineNoInfinityBLS12_381FromLimbs(t *testing.T) {
// Initialize your test values
x := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
y := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
xSlice := x[:]
ySlice := y[:]
// Execute your function
var result G1PointAffine
result.FromLimbs(&xSlice, &ySlice)
var xBase G1BaseField
var yBase G1BaseField
xBase.FromLimbs(x)
yBase.FromLimbs(y)
// Define your expected result
expected := G1PointAffine{
X: xBase,
Y: yBase,
}
// Test if result is as expected
assert.Equal(t, expected, result)
}
func TestGetFixedLimbs(t *testing.T) {
t.Run("case of valid input of length less than BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7}
expected := [12]uint32{1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of valid input of length BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
expected := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of empty input", func(t *testing.T) {
slice := []uint32{}
expected := [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of input length greater than BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
defer func() {
if r := recover(); r == nil {
t.Errorf("the code did not panic")
}
}()
GetFixedLimbs(&slice)
})
}

View File

@@ -0,0 +1,102 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"encoding/binary"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
// G2 extension field
type G2Element [6]uint64
type ExtentionField struct {
A0, A1 G2Element
}
type G2PointAffine struct {
X, Y ExtentionField
}
type G2Point struct {
X, Y, Z ExtentionField
}
func (p *G2Point) Random() *G2Point {
outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
C.random_g2_projective_bls12_381(outC)
return p
}
func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
out := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(affine))
C.g2_projective_from_affine_bls12_381(out, in)
return p
}
func (p *G2Point) Eq(pCompare *G2Point) bool {
// Cast *G2Point to *C.BLS12_381_g2_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_g2_bls12_381(pC, pCompareC))
}
func (f *G2Element) ToBytesLe() []byte {
var bytes []byte
for _, val := range f {
buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
binary.LittleEndian.PutUint64(buf, val)
bytes = append(bytes, buf...)
}
return bytes
}
func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
out := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(p))
in := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(projective))
C.g2_projective_to_affine_bls12_381(out, in)
return p
}
func (p *G2Point) IsOnCurve() bool {
// Cast to the C struct type; the point is read in place, no copy is made
point := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(p))
res := C.g2_projective_is_on_curve_bls12_381(point)
return bool(res)
}

View File

@@ -0,0 +1,79 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestG2Eqg2(t *testing.T) {
var point G2Point
point.Random()
assert.True(t, point.Eq(&point))
}
func TestG2FromProjectiveToAffine(t *testing.T) {
var projective G2Point
projective.Random()
var affine G2PointAffine
affine.FromProjective(&projective)
var projective2 G2Point
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestG2Eqg2NotEqual(t *testing.T) {
var point G2Point
point.Random()
var point2 G2Point
point2.Random()
assert.False(t, point.Eq(&point2))
}
func TestG2ToBytes(t *testing.T) {
element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
bytes := element.ToBytesLe()
expected := []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1}
// The two high limbs of the [6]uint64 element were left zero above,
// so 16 trailing zero bytes follow in the little-endian encoding.
expected = append(expected, make([]byte, 16)...)
assert.Equal(t, bytes, expected)
}
func TestG2ShouldConvertToProjective(t *testing.T) {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var pointProjective G2Point
pointProjective.Random()
var pointAffine G2PointAffine
pointAffine.FromProjective(&pointProjective)
var proj G2Point
proj.FromAffine(&pointAffine)
assert.True(t, proj.IsOnCurve())
assert.True(t, pointProjective.Eq(&proj))
}

View File

@@ -0,0 +1,98 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdbool.h>
// msm.h
#ifndef _BLS12_381_MSM_H
#define _BLS12_381_MSM_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_381 projective and affine structs
typedef struct BLS12_381_projective_t BLS12_381_projective_t;
typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
typedef struct BLS12_381_affine_t BLS12_381_affine_t;
typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
typedef cudaStream_t CudaStream_t;
int msm_cuda_bls12_381(
BLS12_381_projective_t* out, BLS12_381_affine_t* points, BLS12_381_scalar_t* scalars, size_t count, size_t device_id);
int msm_batch_cuda_bls12_381(
BLS12_381_projective_t* out,
BLS12_381_affine_t* points,
BLS12_381_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_scalar_t* d_scalars,
BLS12_381_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_scalar_t* d_scalars,
BLS12_381_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id);
int msm_g2_cuda_bls12_381(
BLS12_381_g2_projective_t* out,
BLS12_381_g2_affine_t* points,
BLS12_381_scalar_t* scalars,
size_t count,
size_t device_id);
int msm_batch_g2_cuda_bls12_381(
BLS12_381_g2_projective_t* out,
BLS12_381_g2_affine_t* points,
BLS12_381_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_g2_cuda_bls12_381(
BLS12_381_g2_projective_t* d_out,
BLS12_381_scalar_t* d_scalars,
BLS12_381_g2_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_g2_cuda_bls12_381(
BLS12_381_g2_projective_t* d_out,
BLS12_381_scalar_t* d_scalars,
BLS12_381_g2_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id,
cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_381_MSM_H */

View File

@@ -0,0 +1,195 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ntt.h
#ifndef _BLS12_381_NTT_H
#define _BLS12_381_NTT_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_381 projective and affine structs
typedef struct BLS12_381_projective_t BLS12_381_projective_t;
typedef struct BLS12_381_affine_t BLS12_381_affine_t;
typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
int ntt_cuda_bls12_381(BLS12_381_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
int ntt_batch_cuda_bls12_381(
BLS12_381_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
int ecntt_cuda_bls12_381(BLS12_381_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
int ecntt_batch_cuda_bls12_381(
BLS12_381_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
BLS12_381_scalar_t*
build_domain_cuda_bls12_381(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
int interpolate_scalars_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
unsigned device_id,
size_t stream);
int interpolate_scalars_batch_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_points_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
size_t device_id,
size_t stream);
int interpolate_points_batch_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_scalars_on_coset_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
BLS12_381_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int interpolate_scalars_batch_on_coset_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_evaluations,
BLS12_381_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
BLS12_381_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_scalars_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned device_id,
size_t stream);
int evaluate_scalars_batch_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_points_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
size_t device_id,
size_t stream);
int evaluate_points_batch_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_scalars_on_coset_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_381_scalar_t* coset_powers,
unsigned device_id,
size_t stream);
int evaluate_scalars_on_coset_batch_cuda_bls12_381(
BLS12_381_scalar_t* d_out,
BLS12_381_scalar_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_381_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_381_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_batch_cuda_bls12_381(
BLS12_381_projective_t* d_out,
BLS12_381_projective_t* d_coefficients,
BLS12_381_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_381_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int reverse_order_scalars_cuda_bls12_381(BLS12_381_scalar_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_scalars_batch_cuda_bls12_381(
BLS12_381_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int reverse_order_points_cuda_bls12_381(BLS12_381_projective_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_points_batch_cuda_bls12_381(
BLS12_381_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int add_scalars_cuda_bls12_381(
BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
int sub_scalars_cuda_bls12_381(
BLS12_381_scalar_t* d_out, BLS12_381_scalar_t* d_in1, BLS12_381_scalar_t* d_in2, unsigned n, size_t stream);
int to_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);
int from_montgomery_scalars_cuda_bls12_381(BLS12_381_scalar_t* d_inout, unsigned n, size_t stream);
// points g1
int to_montgomery_proj_points_cuda_bls12_381(BLS12_381_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_cuda_bls12_381(BLS12_381_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_cuda_bls12_381(BLS12_381_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_cuda_bls12_381(BLS12_381_affine_t* d_inout, unsigned n, size_t stream);
// points g2
int to_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_g2_cuda_bls12_381(BLS12_381_g2_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381_g2_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_g2_cuda_bls12_381(BLS12_381_g2_affine_t* d_inout, unsigned n, size_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_381_NTT_H */
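Not every prototype above has a Go wrapper in the files shown here; the point-domain variants follow the same cgo pattern as the scalar ones. As an illustration (not generated code), a wrapper for reverse_order_scalars' point counterpart in the same bls12381 package, with the cgo preamble and `errors`/`unsafe` imports already in place, could look like:
// Reverses n device-resident projective points in-place on the default stream.
func ReversePoints(d_points unsafe.Pointer, n int) (int, error) {
pointsC := (*C.BLS12_381_projective_t)(d_points)
if success := C.reverse_order_points_cuda_bls12_381(pointsC, C.int(n), 0, 0); success != 0 {
return -1, errors.New("reversing points failed")
}
return 0, nil
}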

View File

@@ -0,0 +1,50 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// projective.h
#ifndef _BLS12_381_PROJECTIVE_H
#define _BLS12_381_PROJECTIVE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_381_projective_t BLS12_381_projective_t;
typedef struct BLS12_381_g2_projective_t BLS12_381_g2_projective_t;
typedef struct BLS12_381_affine_t BLS12_381_affine_t;
typedef struct BLS12_381_g2_affine_t BLS12_381_g2_affine_t;
typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
bool projective_is_on_curve_bls12_381(BLS12_381_projective_t* point1);
int random_scalar_bls12_381(BLS12_381_scalar_t* out);
int random_projective_bls12_381(BLS12_381_projective_t* out);
BLS12_381_projective_t* projective_zero_bls12_381();
int projective_to_affine_bls12_381(BLS12_381_affine_t* out, BLS12_381_projective_t* point1);
int projective_from_affine_bls12_381(BLS12_381_projective_t* out, BLS12_381_affine_t* point1);
int random_g2_projective_bls12_381(BLS12_381_g2_projective_t* out);
int g2_projective_to_affine_bls12_381(BLS12_381_g2_affine_t* out, BLS12_381_g2_projective_t* point1);
int g2_projective_from_affine_bls12_381(BLS12_381_g2_projective_t* out, BLS12_381_g2_affine_t* point1);
bool g2_projective_is_on_curve_bls12_381(BLS12_381_g2_projective_t* point1);
bool eq_bls12_381(BLS12_381_projective_t* point1, BLS12_381_projective_t* point2);
bool eq_g2_bls12_381(BLS12_381_g2_projective_t* point1, BLS12_381_g2_projective_t* point2);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_381_PROJECTIVE_H */

View File

@@ -0,0 +1,49 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ve_mod_mult.h
#ifndef _BLS12_381_VEC_MULT_H
#define _BLS12_381_VEC_MULT_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_381_projective_t BLS12_381_projective_t;
typedef struct BLS12_381_scalar_t BLS12_381_scalar_t;
int32_t vec_mod_mult_point_bls12_381(
BLS12_381_projective_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_scalar_bls12_381(
BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_device_scalar_bls12_381(
BLS12_381_scalar_t* inout, BLS12_381_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t matrix_vec_mod_mult_bls12_381(
BLS12_381_scalar_t* matrix_flattened,
BLS12_381_scalar_t* input,
BLS12_381_scalar_t* output,
size_t n_elements,
size_t device_id);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_381_VEC_MULT_H */

View File

@@ -0,0 +1,209 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"errors"
"fmt"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "msm.h"
import "C"
func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_381_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(out))
ret := C.msm_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_cuda_bls12_381 returned error code: %d", ret)
}
return out, nil
}
func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(out))
ret := C.msm_g2_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_g2_cuda_bls12_381 returned error code: %d", ret)
}
return out, nil
}
func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G1ProjectivePoint, batchSize)
for i := 0; i < len(out); i++ {
var p G1ProjectivePoint
p.SetZero()
out[i] = p
}
outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_381_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret)
}
return out, nil
}
func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G2Point, batchSize)
outC := (*C.BLS12_381_g2_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_381_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_g2_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret)
}
return out, nil
}
func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_381_projective_t)(d_out)
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
pointsC := (*C.BLS12_381_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_381_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_g2_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_381_projective_t)(d_out)
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
pointsC := (*C.BLS12_381_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
ret := C.commit_batch_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_381_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
// Call the commit_batch variant declared in msm.h: it takes scalars before
// points, plus a device id and stream (the default stream is used here).
ret := C.commit_batch_g2_cuda_bls12_381(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0, nil)
if ret != 0 {
return -1
}
return 0
}

View File

@@ -0,0 +1,360 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"fmt"
"math"
"testing"
"time"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
"github.com/stretchr/testify/assert"
)
func GeneratePoints(count int) []G1PointAffine {
// Declare a slice of affine points
var points []G1PointAffine
// seed the slice with 10 random points; it is doubled below to reach count
for i := 0; i < 10; i++ {
var pointProjective G1ProjectivePoint
pointProjective.Random()
var pointAffine G1PointAffine
pointAffine.FromProjective(&pointProjective)
points = append(points, pointAffine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func GeneratePointsProj(count int) []G1ProjectivePoint {
// Declare a slice of projective points
var points []G1ProjectivePoint
// Use a loop to populate the slice
for i := 0; i < count; i++ {
var p G1ProjectivePoint
p.Random()
points = append(points, p)
}
return points
}
func GenerateScalars(count int, skewed bool) []G1ScalarField {
// Declare a slice of scalars
var scalars []G1ScalarField
var rand G1ScalarField
var zero G1ScalarField
var one G1ScalarField
var randLarge G1ScalarField
zero.SetZero()
one.SetOne()
randLarge.Random()
if skewed && count > 1_200_000 {
for i := 0; i < count-1_200_000; i++ {
rand.Random()
scalars = append(scalars, rand)
}
for i := 0; i < 600_000; i++ {
scalars = append(scalars, randLarge)
}
for i := 0; i < 400_000; i++ {
scalars = append(scalars, zero)
}
for i := 0; i < 200_000; i++ {
scalars = append(scalars, one)
}
} else {
for i := 0; i < count; i++ {
rand.Random()
scalars = append(scalars, rand)
}
}
return scalars[:count]
}
func TestMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G1ProjectivePoint)
startTime := time.Now()
_, e := Msm(out, points, scalars, 0) // inputs are not in Montgomery form
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func TestCommitMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1<<v - 1
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
// BLS12-381 G1 sizes: 48-byte base field elements, so affine points are
// 96 bytes and projective points 144; scalars are 32 bytes.
var affineCheck G1PointAffine
var projCheck G1ProjectivePoint
projBytes := int(unsafe.Sizeof(projCheck))
out_d, _ := goicicle.CudaMalloc(projBytes)
pointsBytes := count * int(unsafe.Sizeof(affineCheck))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := Commit(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G1ProjectivePoint, 1)
goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, projBytes)
assert.Equal(t, e, 0, "error should be 0")
assert.True(t, outHost[0].IsOnCurve())
}
}
func BenchmarkCommit(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
var affineCheck G1PointAffine
var projCheck G1ProjectivePoint
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(projCheck)))
pointsBytes := msmSize * int(unsafe.Sizeof(affineCheck))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := msmSize * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
e := Commit(out_d, scalars_d, points_d, msmSize, 10)
if e != 0 {
panic("Error occurred")
}
}
})
}
}
func TestBatchMSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GeneratePoints(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_381 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}
func BenchmarkMSM(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G1ProjectivePoint)
_, e := Msm(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
// G2
func GenerateG2Points(count int) []G2PointAffine {
// Declare a slice of G2 affine points
var points []G2PointAffine
// seed the slice with 10 random points; it is doubled below to reach count
for i := 0; i < 10; i++ {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var p G2Point
p.Random()
var affine G2PointAffine
affine.FromProjective(&p)
points = append(points, affine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func TestMsmG2BLS12_381(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func BenchmarkMsmG2BLS12_381(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GenerateG2Points(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
func TestCommitG2MSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
var sizeCheckG2PointAffine G2PointAffine
inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
var sizeCheckG2Point G2Point
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := CommitG2(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G2Point, 1)
goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
assert.Equal(t, e, 0, "error should be 0")
assert.Equal(t, len(outHost), 1)
result := outHost[0]
assert.True(t, result.IsOnCurve())
}
}
func TestBatchG2MSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GenerateG2Points(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_381 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}

View File

@@ -0,0 +1,222 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"errors"
"fmt"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "ntt.h"
import "C"
const (
NONE = 0
DIF = 1
DIT = 2
)
func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
ret := C.ntt_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
return uint64(ret)
}
func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
isInverseC := C.bool(isInverse)
batchSizeC := C.uint32_t(batchSize)
deviceIdC := C.size_t(deviceId)
ret := C.ntt_batch_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
valuesC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
ret := C.ecntt_cuda_bls12_381(valuesC, n, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
valuesC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
batchSizeC := C.uint32_t(batchSize)
ret := C.ecntt_batch_cuda_bls12_381(valuesC, n, batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
domain_size := C.uint32_t(d_size)
logn := C.uint32_t(log_d_size)
is_inverse := C.bool(inverse)
dp := C.build_domain_cuda_bls12_381(domain_size, logn, is_inverse, 0, 0)
if dp == nil {
err = errors.New("nullptr returned from generating twiddles")
return unsafe.Pointer(nil), err
}
return unsafe.Pointer(dp), nil
}
// Reverses d_scalars in-place
func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
lenC := C.int(len)
if success := C.reverse_order_scalars_cuda_bls12_381(scalarsC, lenC, 0, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
size_d := size * 32
dp, err := goicicle.CudaMalloc(size_d)
if err != nil {
return nil
}
d_out := (*C.BLS12_381_scalar_t)(dp)
scalarsC := (*C.BLS12_381_scalar_t)(scalars)
twiddlesC := (*C.BLS12_381_scalar_t)(twiddles)
cosetPowersC := (*C.BLS12_381_scalar_t)(cosetPowers)
sizeC := C.uint(size)
var ret C.int
if isCoset {
ret = C.interpolate_scalars_on_coset_cuda_bls12_381(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
} else {
ret = C.interpolate_scalars_cuda_bls12_381(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
}
return unsafe.Pointer(d_out)
}
func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
scalars_outC := (*C.BLS12_381_scalar_t)(scalars_out)
scalarsC := (*C.BLS12_381_scalar_t)(scalars)
twiddlesC := (*C.BLS12_381_scalar_t)(twiddles)
coset_powersC := (*C.BLS12_381_scalar_t)(coset_powers)
sizeC := C.uint(scalars_size)
twiddlesC_size := C.uint(twiddles_size)
var ret C.int
if isCoset {
ret = C.evaluate_scalars_on_coset_cuda_bls12_381(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
} else {
ret = C.evaluate_scalars_cuda_bls12_381(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
return -1
}
return 0
}
func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_381_scalar_t)(in1_d)
in2_dC := (*C.BLS12_381_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.add_scalars_cuda_bls12_381(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error adding scalar vectors")
return -1
}
return 0
}
func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_381_scalar_t)(in1_d)
in2_dC := (*C.BLS12_381_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.sub_scalars_cuda_bls12_381(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error subtracting scalar vectors")
return -1
}
return 0
}
func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.to_montgomery_scalars_cuda_bls12_381(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_381_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.from_montgomery_scalars_cuda_bls12_381(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_381_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_cuda_bls12_381(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_381_g2_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_g2_cuda_bls12_381(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
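Note on usage: the forward and inverse transforms share a single entry point, selected by the isInverse flag, and run in place on the host slice. A minimal round-trip sketch, illustrative only and not part of the generated bindings; it assumes a working CUDA device 0 and that this package mirrors the Random/Eq helpers shown for bn254 later in this diff:

// Sketch: forward NTT followed by inverse NTT restores the input.
scalars := make([]G1ScalarField, 1<<10)
for i := range scalars {
	scalars[i].Random() // assumed helper, mirroring the bn254 bindings
}
original := make([]G1ScalarField, len(scalars))
copy(original, scalars)
Ntt(&scalars, false, 0) // forward transform, in place on device 0
Ntt(&scalars, true, 0)  // inverse transform, in place
// scalars should now equal original, element by element (compare with Eq)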


@@ -0,0 +1,148 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"fmt"
"github.com/stretchr/testify/assert"
"reflect"
"testing"
)
func TestNttBLS12_381Batch(t *testing.T) {
count := 1 << 20
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
NttBatch(&nttResult, false, count, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBLS12_381CompareToGnarkDIF(t *testing.T) {
count := 1 << 2
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestINttBLS12_381CompareToGnarkDIT(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, true, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBLS12_381(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
inttResult := make([]G1ScalarField, len(nttResult))
copy(inttResult, nttResult)
assert.Equal(t, inttResult, nttResult)
Ntt(&inttResult, true, 0)
assert.Equal(t, inttResult, scalars)
}
func TestNttBatchBLS12_381(t *testing.T) {
count := 1 << 5
batches := 4
scalars := GenerateScalars(count*batches, false)
var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
for i := 0; i < batches; i++ {
start := i * count
end := (i + 1) * count
batch := make([]G1ScalarField, len(scalars[start:end]))
copy(batch, scalars[start:end])
scalarVecOfVec = append(scalarVecOfVec, batch)
}
nttBatchResult := make([]G1ScalarField, len(scalars))
copy(nttBatchResult, scalars)
NttBatch(&nttBatchResult, false, count, 0)
var nttResultVecOfVec [][]G1ScalarField
for i := 0; i < batches; i++ {
// Clone the slice
clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
copy(clone, scalarVecOfVec[i])
// Add it to the result vector of vectors
nttResultVecOfVec = append(nttResultVecOfVec, clone)
// Call the ntt_bls12_381 function
Ntt(&nttResultVecOfVec[i], false, 0)
}
assert.NotEqual(t, nttBatchResult, scalars)
// Check that the NTT of each batch element matches the corresponding slice of the batched NTT result
for i := 0; i < batches; i++ {
if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
}
}
}
func BenchmarkNTT(b *testing.B) {
LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
for _, logNTTSize := range LOG_NTT_SIZES {
nttSize := 1 << logNTTSize
b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
scalars := GenerateScalars(nttSize, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
for n := 0; n < b.N; n++ {
Ntt(&nttResult, false, 0)
}
})
}
}


@@ -0,0 +1,38 @@
package bls12381
import "encoding/binary"
// Convert [8]uint32 to [4]uint64, packing arr32[i] as the high word and arr32[i+1] as the low word
func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
var arr64 [4]uint64
for i := 0; i < len(arr32); i += 2 {
arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1])
}
return arr64
}
// Convert [4]uint64 to [8]uint32, emitting the low 32-bit word of each uint64 first
func ConvertUint64ArrToUint32Arr4(arr64 [4]uint64) [8]uint32 {
var arr32 [8]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
// Convert [6]uint64 to [12]uint32, emitting the low 32-bit word of each uint64 first
func ConvertUint64ArrToUint32Arr6(arr64 [6]uint64) [12]uint32 {
var arr32 [12]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
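Note the word-order asymmetry: ConvertUint32ArrToUint64Arr packs each even-indexed uint32 into the high 32 bits of a uint64, while the unpacking helpers emit the low word first. Composing pack and unpack is therefore not an identity, as this small sketch (consistent with the tests below) shows:

// Sketch: the pack and unpack helpers use opposite word orders.
in := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
packed := ConvertUint32ArrToUint64Arr(in)    // packed[0] == 1<<32|2 == 4294967298
back := ConvertUint64ArrToUint32Arr4(packed) // back == [8]uint32{2, 1, 4, 3, 6, 5, 8, 7}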


@@ -0,0 +1,81 @@
package bls12381
import (
"testing"
)
func TestConvertUint32ArrToUint64Arr(t *testing.T) {
testCases := []struct {
name string
input [8]uint32
want [4]uint64
}{
{
name: "Test with incremental array",
input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8},
want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080},
},
{
name: "Test with all zeros",
input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0},
want: [4]uint64{0, 0, 0, 0},
},
{
name: "Test with maximum uint32 values",
input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295},
want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615},
},
{
name: "Test with alternating min and max uint32 values",
input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295},
want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295},
},
{
name: "Test with alternating max and min uint32 values",
input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0},
want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := ConvertUint32ArrToUint64Arr(tc.input)
if got != tc.want {
t.Errorf("got %v, want %v", got, tc.want)
}
})
}
}
func TestConvertUint64ArrToUint32Arr(t *testing.T) {
testCases := []struct {
name string
input [6]uint64
expected [12]uint32
}{
{
name: "test one",
input: [6]uint64{1, 2, 3, 4, 5, 6},
expected: [12]uint32{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0},
},
{
name: "test two",
input: [6]uint64{100, 200, 300, 400, 500, 600},
expected: [12]uint32{100, 0, 200, 0, 300, 0, 400, 0, 500, 0, 600, 0},
},
{
name: "test three",
input: [6]uint64{1000, 2000, 3000, 4000, 5000, 6000},
expected: [12]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0, 5000, 0, 6000, 0},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := ConvertUint64ArrToUint32Arr6(tc.input)
if got != tc.expected {
t.Errorf("got %v, want %v", got, tc.expected)
}
})
}
}


@@ -0,0 +1,42 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "ve_mod_mult.h"
import "C"
import (
"fmt"
"unsafe"
)
func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
scalarVec1C := (*C.BLS12_381_scalar_t)(scalarVec1)
scalarVec2C := (*C.BLS12_381_scalar_t)(scalarVec2)
sizeC := C.size_t(size)
ret := C.vec_mod_mult_device_scalar_bls12_381(scalarVec1C, scalarVec2C, sizeC, 0)
if ret != 0 {
fmt.Print("error multiplying scalar vectors")
return -1
}
return 0
}
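VecScalarMulMod multiplies two device-resident scalar vectors element-wise, writing the product over the first operand (the underlying C routine takes an inout pointer). A staging sketch, assuming the goicicle.CudaMalloc/CudaMemCpy helpers used in the tests later in this diff, the 32-byte scalar size, and the GenerateScalars test helper:

// Sketch: stage host scalars on the device, multiply, and copy the result back.
size := 1 << 10
a := GenerateScalars(size, false) // test helper from this package's _test files
b := GenerateScalars(size, false)
numBytes := size * 32 // 8 uint32 limbs per scalar
a_d, _ := goicicle.CudaMalloc(numBytes)
b_d, _ := goicicle.CudaMalloc(numBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](a_d, a, numBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](b_d, b, numBytes)
if VecScalarMulMod(a_d, b_d, size) != 0 {
	// handle device error
}
out := make([]G1ScalarField, size)
goicicle.CudaMemCpyDtoH[G1ScalarField](out, a_d, numBytes) // product overwrote a_d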

goicicle/curves/bn254/g1.go Normal file

@@ -0,0 +1,328 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"unsafe"
"encoding/binary"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
const SCALAR_SIZE = 8
const BASE_SIZE = 8
type G1ScalarField struct {
S [SCALAR_SIZE]uint32
}
type G1BaseField struct {
S [BASE_SIZE]uint32
}
/*
* BaseField Constructors
*/
func (f *G1BaseField) SetZero() *G1BaseField {
var S [BASE_SIZE]uint32
f.S = S
return f
}
func (f *G1BaseField) SetOne() *G1BaseField {
var S [BASE_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
out := (*C.BN254_projective_t)(unsafe.Pointer(p))
in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bn254(out, in)
return p
}
func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
copy(f.S[:], limbs[:])
return f
}
/*
* BaseField methods
*/
func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
return f.S
}
func (f *G1BaseField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* ScalarField constructors
*/
func (p *G1ScalarField) Random() *G1ScalarField {
outC := (*C.BN254_scalar_t)(unsafe.Pointer(p))
C.random_scalar_bn254(outC)
return p
}
func (f *G1ScalarField) SetZero() *G1ScalarField {
var S [SCALAR_SIZE]uint32
f.S = S
return f
}
func (f *G1ScalarField) SetOne() *G1ScalarField {
var S [SCALAR_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
for i, v := range a.S {
if b.S[i] != v {
return false
}
}
return true
}
/*
* ScalarField methods
*/
func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
return f.S
}
func (f *G1ScalarField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* PointBN254
*/
type G1ProjectivePoint struct {
X, Y, Z G1BaseField
}
func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
var yOne G1BaseField
yOne.SetOne()
var xZero G1BaseField
xZero.SetZero()
var zZero G1BaseField
zZero.SetZero()
f.X = xZero
f.Y = yOne
f.Z = zZero
return f
}
func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
// Cast *G1ProjectivePoint to *C.BN254_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BN254_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BN254_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_bn254(pC, pCompareC))
}
func (p *G1ProjectivePoint) IsOnCurve() bool {
point := (*C.BN254_projective_t)(unsafe.Pointer(p))
res := C.projective_is_on_curve_bn254(point)
return bool(res)
}
func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
outC := (*C.BN254_projective_t)(unsafe.Pointer(p))
C.random_projective_bn254(outC)
return p
}
func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
return &G1PointAffine{
X: p.X,
Y: p.Y,
}
}
func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
var _x G1BaseField
var _y G1BaseField
var _z G1BaseField
_x.FromLimbs(GetFixedLimbs(x))
_y.FromLimbs(GetFixedLimbs(y))
_z.FromLimbs(GetFixedLimbs(z))
p.X = _x
p.Y = _y
p.Z = _z
return p
}
/*
* PointAffineNoInfinityBN254
*/
type G1PointAffine struct {
X, Y G1BaseField
}
func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
in := (*C.BN254_projective_t)(unsafe.Pointer(projective))
out := (*C.BN254_affine_t)(unsafe.Pointer(p))
C.projective_to_affine_bn254(out, in)
return p
}
func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
var Z G1BaseField
Z.SetOne()
return &G1ProjectivePoint{
X: p.X,
Y: p.Y,
Z: Z,
}
}
func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
var _x G1BaseField
var _y G1BaseField
_x.FromLimbs(GetFixedLimbs(X))
_y.FromLimbs(GetFixedLimbs(Y))
p.X = _x
p.Y = _y
return p
}
/*
* Multiplication
*/
func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
pointsC := (*C.BN254_projective_t)(unsafe.Pointer(&a[0]))
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_point_bn254(pointsC, scalarsC, nElementsC, deviceIdC)
}
func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_scalar_bn254(aC, bC, nElementsC, deviceIdC)
}
// Multiply a flattened matrix by a vector:
//
// `a` - flattened matrix;
// `b` - vector to multiply `a` by;
// returns the resulting vector.
func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) []G1ScalarField {
c := make([]G1ScalarField, len(b))
for i := range c {
var p G1ScalarField
p.SetZero()
c[i] = p
}
aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0]))
cC := (*C.BN254_scalar_t)(unsafe.Pointer(&c[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.matrix_vec_mod_mult_bn254(aC, bC, cC, nElementsC, deviceIdC)
return c
}
/*
* Utils
*/
func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
if len(*slice) <= BASE_SIZE {
limbs := [BASE_SIZE]uint32{}
copy(limbs[:len(*slice)], *slice)
return limbs
}
panic("slice has too many elements")
}
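The hard-coded byte counts in the CUDA copies used by the tests below follow from these layouts: a scalar or base-field element is 8 uint32 limbs (32 bytes), an affine point is two base-field elements (64 bytes), and a projective point is three (96 bytes). A quick sanity-check sketch:

// Sketch: sizes assumed by CudaMalloc(96), count*64 and count*32 in the tests.
var s G1ScalarField
var a G1PointAffine
var p G1ProjectivePoint
fmt.Println(unsafe.Sizeof(s), unsafe.Sizeof(a), unsafe.Sizeof(p)) // 32 64 96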


@@ -0,0 +1,198 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"encoding/binary"
"testing"
"github.com/stretchr/testify/assert"
)
func TestNewFieldBN254One(t *testing.T) {
var oneField G1BaseField
oneField.SetOne()
rawOneField := [8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
assert.Equal(t, oneField.S, rawOneField)
}
func TestNewFieldBN254Zero(t *testing.T) {
var zeroField G1BaseField
zeroField.SetZero()
rawZeroField := [8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
assert.Equal(t, zeroField.S, rawZeroField)
}
func TestFieldBN254ToBytesLe(t *testing.T) {
var p G1ProjectivePoint
p.Random()
expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
for i, v := range p.X.S {
binary.LittleEndian.PutUint32(expected[i*4:], v)
}
assert.Equal(t, p.X.ToBytesLe(), expected)
assert.Equal(t, len(p.X.ToBytesLe()), 32)
}
func TestNewPointBN254Zero(t *testing.T) {
var pointZero G1ProjectivePoint
pointZero.SetZero()
var baseOne G1BaseField
baseOne.SetOne()
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, pointZero.X, zeroSanity)
assert.Equal(t, pointZero.Y, baseOne)
assert.Equal(t, pointZero.Z, zeroSanity)
}
func TestFromProjectiveToAffine(t *testing.T) {
var projective G1ProjectivePoint
var affine G1PointAffine
projective.Random()
affine.FromProjective(&projective)
var projective2 G1ProjectivePoint
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestBN254Eq(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
var p2 G1ProjectivePoint
p2.Random()
assert.Equal(t, p1.Eq(&p1), true)
assert.Equal(t, p1.Eq(&p2), false)
}
func TestBN254StripZ(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
p2ZLess := p1.StripZ()
assert.IsType(t, G1PointAffine{}, *p2ZLess)
assert.Equal(t, p1.X, p2ZLess.X)
assert.Equal(t, p1.Y, p2ZLess.Y)
}
func TestPointBN254fromLimbs(t *testing.T) {
var p G1ProjectivePoint
p.Random()
x := p.X.Limbs()
y := p.Y.Limbs()
z := p.Z.Limbs()
xSlice := x[:]
ySlice := y[:]
zSlice := z[:]
var pFromLimbs G1ProjectivePoint
pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
assert.Equal(t, pFromLimbs, p)
}
func TestNewPointAffineNoInfinityBN254Zero(t *testing.T) {
var zeroP G1PointAffine
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, zeroP.X, zeroSanity)
assert.Equal(t, zeroP.Y, zeroSanity)
}
func TestPointAffineNoInfinityBN254FromLimbs(t *testing.T) {
// Initialize your test values
x := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
y := [8]uint32{9, 10, 11, 12, 13, 14, 15, 16}
xSlice := x[:]
ySlice := y[:]
// Execute your function
var result G1PointAffine
result.FromLimbs(&xSlice, &ySlice)
var xBase G1BaseField
var yBase G1BaseField
xBase.FromLimbs(x)
yBase.FromLimbs(y)
// Define your expected result
expected := G1PointAffine{
X: xBase,
Y: yBase,
}
// Test if result is as expected
assert.Equal(t, expected, result)
}
func TestGetFixedLimbs(t *testing.T) {
t.Run("case of valid input of length less than 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7}
expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of valid input of length 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8}
expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of empty input", func(t *testing.T) {
slice := []uint32{}
expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of input length greater than 8", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9}
defer func() {
if r := recover(); r == nil {
t.Errorf("the code did not panic")
}
}()
GetFixedLimbs(&slice)
})
}

goicicle/curves/bn254/g2.go Normal file

@@ -0,0 +1,102 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"encoding/binary"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
// G2 extension field
type G2Element [4]uint64
type ExtentionField struct {
A0, A1 G2Element
}
type G2PointAffine struct {
X, Y ExtentionField
}
type G2Point struct {
X, Y, Z ExtentionField
}
func (p *G2Point) Random() *G2Point {
outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
C.random_g2_projective_bn254(outC)
return p
}
func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
out := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
in := (*C.BN254_g2_affine_t)(unsafe.Pointer(affine))
C.g2_projective_from_affine_bn254(out, in)
return p
}
func (p *G2Point) Eq(pCompare *G2Point) bool {
// Cast *G2Point to *C.BN254_g2_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BN254_g2_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_g2_bn254(pC, pCompareC))
}
func (f *G2Element) ToBytesLe() []byte {
var bytes []byte
for _, val := range f {
buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
binary.LittleEndian.PutUint64(buf, val)
bytes = append(bytes, buf...)
}
return bytes
}
func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
out := (*C.BN254_g2_affine_t)(unsafe.Pointer(p))
in := (*C.BN254_g2_projective_t)(unsafe.Pointer(projective))
C.g2_projective_to_affine_bn254(out, in)
return p
}
func (p *G2Point) IsOnCurve() bool {
// Directly copy memory from the C struct to the Go struct
point := (*C.BN254_g2_projective_t)(unsafe.Pointer(p))
res := C.g2_projective_is_on_curve_bn254(point)
return bool(res)
}
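G2 coordinates are extension-field elements built from 4-word uint64 limbs, so every G2 type is twice the width of its G1 counterpart: 64 bytes per ExtentionField, 128 per affine point, 192 per projective point. This is why the G2 tests later in this diff size device buffers with unsafe.Sizeof instead of hard-coded constants. A sketch:

// Sketch: G2 struct sizes implied by the definitions above.
var e ExtentionField
var aff G2PointAffine
var proj G2Point
fmt.Println(unsafe.Sizeof(e), unsafe.Sizeof(aff), unsafe.Sizeof(proj)) // 64 128 192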


@@ -0,0 +1,79 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestG2Eqg2(t *testing.T) {
var point G2Point
point.Random()
assert.True(t, point.Eq(&point))
}
func TestG2FromProjectiveToAffine(t *testing.T) {
var projective G2Point
projective.Random()
var affine G2PointAffine
affine.FromProjective(&projective)
var projective2 G2Point
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestG2Eqg2NotEqual(t *testing.T) {
var point G2Point
point.Random()
var point2 G2Point
point2.Random()
assert.False(t, point.Eq(&point2))
}
func TestG2ToBytes(t *testing.T) {
element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
bytes := element.ToBytesLe()
assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1})
}
func TestG2ShouldConvertToProjective(t *testing.T) {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var pointProjective G2Point
pointProjective.Random()
var pointAffine G2PointAffine
pointAffine.FromProjective(&pointProjective)
var proj G2Point
proj.FromAffine(&pointAffine)
assert.True(t, proj.IsOnCurve())
assert.True(t, pointProjective.Eq(&proj))
}


@@ -0,0 +1,94 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdbool.h>
// msm.h
#ifndef _BN254_MSM_H
#define _BN254_MSM_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BN254 projective and affine structs
typedef struct BN254_projective_t BN254_projective_t;
typedef struct BN254_g2_projective_t BN254_g2_projective_t;
typedef struct BN254_affine_t BN254_affine_t;
typedef struct BN254_g2_affine_t BN254_g2_affine_t;
typedef struct BN254_scalar_t BN254_scalar_t;
typedef cudaStream_t CudaStream_t;
int msm_cuda_bn254(
BN254_projective_t* out, BN254_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
int msm_batch_cuda_bn254(
BN254_projective_t* out,
BN254_affine_t* points,
BN254_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_cuda_bn254(
BN254_projective_t* d_out,
BN254_scalar_t* d_scalars,
BN254_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_cuda_bn254(
BN254_projective_t* d_out,
BN254_scalar_t* d_scalars,
BN254_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id);
int msm_g2_cuda_bn254(
BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id);
int msm_batch_g2_cuda_bn254(
BN254_g2_projective_t* out,
BN254_g2_affine_t* points,
BN254_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_g2_cuda_bn254(
BN254_g2_projective_t* d_out,
BN254_scalar_t* d_scalars,
BN254_g2_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_g2_cuda_bn254(
BN254_g2_projective_t* d_out,
BN254_scalar_t* d_scalars,
BN254_g2_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id,
cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BN254_MSM_H */


@@ -0,0 +1,193 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ntt.h
#ifndef _BN254_NTT_H
#define _BN254_NTT_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BN254 projective and affine structs
typedef struct BN254_projective_t BN254_projective_t;
typedef struct BN254_affine_t BN254_affine_t;
typedef struct BN254_scalar_t BN254_scalar_t;
typedef struct BN254_g2_projective_t BN254_g2_projective_t;
typedef struct BN254_g2_affine_t BN254_g2_affine_t;
int ntt_cuda_bn254(BN254_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
int ntt_batch_cuda_bn254(BN254_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
int ecntt_cuda_bn254(BN254_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
int ecntt_batch_cuda_bn254(
BN254_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
BN254_scalar_t*
build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
int interpolate_scalars_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
unsigned device_id,
size_t stream);
int interpolate_scalars_batch_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_points_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
size_t device_id,
size_t stream);
int interpolate_points_batch_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_scalars_on_coset_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
BN254_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int interpolate_scalars_batch_on_coset_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_evaluations,
BN254_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
BN254_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_scalars_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned device_id,
size_t stream);
int evaluate_scalars_batch_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_points_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
size_t device_id,
size_t stream);
int evaluate_points_batch_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_scalars_on_coset_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BN254_scalar_t* coset_powers,
unsigned device_id,
size_t stream);
int evaluate_scalars_on_coset_batch_cuda_bn254(
BN254_scalar_t* d_out,
BN254_scalar_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BN254_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BN254_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_batch_cuda_bn254(
BN254_projective_t* d_out,
BN254_projective_t* d_coefficients,
BN254_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BN254_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_points_batch_cuda_bn254(
BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int add_scalars_cuda_bn254(
BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
int sub_scalars_cuda_bn254(
BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream);
int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);
int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream);
// points g1
int to_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_cuda_bn254(BN254_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_cuda_bn254(BN254_affine_t* d_inout, unsigned n, size_t stream);
// points g2
int to_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_g2_cuda_bn254(BN254_g2_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_g2_cuda_bn254(BN254_g2_affine_t* d_inout, unsigned n, size_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BN254_NTT_H */


@@ -0,0 +1,50 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// projective.h
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BN254_projective_t BN254_projective_t;
typedef struct BN254_g2_projective_t BN254_g2_projective_t;
typedef struct BN254_affine_t BN254_affine_t;
typedef struct BN254_g2_affine_t BN254_g2_affine_t;
typedef struct BN254_scalar_t BN254_scalar_t;
bool projective_is_on_curve_bn254(BN254_projective_t* point1);
int random_scalar_bn254(BN254_scalar_t* out);
int random_projective_bn254(BN254_projective_t* out);
BN254_projective_t* projective_zero_bn254();
int projective_to_affine_bn254(BN254_affine_t* out, BN254_projective_t* point1);
int projective_from_affine_bn254(BN254_projective_t* out, BN254_affine_t* point1);
int random_g2_projective_bn254(BN254_g2_projective_t* out);
int g2_projective_to_affine_bn254(BN254_g2_affine_t* out, BN254_g2_projective_t* point1);
int g2_projective_from_affine_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* point1);
bool g2_projective_is_on_curve_bn254(BN254_g2_projective_t* point1);
bool eq_bn254(BN254_projective_t* point1, BN254_projective_t* point2);
bool eq_g2_bn254(BN254_g2_projective_t* point1, BN254_g2_projective_t* point2);
#ifdef __cplusplus
}
#endif


@@ -0,0 +1,45 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ve_mod_mult.h
#ifndef _BN254_VEC_MULT_H
#define _BN254_VEC_MULT_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BN254_projective_t BN254_projective_t;
typedef struct BN254_scalar_t BN254_scalar_t;
int32_t
vec_mod_mult_point_bn254(BN254_projective_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t
vec_mod_mult_scalar_bn254(BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_device_scalar_bn254(
BN254_scalar_t* inout, BN254_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t matrix_vec_mod_mult_bn254(
BN254_scalar_t* matrix_flattened, BN254_scalar_t* input, BN254_scalar_t* output, size_t n_elements, size_t device_id);
#ifdef __cplusplus
}
#endif
#endif /* _BN254_VEC_MULT_H */


@@ -0,0 +1,209 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"errors"
"fmt"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
// #include "msm.h"
import "C"
func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BN254_projective_t)(unsafe.Pointer(out))
ret := C.msm_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_cuda_bn254 returned error code: %d", ret)
}
return out, nil
}
func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(out))
ret := C.msm_g2_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_g2_cuda_bn254 returned error code: %d", ret)
}
return out, nil
}
func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G1ProjectivePoint, batchSize)
for i := 0; i < len(out); i++ {
var p G1ProjectivePoint
p.SetZero()
out[i] = p
}
outC := (*C.BN254_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret)
}
return out, nil
}
func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G2Point, batchSize)
outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_g2_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret)
}
return out, nil
}
func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BN254_projective_t)(d_out)
scalarsC := (*C.BN254_scalar_t)(d_scalars)
pointsC := (*C.BN254_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_cuda_bn254(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BN254_g2_projective_t)(d_out)
scalarsC := (*C.BN254_scalar_t)(d_scalars)
pointsC := (*C.BN254_g2_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_g2_cuda_bn254(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BN254_projective_t)(d_out)
scalarsC := (*C.BN254_scalar_t)(d_scalars)
pointsC := (*C.BN254_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
ret := C.commit_batch_cuda_bn254(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BN254_g2_projective_t)(d_out)
scalarsC := (*C.BN254_scalar_t)(d_scalars)
pointsC := (*C.BN254_g2_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
ret := C.commit_batch_g2_cuda_bn254(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0, nil)
if ret != 0 {
return -1
}
return 0
}
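MsmBatch expects flattened inputs: batchSize independent MSMs of size len(points)/batchSize laid out back to back, and it returns one projective point per batch. A usage sketch with hypothetical sizes; GeneratePoints and GenerateScalars are the helpers from the test file below:

// Sketch: two MSMs of size 1<<10 in a single batched call on device 0.
batchSize := 2
msmSize := 1 << 10
points := GeneratePoints(batchSize * msmSize) // [msm 0 | msm 1]
scalars := GenerateScalars(batchSize*msmSize, false)
results, err := MsmBatch(&points, &scalars, batchSize, 0)
if err != nil {
	// handle CUDA/runtime failure
}
_ = results // results[i] is the MSM of points[i*msmSize:(i+1)*msmSize]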


@@ -0,0 +1,360 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"fmt"
"math"
"testing"
"time"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
"github.com/stretchr/testify/assert"
)
func GeneratePoints(count int) []G1PointAffine {
// Declare a slice of affine points
var points []G1PointAffine
// populate the slice
for i := 0; i < 10; i++ {
var pointProjective G1ProjectivePoint
pointProjective.Random()
var pointAffine G1PointAffine
pointAffine.FromProjective(&pointProjective)
points = append(points, pointAffine)
}
// Double the 10 random base points until there are at least `count` entries, then truncate
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func GeneratePointsProj(count int) []G1ProjectivePoint {
// Declare a slice of integers
var points []G1ProjectivePoint
// Use a loop to populate the slice
for i := 0; i < count; i++ {
var p G1ProjectivePoint
p.Random()
points = append(points, p)
}
return points
}
func GenerateScalars(count int, skewed bool) []G1ScalarField {
// Declare a slice of scalars
var scalars []G1ScalarField
var rand G1ScalarField
var zero G1ScalarField
var one G1ScalarField
var randLarge G1ScalarField
zero.SetZero()
one.SetOne()
randLarge.Random()
if skewed && count > 1_200_000 {
for i := 0; i < count-1_200_000; i++ {
rand.Random()
scalars = append(scalars, rand)
}
for i := 0; i < 600_000; i++ {
scalars = append(scalars, randLarge)
}
for i := 0; i < 400_000; i++ {
scalars = append(scalars, zero)
}
for i := 0; i < 200_000; i++ {
scalars = append(scalars, one)
}
} else {
for i := 0; i < count; i++ {
rand.Random()
scalars = append(scalars, rand)
}
}
return scalars[:count]
}
func TestMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G1ProjectivePoint)
startTime := time.Now()
_, e := Msm(out, points, scalars, 0) // non mont
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func TestCommitMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1<<v - 1
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out_d, _ := goicicle.CudaMalloc(96)
pointsBytes := count * 64
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := Commit(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G1ProjectivePoint, 1)
goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, 96)
assert.Equal(t, e, 0, "error should be 0")
assert.True(t, outHost[0].IsOnCurve())
}
}
func BenchmarkCommit(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
out_d, _ := goicicle.CudaMalloc(96)
pointsBytes := msmSize * 64
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := msmSize * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
e := Commit(out_d, scalars_d, points_d, msmSize, 10)
if e != 0 {
panic("Error occurred")
}
}
})
}
}
func TestBatchMSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GeneratePoints(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBN254 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}
func BenchmarkMSM(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G1ProjectivePoint)
_, e := Msm(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
// G2
func GenerateG2Points(count int) []G2PointAffine {
// Declare a slice of affine G2 points
var points []G2PointAffine
// populate the slice
for i := 0; i < 10; i++ {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var p G2Point
p.Random()
var affine G2PointAffine
affine.FromProjective(&p)
points = append(points, affine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func TestMsmG2BN254(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func BenchmarkMsmG2BN254(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GenerateG2Points(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
func TestCommitG2MSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
var sizeCheckG2PointAffine G2PointAffine
inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
var sizeCheckG2Point G2Point
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := CommitG2(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G2Point, 1)
goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
assert.Equal(t, e, 0, "error should be 0")
assert.Equal(t, len(outHost), 1)
result := outHost[0]
assert.True(t, result.IsOnCurve())
}
}
func TestBatchG2MSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GenerateG2Points(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBN254 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}


@@ -0,0 +1,222 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"errors"
"fmt"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
// #include "ntt.h"
import "C"
const (
NONE = 0
DIF = 1
DIT = 2
)
func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
ret := C.ntt_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
return uint64(ret)
}
func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
isInverseC := C.bool(isInverse)
batchSizeC := C.uint32_t(batchSize)
deviceIdC := C.size_t(deviceId)
ret := C.ntt_batch_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
ret := C.ecntt_cuda_bn254(valuesC, n, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
batchSizeC := C.uint32_t(batchSize)
ret := C.ecntt_batch_cuda_bn254(valuesC, n, batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
domain_size := C.uint32_t(d_size)
logn := C.uint32_t(log_d_size)
is_inverse := C.bool(inverse)
dp := C.build_domain_cuda_bn254(domain_size, logn, is_inverse, 0, 0)
if dp == nil {
err = errors.New("nullptr returned from generating twiddles")
return unsafe.Pointer(nil), err
}
return unsafe.Pointer(dp), nil
}
// Reverses d_scalars in-place
func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BN254_scalar_t)(d_scalars)
lenC := C.int(len)
if success := C.reverse_order_scalars_cuda_bn254(scalarsC, lenC, 0, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
size_d := size * 32
dp, err := goicicle.CudaMalloc(size_d)
if err != nil {
return nil
}
d_out := (*C.BN254_scalar_t)(dp)
scalarsC := (*C.BN254_scalar_t)(scalars)
twiddlesC := (*C.BN254_scalar_t)(twiddles)
cosetPowersC := (*C.BN254_scalar_t)(cosetPowers)
sizeC := C.uint(size)
var ret C.int
if isCoset {
ret = C.interpolate_scalars_on_coset_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
} else {
ret = C.interpolate_scalars_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
}
return unsafe.Pointer(d_out)
}
func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
scalars_outC := (*C.BN254_scalar_t)(scalars_out)
scalarsC := (*C.BN254_scalar_t)(scalars)
twiddlesC := (*C.BN254_scalar_t)(twiddles)
coset_powersC := (*C.BN254_scalar_t)(coset_powers)
sizeC := C.uint(scalars_size)
twiddlesC_size := C.uint(twiddles_size)
var ret C.int
if isCoset {
ret = C.evaluate_scalars_on_coset_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
} else {
ret = C.evaluate_scalars_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
return -1
}
return 0
}
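// Illustrative sketch of an interpolate/evaluate round trip over device
// memory, using only the bindings defined above. dEvals must point to `size`
// BN254 scalars already resident on the device; logSize is log2(size). The
// pairing of inverse twiddles with interpolation and forward twiddles with
// evaluation, and the twiddle count matching the domain size, are assumptions
// here, mirroring the usual NTT convention.
func interpolateEvaluateSketch(dEvals unsafe.Pointer, size, logSize int) error {
invTwiddles, err := GenerateTwiddles(size, logSize, true)
if err != nil {
return err
}
// Interpolate returns a freshly allocated device buffer of coefficients,
// or nil on failure.
dCoeffs := Interpolate(dEvals, invTwiddles, nil, size, false)
if dCoeffs == nil {
return errors.New("interpolation failed")
}
fwdTwiddles, err := GenerateTwiddles(size, logSize, false)
if err != nil {
return err
}
// Evaluate writes the evaluations back into dEvals; non-zero means failure.
if Evaluate(dEvals, dCoeffs, fwdTwiddles, nil, size, size, false) != 0 {
return errors.New("evaluation failed")
}
return nil
}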
func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BN254_scalar_t)(in1_d)
in2_dC := (*C.BN254_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.add_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error adding scalar vectors")
return -1
}
return 0
}
func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BN254_scalar_t)(in1_d)
in2_dC := (*C.BN254_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.sub_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error subtracting scalar vectors")
return -1
}
return 0
}
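// Usage sketch: the vector ops above mutate their first device buffer in
// place, so adding and then subtracting the same vector restores in1.
func addSubRoundTripSketch(in1, in2 unsafe.Pointer, size int) {
VecScalarAdd(in1, in2, size) // in1 <- in1 + in2
VecScalarSub(in1, in2, size) // in1 <- in1 - in2, back to the original
}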
func ToMontgomery(d_scalars unsafe.Pointer, size int) (int, error) {
scalarsC := (*C.BN254_scalar_t)(d_scalars)
lenC := C.uint(size)
if success := C.to_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("conversion to Montgomery form failed")
}
return 0, nil
}
func FromMontgomery(d_scalars unsafe.Pointer, size int) (int, error) {
scalarsC := (*C.BN254_scalar_t)(d_scalars)
lenC := C.uint(size)
if success := C.from_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("conversion from Montgomery form failed")
}
return 0, nil
}
func AffinePointFromMontgomery(d_points unsafe.Pointer, size int) (int, error) {
pointsC := (*C.BN254_affine_t)(d_points)
lenC := C.uint(size)
if success := C.from_montgomery_aff_points_cuda_bn254(pointsC, lenC, 0); success != 0 {
return -1, errors.New("conversion of affine points from Montgomery form failed")
}
return 0, nil
}
func G2AffinePointFromMontgomery(d_points unsafe.Pointer, size int) (int, error) {
pointsC := (*C.BN254_g2_affine_t)(d_points)
lenC := C.uint(size)
if success := C.from_montgomery_aff_points_g2_cuda_bn254(pointsC, lenC, 0); success != 0 {
return -1, errors.New("conversion of G2 affine points from Montgomery form failed")
}
return 0, nil
}
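// Usage sketch: the Montgomery helpers convert device-resident values in
// place, so a To/From pair leaves the data unchanged.
func montgomeryRoundTripSketch(dScalars unsafe.Pointer, size int) error {
if _, err := ToMontgomery(dScalars, size); err != nil {
return err
}
_, err := FromMontgomery(dScalars, size)
return err
}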


@@ -0,0 +1,148 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bn254
import (
"fmt"
"github.com/stretchr/testify/assert"
"reflect"
"testing"
)
func TestNttBN254Batch(t *testing.T) {
count := 1 << 20
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
NttBatch(&nttResult, false, count, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBN254CompareToGnarkDIF(t *testing.T) {
count := 1 << 2
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestINttBN254CompareToGnarkDIT(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, true, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBN254(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
inttResult := make([]G1ScalarField, len(nttResult))
copy(inttResult, nttResult)
assert.Equal(t, inttResult, nttResult)
Ntt(&inttResult, true, 0)
assert.Equal(t, inttResult, scalars)
}
func TestNttBatchBN254(t *testing.T) {
count := 1 << 5
batches := 4
scalars := GenerateScalars(count*batches, false)
scalarVecOfVec := make([][]G1ScalarField, 0, batches)
for i := 0; i < batches; i++ {
start := i * count
end := (i + 1) * count
batch := make([]G1ScalarField, len(scalars[start:end]))
copy(batch, scalars[start:end])
scalarVecOfVec = append(scalarVecOfVec, batch)
}
nttBatchResult := make([]G1ScalarField, len(scalars))
copy(nttBatchResult, scalars)
NttBatch(&nttBatchResult, false, count, 0)
var nttResultVecOfVec [][]G1ScalarField
for i := 0; i < batches; i++ {
// Clone the slice
clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
copy(clone, scalarVecOfVec[i])
// Add it to the result vector of vectors
nttResultVecOfVec = append(nttResultVecOfVec, clone)
// Call the ntt_bn254 function
Ntt(&nttResultVecOfVec[i], false, 0)
}
assert.NotEqual(t, nttBatchResult, scalars)
// Check that the NTT of each batch matches the corresponding slice of the batched NTT result
for i := 0; i < batches; i++ {
if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:(i+1)*count]) {
t.Errorf("ntt of batch %d does not match the batched ntt result", i)
}
}
}
func BenchmarkNTT(b *testing.B) {
LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
for _, logNTTSize := range LOG_NTT_SIZES {
nttSize := 1 << logNTTSize
b.Run(fmt.Sprintf("NTT 2^%d", logNTTSize), func(b *testing.B) {
scalars := GenerateScalars(nttSize, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
for n := 0; n < b.N; n++ {
Ntt(&nttResult, false, 0)
}
})
}
}


@@ -0,0 +1,48 @@
package bn254
import (
"encoding/binary"
"fmt"
"log"
"regexp"
"runtime"
"time"
)
// ConvertUint32ArrToUint64Arr converts an [8]uint32 into a [4]uint64, packing each pair of 32-bit words into one 64-bit word with the earlier uint32 in the high half.
func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
var arr64 [4]uint64
for i := 0; i < len(arr32); i += 2 {
arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1])
}
return arr64
}
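// Worked example: for input {1, 2, ...} the first pair packs to
// (1 << 32) | 2 = 4294967298, i.e. the earlier uint32 lands in the high half.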
// ConvertUint64ArrToUint32Arr converts a [4]uint64 into an [8]uint32, emitting the low 32-bit half of each word first (little-endian word order).
func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 {
var arr32 [8]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
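// Note that the two converters use opposite word orders: uint32 -> uint64
// packs the earlier word into the high half, while uint64 -> uint32 emits the
// low half first. As the tests in this package confirm, they are therefore
// not inverses of each other: {1, 0} packs to 1<<32, not back to 1.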
func TimeTrack(start time.Time) {
elapsed := time.Since(start)
// Skip TimeTrack's own frame and fetch the program counter of its caller.
pc, _, _, _ := runtime.Caller(1)
// Retrieve the function object for the caller.
funcObj := runtime.FuncForPC(pc)
// Regex to extract just the function name (and not the module path).
runtimeFunc := regexp.MustCompile(`^.*\.(.*)$`)
name := runtimeFunc.ReplaceAllString(funcObj.Name(), "$1")
log.Printf("%s took %s", name, elapsed)
}
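// Usage sketch: defer TimeTrack at function entry to log the elapsed
// wall-clock time under the calling function's name when it returns.
func timedOperationSketch() {
defer TimeTrack(time.Now())
// ... work being measured ...
}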


@@ -0,0 +1,81 @@
package bn254
import (
"testing"
)
func TestConvertUint32ArrToUint64Arr(t *testing.T) {
testCases := []struct {
name string
input [8]uint32
want [4]uint64
}{
{
name: "Test with incremental array",
input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8},
want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080},
},
{
name: "Test with all zeros",
input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0},
want: [4]uint64{0, 0, 0, 0},
},
{
name: "Test with maximum uint32 values",
input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295},
want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615},
},
{
name: "Test with alternating min and max uint32 values",
input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295},
want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295},
},
{
name: "Test with alternating max and min uint32 values",
input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0},
want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := ConvertUint32ArrToUint64Arr(tc.input)
if got != tc.want {
t.Errorf("got %v, want %v", got, tc.want)
}
})
}
}
func TestConvertUint64ArrToUint32Arr(t *testing.T) {
testCases := []struct {
name string
input [4]uint64
expected [8]uint32
}{
{
name: "test one",
input: [4]uint64{1, 2, 3, 4},
expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0},
},
{
name: "test two",
input: [4]uint64{100, 200, 300, 400},
expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0},
},
{
name: "test three",
input: [4]uint64{1000, 2000, 3000, 4000},
expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := ConvertUint64ArrToUint32Arr(tc.input)
if got != tc.expected {
t.Errorf("got %v, want %v", got, tc.expected)
}
})
}
}
