Merge remote-tracking branch 'origin/dev' into yshekel/ntt_integration

Implement Poseidon and TreeBuilder (#352 )
* BW scalar field is now the same as BLS base field * add poseidon * add merkle tree builder * poseidon rust bindings * implement rust bindings * add doc comments * remove global poseidon constants * add custom constants API and script for generating new constants * add the rest of the curves for poseidon * add all the curves for real * misname bls12-377 * typo * partial rounds * minor fixes * small tweak for big performance boost * add CHK_INIT_IF_RETURN --------- Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
2026-01-13 01:17:57 -05:00 · 2024-02-06 19:42:32 +02:00 · 2024-02-07 00:31:49 +07:00 · 2024-02-06 18:09:12 +02:00 · 2024-02-06 13:15:52 +02:00 · 2024-02-06 11:54:32 +02:00
336 changed files with 74839 additions and 22284 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,39 @@
+Language: Cpp
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveMacros: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterClass: true
+  AfterFunction: true
+BreakBeforeBinaryOperators: false
+BreakBeforeTernaryOperators: true
+ColumnLimit: 120
+ContinuationIndentWidth: 2
+Cpp11BracedListStyle: true
+DisableFormat: false
+IndentFunctionDeclarationAfterType: false
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: false
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: All
+PointerAlignment: Left
+SortIncludes: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+Standard: c++17
+UseTab: Never
--- a/.codespellignore
+++ b/.codespellignore
@@ -0,0 +1,3 @@
+inout
+crate
+lmit
--- a/.github/ISSUE_TEMPLATE/bug_issue.md
+++ b/.github/ISSUE_TEMPLATE/bug_issue.md
@@ -2,7 +2,7 @@
 name: ":bug: Bug Report"
 about: Create a bug report to help us improve the repo
 title: "[BUG]: "
-labels: bug
+labels: type:bug
 ---

 ## Description
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -2,7 +2,7 @@
 name: ":sparkles: Feature Request"
 about: Request the inclusion of a new feature or functionality
 title: "[FEAT]: "
-labels: enhancement
+labels: type:feature
 ---

 ## Description
--- a/.github/changed-files.yml
+++ b/.github/changed-files.yml
@@ -0,0 +1,13 @@
+golang:  # Go sources under goicicle/ plus the Go module manifest
+  - goicicle/**/*.go
+  - go.mod
+rust:
+  - wrappers/rust
+cpp:
+  - icicle/**/*.cu
+  - icicle/**/*.cuh
+  - icicle/**/*.cpp
+  - icicle/**/*.hpp
+  - icicle/**/*.c
+  - icicle/**/*.h
+  - icicle/CMakeLists.txt
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,49 +0,0 @@
-name: Build
-
-on: 
-  pull_request:
-    branches:
-      - "main"
-      - "dev"
-    paths:
-      - "icicle/**"
-      - "src/**"
-      - "Cargo.toml"
-      - "build.rs"
-
-env:
-  CARGO_TERM_COLOR: always
-  ARCH_TYPE: sm_70
-  DEFAULT_STREAM: per-thread
-
-jobs:
-  build-linux:
-    runs-on: ubuntu-latest
-
-    steps:
-    # Checkout code
-    - uses: actions/checkout@v3
-    # Download (or from cache) and install CUDA Toolkit 12.1.0
-    - uses: Jimver/cuda-toolkit@v0.2.9
-      id: cuda-toolkit
-      with:
-        cuda: '12.1.0'
-        use-github-cache: true
-      # Build from cargo - Rust utils are preinstalled on latest images
-      # https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
-    - name: Build
-      run: cargo build --release --verbose
-      
-  
-  build-windows:
-    runs-on: windows-latest
-
-    steps:     
-    - uses: actions/checkout@v3
-    - uses: Jimver/cuda-toolkit@v0.2.9
-      id: cuda-toolkit
-      with:
-        cuda: '12.1.0'
-        use-github-cache: true
-    - name: Build
-      run: cargo build --release --verbose
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,20 @@
+name: Check Spelling
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+
+jobs:
+  spelling-checker:
+    name: Check Spelling
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: codespell-project/actions-codespell@v2
+        with:
+          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
+          skip: ./**/target,./**/build
+          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file
+          ignore_words_file: .codespellignore
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -0,0 +1,50 @@
+# This workflow is a demo of how to run all examples in the Icicle repository.
+# For each language directory (c++, Rust, etc.) the workflow 
+#   (1) loops over all examples (msm, ntt, etc.) and 
+#   (2) runs ./compile.sh and ./run.sh in each directory.
+# The script ./compile.sh should compile the example and ./run.sh should run it.
+# Each script should return 0 for success and 1 otherwise.
+
+name: Examples
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+  push:
+    branches:
+      - main
+      - dev
+
+jobs:  
+  test-examples:
+    runs-on: [self-hosted, Linux, X64, icicle] # ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: c++ examples
+      working-directory: ./examples/c++
+      run: |
+        # loop over all directories in the current directory
+        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
+          if [ -d "$dir" ]; then
+            echo "Running command in $dir"
+            cd $dir
+            ./compile.sh
+            ./run.sh
+            cd -
+          fi
+        done    
+    - name: Rust examples
+      working-directory: ./examples/rust
+      run: |
+        # loop over all directories in the current directory
+        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
+          if [ -d "$dir" ]; then
+            echo "Running command in $dir"
+            cd $dir
+            cargo run --release
+            cd -
+          fi
+        done      
--- a/.github/workflows/main-build.yml
+++ b/.github/workflows/main-build.yml
@@ -0,0 +1,115 @@
+name: Build
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+  push:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  ARCH_TYPE: native
+
+jobs:
+  check-changed-files:
+    name: Check Changed Files
+    runs-on: ubuntu-22.04
+    outputs:
+      golang: ${{ steps.changed_files.outputs.golang }}
+      rust: ${{ steps.changed_files.outputs.rust }}
+      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Get all changed files
+      id: changed-files-yaml
+      uses: tj-actions/changed-files@v39
+      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
+      with:
+        files_yaml_from_source_file: .github/changed-files.yml
+    - name: Run Changed Files script
+      id: changed_files
+      # https://github.com/tj-actions/changed-files#outputs-
+      run: |
+        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+
+  build-rust-linux:
+    name: Build Rust on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build Rust
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
+
+  build-rust-windows:
+    name: Build Rust on Windows
+    runs-on: windows-2022
+    needs: check-changed-files
+    steps:     
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Download and Install Cuda
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      id: cuda-toolkit
+      uses: Jimver/cuda-toolkit@v0.2.11
+      with:
+        cuda: '12.0.0'
+        method: 'network'
+        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
+    - name: Build Rust Targets
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      env:
+        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
+
+  # TODO: Re-enable once Golang bindings for v1+ is finished
+  # build-golang-linux:
+  #   name: Build Golang on Linux
+  #   runs-on: [self-hosted, Linux, X64, icicle]
+  #   needs: check-changed-files
+  #   steps:
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v3
+  #   - name: Build CUDA libs
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     run: make all
+  #     working-directory: ./goicicle
+
+  # TODO: Add once Golang make file supports building for Windows
+  # build-golang-windows:
+  #   name: Build Golang on Windows
+  #   runs-on: windows-2022
+  #   needs: check-changed-files
+  #   steps:     
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v3
+  #   - name: Download and Install Cuda
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     uses: Jimver/cuda-toolkit@v0.2.11
+  #     with:
+  #       cuda: '12.0.0'
+  #       method: 'network'
+  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+  #       sub-packages: '["cudart", "nvcc", "thrust"]'
+  #   - name: Build cpp libs
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     run: make all
+  #     working-directory: ./goicicle
--- a/.github/workflows/main-format.yml
+++ b/.github/workflows/main-format.yml
@@ -0,0 +1,47 @@
+name: Format
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  formatting-rust:
+    name: Check Rust Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check rustfmt
+      working-directory: ./wrappers/rust
+      # "-name target -prune" skips searching in any directory named "target"
+      # Formatting by single file is necessary due to generated files not being present
+      # before building the project.
+      # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
+      # causing rustfmt to fail.
+      run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
+    # - name: Check clippy
+    #   run: cargo clippy --no-deps --all-features --all-targets
+
+  formatting-golang:
+    name: Check Golang Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check gofmt
+      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
+
+  formatting-cpp-cuda:
+    name: Check C++/CUDA Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check clang-format
+      # The excluded directories are grouped and pruned first; the file-name tests are grouped
+      # behind "-o" so that every listed extension (including *.h) is actually checked.
+      # Patterns are quoted so the shell hands them to find unexpanded.
+      # Any output from clang-format (stdout or stderr) fails the job.
+      run: if [[ $(find ./ \( -path ./icicle/build -o -path './**/target' -o -path ./examples \) -prune -o -type f \( -iname '*.h' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.c' -o -iname '*.cpp' \) -print | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -0,0 +1,94 @@
+name: Test
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  ARCH_TYPE: native
+
+jobs:
+  check-changed-files:
+    name: Check Changed Files
+    runs-on: ubuntu-22.04
+    outputs:
+      golang: ${{ steps.changed_files.outputs.golang }}
+      rust: ${{ steps.changed_files.outputs.rust }}
+      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Get all changed files
+      id: changed-files-yaml
+      uses: tj-actions/changed-files@v39
+      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
+      with:
+        files_yaml_from_source_file: .github/changed-files.yml
+    - name: Run Changed Files script
+      id: changed_files
+      # https://github.com/tj-actions/changed-files#outputs-
+      run: |
+        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+
+  test-rust-linux:
+    name: Test Rust on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Run Rust Tests
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # Running tests from the root workspace will run all workspace members' tests by default
+      # We need to limit the number of threads to avoid running out of memory on weaker machines
+      run: cargo test --release --verbose --features=g2 -- --test-threads=2
+
+  test-cpp-linux:
+    name: Test C++ on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build C++
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        mkdir -p build
+        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
+        cmake --build build
+    - name: Run C++ Tests
+      working-directory: ./icicle/build
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
+  
+  # TODO: Re-enable once Golang bindings for v1+ is finished
+  # test-golang-linux:
+  #   name: Test Golang on Linux
+  #   runs-on: [self-hosted, Linux, X64, icicle]
+  #   needs: check-changed-files
+  #   steps:
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v3
+  #   - name: Build CUDA libs
+  #     working-directory: ./goicicle
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     run: make libbn254.so
+  #   - name: Run Golang Tests
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     run: |
+  #       export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
+  #       go test ./goicicle/curves/bn254 -count=1
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,9 @@
 *.cubin
 *.bin
 *.fatbin
+*.so
+*.nsys-rep
+*.ncu-rep
 **/target
 **/.vscode
 **/.*lock*csv#
@@ -12,3 +15,7 @@
 **/.DS_Store
 **/Cargo.lock
 **/icicle/build/
+**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
+**/build
+**/icicle/appUtils/large_ntt/work
+icicle/appUtils/large_ntt/work/test_ntt
--- a/.rustfmt.toml
+++ b/.rustfmt.toml
@@ -0,0 +1,10 @@
+# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md
+
+# Stable Configs
+chain_width = 0
+max_width = 120
+merge_derives = true
+use_field_init_shorthand = true
+use_try_shorthand = true
+
+# Unstable Configs
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Ingonyama"
+title: "ICICLE: GPU Library for ZK Acceleration"
+version: 1.0.0
+date-released: 2024-01-04
+url: "https://github.com/ingonyama-zk/icicle"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +0,0 @@
-[workspace]
-name = "icicle"
-version = "0.1.0"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-members = ["icicle-core", "bls12-381", "bls12-377", "bn254"]
-
--- a/28
+++ b/28
@@ -0,0 +1,28 @@
+# Use the specified base image
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Golang
+ENV GOLANG_VERSION 1.21.1
+RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the content of the local directory to the working directory
+COPY . .
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
--- a/README.md
+++ b/README.md
@@ -1,150 +1,124 @@
 # ICICLE
- <div align="center">Icicle is a library for ZK acceleration using CUDA-enabled GPUs.</div>
+ **<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**

                  
-![image (4)](https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png)
+<p align="center">
+  <img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
+</p>
+
+<p align="center">
+  <a href="https://discord.gg/EVVXTdt6DF">
+    <img src="https://img.shields.io/discord/1063033227788423299?logo=discord" alt="Chat with us on Discord">
+  </a>
+  <a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
+    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
+  </a>
+</p>

 ## Background

 Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.

-If we want ZK hardware today we have FPGAs or GPUs which are relatively inexpensive. However, the biggest selling point of GPUs is the software; we talk in particular about CUDA, which makes it easy to write code running on Nvidia GPUs, taking advantage of their highly parallel architecture. Together with the widespread availability of these devices, if we can get GPUs to work on ZK workloads, then we have made a giant step towards accessible and efficient ZK provers.
+We believe GPUs are as important for ZK as for AI.

-## Zero Knowledge on GPU
+- GPUs are a perfect match for ZK compute - around 97% of ZK protocol runtime is parallel by nature.
+- GPUs are simple for developers to use and scale compared to other hardware platforms.
+- GPUs are extremely competitive in terms of power / performance and price (3x cheaper).
+- GPUs are popular and readily available.

-ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE currently provides support for MSM, NTT, and ECNTT, with plans to support Hash functions soon.
+## Getting Started

-### Supported primitives
+ICICLE is a CUDA implementation of general functions widely used in ZKP.

- Fields
-    - Scalars
-    - Points
-        - Projective: {x, y, z}
-        - Affine: {x, y}
- Curves
-    - [BLS12-381]
-    - [BLS12-377]
-    - [BN254]
+> [!NOTE]
+> Developers: We highly recommend reading our [documentation]

-## Build and usage
+> [!TIP]
+> Try out ICICLE by running some [examples] using ICICLE in C++ and our Rust bindings 

-> NOTE: [NVCC] is a prerequisite for building.
+### Prerequisites

-1. Define or select a curve for your application; we've provided a [template][CRV_TEMPLATE] for defining a curve
-2. Include the curve in [`curve_config.cuh`][CRV_CONFIG]
-3. Now you can build the ICICLE library using nvcc
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 12.0 or newer.
+- [CMake](https://cmake.org/files/), version 3.18 and above. Latest version is recommended.
+- [GCC](https://gcc.gnu.org/install/download.html) version 9, latest version is recommended.
+- Any Nvidia GPU (which supports CUDA Toolkit version 12.0 or above).

-```sh
-mkdir -p build
-nvcc -o build/<ENTER_DIR_NAME> ./icicle/appUtils/ntt/ntt.cu ./icicle/appUtils/msm/msm.cu ./icicle/appUtils/vector_manipulation/ve_mod_mult.cu ./icicle/primitives/projective.cu -lib -arch=native
-```
+> [!NOTE] 
+> It is possible to use CUDA 11 for cards which don't support CUDA 12; however, we don't officially support this version and there may be issues in the future.

-### Testing the CUDA code
+### Accessing Hardware

-We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:
+If you don't have access to an Nvidia GPU we have some options for you. 

-```sh
-mkdir -p build
-cmake -S . -B build
-cmake --build build
-cd build && ctest
-```
+Check out [Google Colab](https://colab.google/). Google Colab offers a free [T4 GPU](https://www.nvidia.com/en-us/data-center/tesla-t4/) instance and ICICLE can be used with it; see this guide for setting up your [Google Colab workplace][GOOGLE-COLAB-ICICLE].

-### Rust Bindings
+If you require more compute and have an interesting research project, we have [bounty and grant programs][GRANT_PROGRAM].

-For convenience, we also provide rust bindings to the ICICLE library for the following primitives:

- MSM
- NTT
-    - Forward NTT
-    - Inverse NTT
- ECNTT
-    - Forward ECNTT
-    - Inverse NTT
- Scalar Vector Multiplication
- Point Vector Multiplication
+### Build systems

-A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environement variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for and it defaults to `native` when it is not set allowing the compiler to detect the installed GPU type.
+ICICLE has three build systems.

-> NOTE: A GPU must be detectable and therefore installed if the `ARCH_TYPE` is not set.
+- [ICICLE core][ICICLE-CORE], C++ and CUDA
+- [ICICLE Rust][ICICLE-RUST] bindings, requires [Rust](https://www.rust-lang.org/) version 1.70 and above
+- [ICICLE Golang][ICICLE-GO] bindings, requires [Go](https://go.dev/) version 1.20 and above

-Once you have your parameters set, run:
+ICICLE core always needs to be built as part of the other build systems as it contains the core ICICLE primitives implemented in CUDA. Reference these guides for the different build systems, [ICICLE core guide][ICICLE-CORE-README], [ICICLE Rust guide][ICICLE-RUST-README] and [ICICLE Golang guide][ICICLE-GO-README].

-```sh
-cargo build --release
-```
+### Compiling ICICLE

-You'll find a release ready library at `target/release/libicicle_utils.rlib`.
+Running ICICLE via Rust bindings is highly recommended and simple:
+- Clone this repo
+  - go to our [Rust bindings][ICICLE-RUST]
+  - Enter a [curve](./wrappers/rust/icicle-curves) implementation
+  - run `cargo build --release` to build or `cargo test -- --test-threads=1` to build and execute tests

-To benchmark and test the functionality available in RUST, run:
+If you want to compile and run the core ICICLE C++ tests, just follow these steps:
+- Clone this repo
+  - go to [ICICLE core][ICICLE-CORE]
+  - execute the small [script](https://github.com/ingonyama-zk/icicle/tree/main/icicle#running-tests) to compile via cmake and run c++ and cuda tests
+
+## Docker
+
+We offer a simple Docker container so you can simply run ICICLE without setting everything up locally.

 ```
-cargo bench
-cargo test -- --test-threads=1
+docker build -t <name_of_your_choice> .
+docker run --gpus all -it <name_of_your_choice> /bin/bash
 ```

-The flag `--test-threads=1` is needed because currently some tests might interfere with one another inside the GPU.
-
-### Example Usage
-
-An example of using the Rust bindings library can be found in our [fast-danksharding implementation][FDI]
-
-### Supporting Additional Curves
-
-Supporting additional curves can be done as follows:
-
-Create a JSON file with the curve parameters. The curve is defined by the following parameters: 
- ``curve_name`` - e.g. ``bls12_381``.
- ``modolus_p`` - scalar field modolus (in decimal).
- ``bit_count_p`` - number of bits needed to represent `` modolus_p`` .
- ``limb_p`` - number of bytes needed to represent `` modolus_p``  (rounded).
- ``ntt_size`` - log of the maximal size subgroup of the scalar field.    
- ``modolus_q`` - base field modulus (in decimal).
- ``bit_count_q`` - number of bits needed to represent `` modolus_q`` .
- ``limb_q`` number of bytes needed to represent `` modolus_p``  (rounded).
- ``weierstrass_b`` - Weierstrauss constant of the curve. 
- ``gen_x`` - x-value of a generator element for the curve. 
- ``gen_y`` - y-value of a generator element for the curve.
-
-Here's an example for BLS12-381.
-```
-{
-    "curve_name" : "bls12_381", 
-    "modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
-    "bit_count_p" : 255,
-    "limb_p" :  8,
-    "ntt_size" : 32,
-    "modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
-    "bit_count_q" : 381,
-    "limb_q" : 12,
-    "weierstrass_b" : 4,
-    "gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
-    "gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
-}
-```
-
-Save the parameters JSON file in ``curve_parameters``.
-
-Then run the Python script ``new_curve_script.py `` from the main icicle folder:
-
-```
-python3 ./curve_parameters/new_curve_script_rust.py ./curve_parameters/bls12_381.json
-```
-
-The script does the following:
- Creates a folder in ``icicle/curves`` with the curve name, which contains all of the files needed for the supported operations in cuda.
- Adds the curve exported operations to ``icicle/curves/index.cu``. 
- Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve. 
- Creates a test file with the curve name in ``src``. 
-
-Testing the new curve could be done by running the tests in ``tests_curve_name`` (e.g. ``tests_bls12_381``).
 ## Contributions

-Join our [Discord Server](https://discord.gg/Y4SkbDf2Ff) and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
+Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
+
+### Development Contributions
+
+If you are changing code, please make sure to change your [git hooks path][HOOKS_DOCS] to the repo's [hooks directory][HOOKS_PATH] by running the following command:
+
+```sh
+git config core.hooksPath ./scripts/hooks
+```
+
+In case `clang-format` is missing on your system, you can install it using the following command:
+
+```sh
+sudo apt install clang-format
+```
+
+You will also need to install [codespell](https://github.com/codespell-project/codespell?tab=readme-ov-file#installation) to check for typos.
+
+This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.

 ### Hall of Fame

- [Robik](https://github.com/robik75), for his on-going support and mentorship 
+- [Robik](https://github.com/robik75), for his ongoing support and mentorship
+- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
+- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
+
+## Help & Support
+
+For help and support talk to our devs in our discord channel ["ICICLE"](https://discord.gg/EVVXTdt6DF) 
+

 ## License

@@ -153,13 +127,26 @@ ICICLE is distributed under the terms of the MIT License.
 See [LICENSE-MIT][LMIT] for details.

 <!-- Begin Links -->
-[BLS12-381]: ./icicle/curves/bls12_381.cuh
+[BLS12-381]: ./icicle/curves/
+[BLS12-377]: ./icicle/curves/
+[BN254]: ./icicle/curves/
+[BW6-761]: ./icicle/curves/
 [NVCC]: https://docs.nvidia.com/cuda/#installation-guides
-[CRV_TEMPLATE]: ./icicle/curves/curve_template.cuh
-[CRV_CONFIG]: ./icicle/curves/curve_config.cuh
-[B_SCRIPT]: ./build.rs
-[FDI]: https://github.com/ingonyama-zk/fast-danksharding
 [LMIT]: ./LICENSE
+[DISCORD]: https://discord.gg/Y4SkbDf2Ff
 [googletest]: https://github.com/google/googletest/
+[HOOKS_DOCS]: https://git-scm.com/docs/githooks
+[HOOKS_PATH]: ./scripts/hooks/
+[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
+[GOOGLE-COLAB-ICICLE]: https://dev.ingonyama.com/icicle/colab-instructions
+[GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
+[ICICLE-CORE]: ./icicle/
+[ICICLE-RUST]: ./wrappers/rust/
+[ICICLE-GO]: ./goicicle/
+[ICICLE-CORE-README]: ./icicle/README.md
+[ICICLE-RUST-README]: ./wrappers/rust/README.md
+[ICICLE-GO-README]: ./goicicle/README.md
+[documentation]: https://dev.ingonyama.com/icicle/overview
+[examples]: ./examples/

 <!-- End Links -->
--- a/bls12-377/Cargo.toml
+++ b/bls12-377/Cargo.toml
@@ -1,34 +0,0 @@
-[package]
-name = "bls12-377"
-version = "0.1.0"
-edition = "2021"
-authors = [ "Ingonyama" ]
-
-[dependencies]
-icicle-core = { path = "../icicle-core" }
-
-hex = "*"
-ark-std = "0.3.0"
-ark-ff = "0.3.0"
-ark-poly = "0.3.0"
-ark-ec = { version = "0.3.0", features = [ "parallel" ] }
-ark-bls12-377 = "0.3.0"
-
-serde = { version = "1.0", features = ["derive"] }
-serde_derive = "1.0"
-serde_cbor = "0.11.2"
-
-rustacuda = "0.1"
-rustacuda_core = "0.1"
-rustacuda_derive = "0.1"
-
-rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
-
-[build-dependencies]
-cc = { version = "1.0", features = ["parallel"] }
-
-[dev-dependencies]
-"criterion" = "0.4.0"
-
-[features]
-g2 = []
--- a/bls12-377/build.rs
+++ b/bls12-377/build.rs
@@ -1,34 +0,0 @@
-use std::env;
-
-fn main() {
-    //TODO: check cargo features selected
-    //TODO: can conflict/duplicate with make ?
-
-    println!("cargo:rerun-if-env-changed=CXXFLAGS");
-    println!("cargo:rerun-if-changed=./icicle");
-
-    let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
-    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
-
-    let mut arch = String::from("-arch=");
-    arch.push_str(&arch_type);
-    let mut stream = String::from("-default-stream=");
-    stream.push_str(&stream_type);
-
-    let mut nvcc = cc::Build::new();
-
-    println!("Compiling icicle library using arch: {}", &arch);
-
-    if cfg!(feature = "g2") {
-        nvcc.define("G2_DEFINED", None);
-    }
-    nvcc.cuda(true);
-    nvcc.define("FEATURE_BLS12_377", None);
-    nvcc.debug(false);
-    nvcc.flag(&arch);
-    nvcc.flag(&stream);
-    nvcc.files([
-        "../icicle-cuda/curves/index.cu",
-    ]);
-    nvcc.compile("ingo_icicle"); //TODO: extension??
-}
--- a/bls12-377/src/basic_structs/field.rs
+++ b/bls12-377/src/basic_structs/field.rs
@@ -1,4 +0,0 @@
-pub trait Field<const NUM_LIMBS: usize> {
-    const MODOLUS: [u32;NUM_LIMBS];
-    const LIMBS: usize = NUM_LIMBS;
-}
--- a/bls12-377/src/basic_structs/mod.rs
+++ b/bls12-377/src/basic_structs/mod.rs
@@ -1,3 +0,0 @@
-pub mod field; 
-pub mod scalar; 
-pub mod point; 
--- a/bls12-377/src/basic_structs/point.rs
+++ b/bls12-377/src/basic_structs/point.rs
@@ -1,106 +0,0 @@
-use std::ffi::c_uint;
-
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger256, PrimeField};
-use std::mem::transmute;
-use ark_ff::Field;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-
-use super::scalar::{get_fixed_limbs, self};
-
-
-#[derive(Debug, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointT<BF: scalar::ScalarTrait> {
-    pub x: BF,
-    pub y: BF,
-    pub z: BF,
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
-    fn default() -> Self {
-        PointT::zero()
-    }
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
-    pub fn zero() -> Self {
-        PointT {
-            x: BF::zero(),
-            y: BF::one(),
-            z: BF::zero(),
-        }
-    }
-
-    pub fn infinity() -> Self {
-        Self::zero()
-    }
-}
-
-#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointAffineNoInfinityT<BF> {
-    pub x: BF,
-    pub y: BF,
-}
-
-impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
-    fn default() -> Self {
-        PointAffineNoInfinityT {
-            x: BF::zero(),
-            y: BF::zero(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
-    ///From u32 limbs x,y
-    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
-        PointAffineNoInfinityT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y)
-        }
-    }
-
-    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
-    }
-
-    pub fn to_projective(&self) -> PointT<BF> {
-        PointT {
-            x: self.x,
-            y: self.y,
-            z: BF::one(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointT<BF>  {
-    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
-        PointT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y),
-            z: BF::from_limbs(z)
-        }
-    }
-
-    pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
-        let l = value.len();
-        assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
-        PointT {
-            x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
-            y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
-            z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
-        }
-    }
-
-    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
-        PointAffineNoInfinityT {
-            x: self.x,
-            y: self.y,
-        }
-    }
-}
--- a/bls12-377/src/basic_structs/scalar.rs
+++ b/bls12-377/src/basic_structs/scalar.rs
@@ -1,102 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination};
-
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use super::field::{Field, self};
-
-pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
-    match val.len() {
-        n if n < NUM_LIMBS => {
-            let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
-            padded[..val.len()].copy_from_slice(&val);
-            padded
-        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
-        _ => panic!("slice has too many elements"),
-    }
-}
-
-pub trait ScalarTrait{
-    fn base_limbs() -> usize;
-    fn zero() -> Self;
-    fn from_limbs(value: &[u32]) -> Self;
-    fn one() -> Self;
-    fn to_bytes_le(&self) -> Vec<u8>;
-    fn limbs(&self) -> &[u32];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy)]
-#[repr(C)]
-pub struct ScalarT<M, const NUM_LIMBS: usize> {
-    pub(crate) phantom: PhantomData<M>,
-    pub(crate) value : [u32; NUM_LIMBS]
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
-where
-    M: Field<NUM_LIMBS>,
-{
-
-    fn base_limbs() -> usize {
-        return NUM_LIMBS; 
-    }
-
-    fn zero() -> Self {
-        ScalarT {
-            value: [0u32; NUM_LIMBS],
-            phantom: PhantomData,
-        }
-    }
-
-    fn from_limbs(value: &[u32]) -> Self {
-        Self {
-            value: get_fixed_limbs(value),
-            phantom: PhantomData,
-        }
-    }
-
-    fn one() -> Self {
-        let mut s = [0u32; NUM_LIMBS];
-        s[0] = 1;
-        ScalarT { value: s, phantom: PhantomData }
-    }
-
-    fn to_bytes_le(&self) -> Vec<u8> {
-        self.value
-            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
-            .flatten()
-            .collect::<Vec<_>>()
-    }
-
-    fn limbs(&self) -> &[u32] {
-        &self.value
-    }
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
-    pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-        Self::from_limbs(value)
-     }
- 
-    pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-         let mut value = value.to_vec();
-         value.reverse();
-         Self::from_limbs_le(&value)
-     }
- 
-     // Additional Functions
-     pub fn add(&self, other:ScalarT<M, NUM_LIMBS>) -> ScalarT<M,NUM_LIMBS>{  // overload + 
-         return ScalarT{value: [self.value[0] + other.value[0];NUM_LIMBS], phantom: PhantomData }; 
-     }
-}
--- a/bls12-377/src/curve_structs.rs
+++ b/bls12-377/src/curve_structs.rs
@@ -1,62 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
-use crate::basic_structs::scalar::ScalarT;
-use crate::basic_structs::field::Field;
-
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct ScalarField;
-impl Field<8> for ScalarField {
-    const MODOLUS: [u32; 8] = [0x0;8];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct BaseField;
-impl Field<12> for BaseField {
-    const MODOLUS: [u32; 12] = [0x0;12];
-}
-
-
-pub type Scalar = ScalarT<ScalarField,8>;
-impl Default for Scalar {
-    fn default() -> Self {
-        Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Scalar{}
-
-
-pub type Base = ScalarT<BaseField,12>;
-impl Default for Base {
-    fn default() -> Self {
-        Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Base{}
-
-pub type Point = PointT<Base>;
-pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
-
-extern "C" {
-    fn eq(point1: *const Point, point2: *const Point) -> c_uint;
-}
-
-impl PartialEq for Point {
-    fn eq(&self, other: &Self) -> bool {
-        unsafe { eq(self, other) != 0 }
-    }
-}
--- a/bls12-377/src/from_cuda.rs
+++ b/bls12-377/src/from_cuda.rs
@@ -1,798 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-use crate::basic_structs::scalar::ScalarTrait;
-use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bls12_377::{Fq as Fq_BLS12_377, Fr as Fr_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-extern "C" {
-    fn msm_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn msm_batch_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        batch_size: usize,
-        msm_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        batch_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
-
-    fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ntt_batch_cuda(
-        inout: *mut Scalar,
-        arr_size: usize,
-        n: usize,
-        inverse: bool,
-    ) -> c_int;
-
-    fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
-
-    fn interpolate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>, 
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_batch_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_batch_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn vec_mod_mult_point(
-        inout: *mut Point,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn vec_mod_mult_scalar(
-        inout: *mut Scalar,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn matrix_vec_mod_mult(
-        matrix_flattened: *const Scalar,
-        input: *const Scalar,
-        output: *mut Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-}
-
-pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-
-    let mut ret = Point::zero();
-    unsafe {
-        msm_cuda(
-            &mut ret as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            scalars.len(),
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn msm_batch(
-    points: &[PointAffineNoInfinity],
-    scalars: &[Scalar],
-    batch_size: usize,
-    device_id: usize,
-) -> Vec<Point> {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-
-    let mut ret = vec![Point::zero(); batch_size];
-
-    unsafe {
-        msm_batch_cuda(
-            &mut ret[0] as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            batch_size,
-            count / batch_size,
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn commit(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-) -> DeviceBox<Point> {
-    let mut res = DeviceBox::new(&Point::zero()).unwrap();
-    unsafe {
-        commit_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len(),
-            0,
-        );
-    }
-    return res;
-}
-
-pub fn commit_batch(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
-    unsafe {
-        commit_batch_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len() / batch_size,
-            batch_size,
-            0,
-        );
-    }
-    return res;
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
-    let ret_code = unsafe {
-        ntt_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    };
-    ret_code
-}
-
-pub fn ntt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, false);
-}
-
-pub fn intt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, true);
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal_batch(
-    values: &mut [Scalar],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ntt_batch_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ntt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, false);
-}
-
-pub fn intt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, true);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
-    unsafe {
-        ecntt_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    }
-}
-
-pub fn ecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, false, device_id);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, true, device_id);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal_batch(
-    values: &mut [Point],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ecntt_batch_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, false);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, true);
-}
-
-pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
-    unsafe {
-        DeviceBuffer::from_raw_parts(build_domain_cuda(
-            domain_size,
-            logn,
-            inverse,
-            0
-        ), domain_size)
-    }
-}
-
-
-pub fn reverse_order_scalars(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-) {
-    unsafe { reverse_order_scalars_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_scalars_batch(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_scalars_batch_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn reverse_order_points(
-    d_points: &mut DeviceBuffer<Point>,
-) {
-    unsafe { reverse_order_points_cuda(
-        d_points.as_device_ptr(),
-        d_points.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_points_batch(
-    d_points: &mut DeviceBuffer<Point>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_points_batch_cuda(
-        d_points.as_device_ptr(),
-        d_points.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn interpolate_scalars(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_scalars_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_scalars_batch(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_scalars_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_points_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points_batch(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_points_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn evaluate_scalars(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_point(
-            a as *mut _ as *mut Point,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_scalar(
-            a as *mut _ as *mut Scalar,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-// Multiply a matrix by a scalar:
-//  `a` - flattenned matrix;
-//  `b` - vector to multiply `a` by;
-pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
-    let mut c = Vec::with_capacity(b.len());
-    for i in 0..b.len() {
-        c.push(Scalar::zero());
-    }
-    unsafe {
-        matrix_vec_mod_mult(
-            a as *const _ as *const Scalar,
-            b as *const _ as *const Scalar,
-            c.as_mut_slice() as *mut _ as *mut Scalar,
-            b.len(),
-            device_id,
-        );
-    }
-    c
-}
-
-pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
-    let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
-    unsafe { buf_cpy.copy_from(buf) };
-    return buf_cpy;
-}
-
-pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
-    let rng: Box<dyn RngCore> = match seed {
-        Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
-        None => Box::new(rand::thread_rng()),
-    };
-    rng
-}
-
-fn set_up_device() {
-    // Set up the context, load the module, and create a stream to run kernels in.
-    rustacuda::init(CudaFlags::empty()).unwrap();
-    let device = Device::get_device(0).unwrap();
-    let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
-}
-
-pub fn generate_random_points(
-    count: usize,
-    mut rng: Box<dyn RngCore>,
-) -> Vec<PointAffineNoInfinity> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BLS12_377::rand(&mut rng)).to_xy_strip_z())
-        .collect()
-}
-
-pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BLS12_377::rand(&mut rng)))
-        .collect()
-}
-
-pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
-    (0..count)
-        .map(|_| Scalar::from_ark(Fr_BLS12_377::rand(&mut rng).into_repr()))
-        .collect()
-}
-
-pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalar 
-    let vector = generate_random_points_proj(test_size, get_rng(seed));
-    let mut vector_mut = vector.clone();
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
-pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalars
-    let mut vector_mut = generate_random_scalars(test_size, get_rng(seed));
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
--- a/bls12-377/src/lib.rs
+++ b/bls12-377/src/lib.rs
@@ -1,4 +0,0 @@
-pub mod test_bls12_377;
-pub mod basic_structs;
-pub mod from_cuda;
-pub mod curve_structs;
--- a/bls12-377/src/test_bls12_377.rs
+++ b/bls12-377/src/test_bls12_377.rs
@@ -1,816 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-pub use crate::basic_structs::scalar::ScalarTrait;
-pub use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bls12_377::{Fq as Fq_BLS12_377, Fr as Fr_BLS12_377, G1Affine as G1Affine_BLS12_377, G1Projective as G1Projective_BLS12_377};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-
-impl Scalar {
-    pub fn to_biginteger254(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_biginteger256(ark: BigInteger256) -> Self {
-        Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
-    }
-
-    pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
-        unsafe { transmute(*self) }
-    }
-
-    pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
-        Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
-    }
-
-    pub fn to_ark_transmute(&self) -> Fr_BLS12_377 {
-        unsafe { std::mem::transmute(*self) }
-    }
-
-    pub fn from_ark_transmute(v: &Fr_BLS12_377) -> Scalar {
-        unsafe { std::mem::transmute_copy(v) }
-    }
-
-    pub fn to_ark_mod_p(&self) -> Fr_BLS12_377 {
-        Fr_BLS12_377::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
-    }
-
-    pub fn to_ark_repr(&self) -> Fr_BLS12_377 {
-        Fr_BLS12_377::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
-    }
-
-    pub fn from_ark(v: BigInteger256) -> Scalar {
-        Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
-    }
-
-}
-
-impl Base {
-    pub fn to_ark(&self) -> BigInteger384 {
-        BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_ark(ark: BigInteger384) -> Self {
-        Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
-    }
-}
-
-
-impl Point {
-    pub fn to_ark(&self) -> G1Projective_BLS12_377 {
-        self.to_ark_affine().into_projective()
-    }
-
-    pub fn to_ark_affine(&self) -> G1Affine_BLS12_377 {
-        //TODO: generic conversion
-        use ark_ff::Field;
-        use std::ops::Mul;
-        let proj_x_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BLS12_377::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
-        let aff_x = proj_x_field.mul(inverse_z);
-        let aff_y = proj_y_field.mul(inverse_z);
-        G1Affine_BLS12_377::new(aff_x, aff_y, false)
-    }
-
-    pub fn from_ark(ark: G1Projective_BLS12_377) -> Point {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
-        let z_invsq = z_inv * z_inv;
-        let z_invq3 = z_invsq * z_inv;
-        Point {
-            x: Base::from_ark((ark.x * z_invsq).into_repr()),
-            y: Base::from_ark((ark.y * z_invq3).into_repr()),
-            z: Base::one(),
-        }
-    }
-}
-
-impl PointAffineNoInfinity {
-
-    pub fn to_ark(&self) -> G1Affine_BLS12_377 {
-        G1Affine_BLS12_377::new(Fq_BLS12_377::new(self.x.to_ark()), Fq_BLS12_377::new(self.y.to_ark()), false)
-    }
-
-    pub fn to_ark_repr(&self) -> G1Affine_BLS12_377 {
-        G1Affine_BLS12_377::new(
-            Fq_BLS12_377::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BLS12_377::from_repr(self.y.to_ark()).unwrap(),
-            false,
-        )
-    }
-
-    pub fn from_ark(p: &G1Affine_BLS12_377) -> Self {
-        PointAffineNoInfinity {
-            x: Base::from_ark(p.x.into_repr()),
-            y: Base::from_ark(p.y.into_repr()),
-        }
-    }
-}
-
-impl Point {
-    pub fn to_affine(&self) -> PointAffineNoInfinity {
-        let ark_affine = self.to_ark_affine();
-        PointAffineNoInfinity {
-            x: Base::from_ark(ark_affine.x.into_repr()),
-            y: Base::from_ark(ark_affine.y.into_repr()),
-        }
-    }
-}
-
-
-#[cfg(test)]
-pub(crate) mod tests_bls12_377 {
-    use std::ops::Add;
-    use ark_bls12_377::{Fr, G1Affine, G1Projective};
-    use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
-    use ark_ff::{FftField, Field, Zero, PrimeField};
-    use ark_std::UniformRand;
-    use rustacuda::prelude::{DeviceBuffer, CopyDestination};
-    use crate::curve_structs::{Point, Scalar, Base};
-    use crate::basic_structs::scalar::ScalarTrait;
-    use crate::from_cuda::{generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars, commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch, ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch, set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch, interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars, evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset, mult_matrix_by_vec, mult_sc_vec, multp_vec,evaluate_scalars_batch, evaluate_points_batch, evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch};
-
-    fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
-        let mut rng = ark_std::rand::thread_rng();
-        let mut points_ga: Vec<G1Projective> = Vec::new();
-        for _ in 0..nof_elements {
-            let aff = G1Projective::rand(&mut rng);
-            points_ga.push(aff);
-        }
-        points_ga
-    }
-
-    fn ecntt_arc_naive(
-        points: &Vec<G1Projective>,
-        size: usize,
-        inverse: bool,
-    ) -> Vec<G1Projective> {
-        let mut result: Vec<G1Projective> = Vec::new();
-        for _ in 0..size {
-            result.push(G1Projective::zero());
-        }
-        let rou: Fr;
-        if !inverse {
-            rou = Fr::get_root_of_unity(size).unwrap();
-        } else {
-            rou = Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap();
-        }
-        for k in 0..size {
-            for l in 0..size {
-                let pow: [u64; 1] = [(l * k).try_into().unwrap()];
-                let mul_rou = Fr::pow(&rou, &pow);
-                result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
-            }
-        }
-        if inverse {
-            let size2 = size as u64;
-            for k in 0..size {
-                let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
-                result[k] = result[k].into_affine().mul(multfactor);
-            }
-        }
-        return result;
-    }
-
-    fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
-        let mut eq = true;
-        for i in 0..points.len() {
-            if points2[i].ne(&points[i]) {
-                eq = false;
-                break;
-            }
-        }
-        return eq;
-    }
-
-    fn test_naive_ark_ecntt(size: usize) {
-        let points = random_points_ark_proj(size);
-        let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
-        let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
-        assert!(!check_eq(&result2, &result1));
-        assert!(check_eq(&result2, &points));
-    }
-
-    #[test]
-    fn test_msm() {
-        let test_sizes = [6, 9];
-
-        for pow2 in test_sizes {
-            let count = 1 << pow2;
-            let seed = None; // set Some to provide seed
-            let points = generate_random_points(count, get_rng(seed));
-            let scalars = generate_random_scalars(count, get_rng(seed));
-
-            let msm_result = msm(&points, &scalars, 0);
-
-            let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
-            let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
-
-            let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
-
-            assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
-            assert_eq!(msm_result.to_ark(), msm_result_ark);
-            assert_eq!(
-                msm_result.to_ark_affine(),
-                Point::from_ark(msm_result_ark).to_ark_affine()
-            );
-        }
-    }
-
-    #[test]
-    fn test_batch_msm() {
-        for batch_pow2 in [2, 4] {
-            for pow2 in [4, 6] {
-                let msm_size = 1 << pow2;
-                let batch_size = 1 << batch_pow2;
-                let seed = None; // set Some to provide seed
-                let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
-                let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
-
-                let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
-                let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
-
-                let expected: Vec<_> = point_r_ark
-                    .chunks(msm_size)
-                    .zip(scalars_r_ark.chunks(msm_size))
-                    .map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
-                    .collect();
-
-                let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
-
-                assert_eq!(result, expected);
-            }
-        }
-    }
-
-    #[test]
-    fn test_commit() {
-        let test_size = 1 << 8;
-        let seed = Some(0);
-        let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
-        let mut points = generate_random_points(test_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm(&points, &scalars, 0);
-        let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
-        let mut h_commit_result = Point::zero();
-        d_commit_result.copy_to(&mut h_commit_result).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        assert_ne!(msm_result, Point::zero());
-        assert_ne!(h_commit_result, Point::zero());
-    }
-
-    #[test]
-    fn test_batch_commit() {
-        let batch_size = 4;
-        let test_size = 1 << 12;
-        let seed = Some(0);
-        let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
-        let points = generate_random_points(test_size * batch_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm_batch(&points, &scalars, batch_size, 0);
-        let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
-        let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
-        d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        for h in h_commit_result {
-            assert_ne!(h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_ntt() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 3;
-
-        let scalars = generate_random_scalars(test_size, get_rng(seed));
-
-        let mut ntt_result = scalars.clone();
-        ntt(&mut ntt_result, 0);
-
-        assert_ne!(ntt_result, scalars);
-
-        let mut intt_result = ntt_result.clone();
-
-        intt(&mut intt_result, 0);
-
-        assert_eq!(intt_result, scalars);
-
-        //ECNTT
-        let points_proj = generate_random_points_proj(test_size, get_rng(seed));
-
-        test_naive_ark_ecntt(test_size);
-
-        assert!(points_proj[0].to_ark().into_affine().is_on_curve());
-
-        //naive ark
-        let points_proj_ark = points_proj
-            .iter()
-            .map(|p| p.to_ark())
-            .collect::<Vec<G1Projective>>();
-
-        let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
-
-        let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
-
-        assert_eq!(points_proj_ark, iecntt_result_naive);
-
-        //ingo gpu
-        let mut ecntt_result = points_proj.to_vec();
-        ecntt(&mut ecntt_result, 0);
-
-        assert_ne!(ecntt_result, points_proj);
-
-        let mut iecntt_result = ecntt_result.clone();
-        iecntt(&mut iecntt_result, 0);
-
-        assert_eq!(
-            iecntt_result_naive,
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-        assert_eq!(
-            iecntt_result
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>(),
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-    }
-
-    #[test]
-    fn test_ntt_batch() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 5;
-        let batches = 4;
-
-        let scalars_batch: Vec<Scalar> =
-            generate_random_scalars(test_size * batches, get_rng(seed));
-
-        let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
-
-        for i in 0..batches {
-            scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result = scalars_batch.clone();
-
-        // do batch ntt
-        ntt_batch(&mut ntt_result, test_size, 0);
-
-        let mut ntt_result_vec_of_vec = Vec::new();
-
-        // do ntt for every chunk
-        for i in 0..batches {
-            ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
-            ntt(&mut ntt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the ntt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_vec_of_vec[i],
-                ntt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        // check that ntt output is different from input
-        assert_ne!(ntt_result, scalars_batch);
-
-        let mut intt_result = ntt_result.clone();
-
-        // do batch intt
-        intt_batch(&mut intt_result, test_size, 0);
-
-        let mut intt_result_vec_of_vec = Vec::new();
-
-        // do intt for every chunk
-        for i in 0..batches {
-            intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
-            intt(&mut intt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the intt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_vec_of_vec[i],
-                intt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result, scalars_batch);
-
-        // //ECNTT
-        let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
-
-        let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
-
-        for i in 0..batches {
-            points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result_points = points_proj.clone();
-
-        // do batch ecintt
-        ecntt_batch(&mut ntt_result_points, test_size, 0);
-
-        let mut ntt_result_points_vec_of_vec = Vec::new();
-
-        for i in 0..batches {
-            ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
-            ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
-        }
-
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_points_vec_of_vec[i],
-                ntt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_ne!(ntt_result_points, points_proj);
-
-        let mut intt_result_points = ntt_result_points.clone();
-
-        // do batch ecintt
-        iecntt_batch(&mut intt_result_points, test_size, 0);
-
-        let mut intt_result_points_vec_of_vec = Vec::new();
-
-        // do ecintt for every chunk
-        for i in 0..batches {
-            intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
-            iecntt(&mut intt_result_points_vec_of_vec[i], 0);
-        }
-
-        // check that the ecintt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_points_vec_of_vec[i],
-                intt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result_points, points_proj);
-    }
-
-    #[test]
-    fn test_scalar_interpolation() {
-        let log_test_size = 7;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
-
-        reverse_order_scalars(&mut d_evals);
-        let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
-        intt(&mut evals_mut, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_scalar_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 10;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
-
-        reverse_order_scalars_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
-        intt_batch(&mut evals_mut, test_size, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_point_interpolation() {
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
-
-        reverse_order_points(&mut d_evals);
-        let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
-        iecntt(&mut evals_mut[..], 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
-
-        reverse_order_points_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
-        iecntt_batch(&mut evals_mut[..], test_size, 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation() {
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
-        for i in coeff_size.. (1 << log_test_domain_size) {
-            assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
-        }
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation() {
-        let batch_size = 6;
-        let log_test_domain_size = 8;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation() {
-        let log_test_domain_size = 7;
-        let coeff_size = 1 << 7;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
-        for i in coeff_size..(1 << log_test_domain_size) {
-            assert_eq!(Point::zero(), h_coeffs_domain[i]);
-        }
-        for i in 0..coeff_size {
-            assert_ne!(h_coeffs_domain[i], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation() {
-        let batch_size = 4;
-        let log_test_domain_size = 6;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 5;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-            for i in j * domain_size..(j * domain_size + coeff_size) {
-                assert_ne!(h_coeffs_domain[i], Point::zero());
-            }
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_trivial_coset() {
-        // checks that the evaluations on the subgroup is the same as on the coset generated by 1
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
-        let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_coeffs: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_coeffs[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_evals_coset);
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-        for i in 0..test_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 2;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-        for i in 0..test_size * batch_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    // testing matrix multiplication by comparing the result of FFT with the naive multiplication by the DFT matrix
-    #[test]
-    fn test_matrix_multiplication() {
-        let seed = None; // some value to fix the rng
-        let test_size = 1 << 5;
-        let rou = Fr::get_root_of_unity(test_size).unwrap();
-        let matrix_flattened: Vec<Scalar> = (0..test_size).map(
-            |row_num| { (0..test_size).map( 
-                |col_num| {
-                    let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
-                    Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
-                }).collect::<Vec<Scalar>>()
-            }).flatten().collect::<Vec<_>>();
-        let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
-
-        let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
-        let mut ntt_result = vector.clone();
-        ntt(&mut ntt_result, 0);
-        
-        // we don't use the same roots of unity as arkworks, so the results are permutations
-        // of one another and the only guaranteed fixed scalars are the following ones:
-        assert_eq!(result[0], ntt_result[0]);
-        assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_scalar_mul() {
-        let mut intoo = [Scalar::one(), Scalar::one(), Scalar::zero()];
-        let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        mult_sc_vec(&mut intoo, &expected, 0);
-        assert_eq!(intoo, expected);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_point_mul() {
-        let dummy_one = Point {
-            x: Base::one(),
-            y: Base::one(),
-            z: Base::one(),
-        };
-
-        let mut inout = [dummy_one, dummy_one, Point::zero()];
-        let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        let expected = [dummy_one, Point::zero(), Point::zero()];
-        multp_vec(&mut inout, &scalars, 0);
-        assert_eq!(inout, expected);
-    }
-}
--- a/bls12-381/Cargo.toml
+++ b/bls12-381/Cargo.toml
@@ -1,34 +0,0 @@
-[package]
-name = "bls12-381"
-version = "0.1.0"
-edition = "2021"
-authors = [ "Ingonyama" ]
-
-[dependencies]
-icicle-core = { path = "../icicle-core" }
-
-hex = "*"
-ark-std = "0.3.0"
-ark-ff = "0.3.0"
-ark-poly = "0.3.0"
-ark-ec = { version = "0.3.0", features = [ "parallel" ] }
-ark-bls12-381 = "0.3.0"
-
-serde = { version = "1.0", features = ["derive"] }
-serde_derive = "1.0"
-serde_cbor = "0.11.2"
-
-rustacuda = "0.1"
-rustacuda_core = "0.1"
-rustacuda_derive = "0.1"
-
-rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
-
-[build-dependencies]
-cc = { version = "1.0", features = ["parallel"] }
-
-[dev-dependencies]
-"criterion" = "0.4.0"
-
-[features]
-g2 = []
--- a/bls12-381/build.rs
+++ b/bls12-381/build.rs
@@ -1,36 +0,0 @@
-use std::env;
-
-fn main() {
-    //TODO: check cargo features selected
-    //TODO: can conflict/duplicate with make ?
-
-    println!("cargo:rerun-if-env-changed=CXXFLAGS");
-    println!("cargo:rerun-if-changed=./icicle");
-
-    let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
-    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
-
-    let mut arch = String::from("-arch=");
-    arch.push_str(&arch_type);
-    let mut stream = String::from("-default-stream=");
-    stream.push_str(&stream_type);
-
-    let mut nvcc = cc::Build::new();
-
-    println!("Compiling icicle library using arch: {}", &arch);
-
-    if cfg!(feature = "g2") {
-        nvcc.define("G2_DEFINED", None);
-    }
-    nvcc.cuda(true);
-    nvcc.define("FEATURE_BLS12_381", None);
-    nvcc.debug(false);
-    nvcc.flag(&arch);
-    nvcc.flag(&stream);
-    nvcc.shared_flag(false);
-    // nvcc.static_flag(true);
-    nvcc.files([
-        "../icicle-cuda/curves/index.cu",
-    ]);
-    nvcc.compile("ingo_icicle"); //TODO: extension??
-}
--- a/bls12-381/src/basic_structs/field.rs
+++ b/bls12-381/src/basic_structs/field.rs
@@ -1,4 +0,0 @@
-pub trait Field<const NUM_LIMBS: usize> {
-    const MODOLUS: [u32;NUM_LIMBS];
-    const LIMBS: usize = NUM_LIMBS;
-}
--- a/bls12-381/src/basic_structs/mod.rs
+++ b/bls12-381/src/basic_structs/mod.rs
@@ -1,3 +0,0 @@
-pub mod field; 
-pub mod scalar; 
-pub mod point; 
--- a/bls12-381/src/basic_structs/point.rs
+++ b/bls12-381/src/basic_structs/point.rs
@@ -1,106 +0,0 @@
-use std::ffi::c_uint;
-
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger256, PrimeField};
-use std::mem::transmute;
-use ark_ff::Field;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-
-use super::scalar::{get_fixed_limbs, self};
-
-
-#[derive(Debug, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointT<BF: scalar::ScalarTrait> {
-    pub x: BF,
-    pub y: BF,
-    pub z: BF,
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
-    fn default() -> Self {
-        PointT::zero()
-    }
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
-    pub fn zero() -> Self {
-        PointT {
-            x: BF::zero(),
-            y: BF::one(),
-            z: BF::zero(),
-        }
-    }
-
-    pub fn infinity() -> Self {
-        Self::zero()
-    }
-}
-
-#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointAffineNoInfinityT<BF> {
-    pub x: BF,
-    pub y: BF,
-}
-
-impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
-    fn default() -> Self {
-        PointAffineNoInfinityT {
-            x: BF::zero(),
-            y: BF::zero(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
-    ///From u32 limbs x,y
-    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
-        PointAffineNoInfinityT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y)
-        }
-    }
-
-    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
-    }
-
-    pub fn to_projective(&self) -> PointT<BF> {
-        PointT {
-            x: self.x,
-            y: self.y,
-            z: BF::one(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointT<BF>  {
-    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
-        PointT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y),
-            z: BF::from_limbs(z)
-        }
-    }
-
-    pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
-        let l = value.len();
-        assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
-        PointT {
-            x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
-            y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
-            z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
-        }
-    }
-
-    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
-        PointAffineNoInfinityT {
-            x: self.x,
-            y: self.y,
-        }
-    }
-}
--- a/bls12-381/src/basic_structs/scalar.rs
+++ b/bls12-381/src/basic_structs/scalar.rs
@@ -1,102 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination};
-
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use super::field::{Field, self};
-
-pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
-    match val.len() {
-        n if n < NUM_LIMBS => {
-            let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
-            padded[..val.len()].copy_from_slice(&val);
-            padded
-        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
-        _ => panic!("slice has too many elements"),
-    }
-}
-
-pub trait ScalarTrait{
-    fn base_limbs() -> usize;
-    fn zero() -> Self;
-    fn from_limbs(value: &[u32]) -> Self;
-    fn one() -> Self;
-    fn to_bytes_le(&self) -> Vec<u8>;
-    fn limbs(&self) -> &[u32];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy)]
-#[repr(C)]
-pub struct ScalarT<M, const NUM_LIMBS: usize> {
-    pub(crate) phantom: PhantomData<M>,
-    pub(crate) value : [u32; NUM_LIMBS]
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
-where
-    M: Field<NUM_LIMBS>,
-{
-
-    fn base_limbs() -> usize {
-        return NUM_LIMBS; 
-    }
-
-    fn zero() -> Self {
-        ScalarT {
-            value: [0u32; NUM_LIMBS],
-            phantom: PhantomData,
-        }
-    }
-
-    fn from_limbs(value: &[u32]) -> Self {
-        Self {
-            value: get_fixed_limbs(value),
-            phantom: PhantomData,
-        }
-    }
-
-    fn one() -> Self {
-        let mut s = [0u32; NUM_LIMBS];
-        s[0] = 1;
-        ScalarT { value: s, phantom: PhantomData }
-    }
-
-    fn to_bytes_le(&self) -> Vec<u8> {
-        self.value
-            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
-            .flatten()
-            .collect::<Vec<_>>()
-    }
-
-    fn limbs(&self) -> &[u32] {
-        &self.value
-    }
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
-    pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-        Self::from_limbs(value)
-     }
- 
-    pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-         let mut value = value.to_vec();
-         value.reverse();
-         Self::from_limbs_le(&value)
-     }
- 
-     // Additional Functions
-     pub fn add(&self, other:ScalarT<M, NUM_LIMBS>) -> ScalarT<M,NUM_LIMBS>{  // overload + 
-         return ScalarT{value: [self.value[0] + other.value[0];NUM_LIMBS], phantom: PhantomData }; 
-     }
-}
--- a/bls12-381/src/curve_structs.rs
+++ b/bls12-381/src/curve_structs.rs
@@ -1,62 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
-use crate::basic_structs::scalar::ScalarT;
-use crate::basic_structs::field::Field;
-
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct ScalarField;
-impl Field<8> for ScalarField {
-    const MODOLUS: [u32; 8] = [0x0;8];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct BaseField;
-impl Field<12> for BaseField {
-    const MODOLUS: [u32; 12] = [0x0;12];
-}
-
-
-pub type Scalar = ScalarT<ScalarField,8>;
-impl Default for Scalar {
-    fn default() -> Self {
-        Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Scalar{}
-
-
-pub type Base = ScalarT<BaseField,12>;
-impl Default for Base {
-    fn default() -> Self {
-        Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Base{}
-
-pub type Point = PointT<Base>;
-pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
-
-extern "C" {
-    fn eq(point1: *const Point, point2: *const Point) -> c_uint;
-}
-
-impl PartialEq for Point {
-    fn eq(&self, other: &Self) -> bool {
-        unsafe { eq(self, other) != 0 }
-    }
-}
--- a/bls12-381/src/from_cuda.rs
+++ b/bls12-381/src/from_cuda.rs
@@ -1,798 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-use crate::basic_structs::scalar::ScalarTrait;
-use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bls12_381::{Fq as Fq_BLS12_381, Fr as Fr_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-extern "C" {
-    fn msm_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn msm_batch_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        batch_size: usize,
-        msm_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        batch_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
-
-    fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ntt_batch_cuda(
-        inout: *mut Scalar,
-        arr_size: usize,
-        n: usize,
-        inverse: bool,
-    ) -> c_int;
-
-    fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
-
-    fn interpolate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>, 
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_batch_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_batch_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn vec_mod_mult_point(
-        inout: *mut Point,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn vec_mod_mult_scalar(
-        inout: *mut Scalar,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn matrix_vec_mod_mult(
-        matrix_flattened: *const Scalar,
-        input: *const Scalar,
-        output: *mut Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-}
-
-pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-
-    let mut ret = Point::zero();
-    unsafe {
-        msm_cuda(
-            &mut ret as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            scalars.len(),
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn msm_batch(
-    points: &[PointAffineNoInfinity],
-    scalars: &[Scalar],
-    batch_size: usize,
-    device_id: usize,
-) -> Vec<Point> {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-
-    let mut ret = vec![Point::zero(); batch_size];
-
-    unsafe {
-        msm_batch_cuda(
-            &mut ret[0] as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            batch_size,
-            count / batch_size,
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn commit(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-) -> DeviceBox<Point> {
-    let mut res = DeviceBox::new(&Point::zero()).unwrap();
-    unsafe {
-        commit_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len(),
-            0,
-        );
-    }
-    return res;
-}
-
-pub fn commit_batch(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
-    unsafe {
-        commit_batch_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len() / batch_size,
-            batch_size,
-            0,
-        );
-    }
-    return res;
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
-    let ret_code = unsafe {
-        ntt_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    };
-    ret_code
-}
-
-pub fn ntt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, false);
-}
-
-pub fn intt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, true);
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal_batch(
-    values: &mut [Scalar],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ntt_batch_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ntt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, false);
-}
-
-pub fn intt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, true);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
-    unsafe {
-        ecntt_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    }
-}
-
-pub fn ecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, false, device_id);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, true, device_id);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal_batch(
-    values: &mut [Point],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ecntt_batch_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, false);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, true);
-}
-
-pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
-    unsafe {
-        DeviceBuffer::from_raw_parts(build_domain_cuda(
-            domain_size,
-            logn,
-            inverse,
-            0
-        ), domain_size)
-    }
-}
-
-
-pub fn reverse_order_scalars(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-) {
-    unsafe { reverse_order_scalars_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_scalars_batch(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_scalars_batch_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn reverse_order_points(
-    d_points: &mut DeviceBuffer<Point>,
-) {
-    unsafe { reverse_order_points_cuda(
-        d_points.as_device_ptr(),
-        d_points.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_points_batch(
-    d_points: &mut DeviceBuffer<Point>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_points_batch_cuda(
-        d_points.as_device_ptr(),
-        d_points.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn interpolate_scalars(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_scalars_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_scalars_batch(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_scalars_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_points_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points_batch(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_points_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn evaluate_scalars(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_point(
-            a as *mut _ as *mut Point,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_scalar(
-            a as *mut _ as *mut Scalar,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-// Multiply a matrix by a scalar:
-//  `a` - flattenned matrix;
-//  `b` - vector to multiply `a` by;
-pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
-    let mut c = Vec::with_capacity(b.len());
-    for i in 0..b.len() {
-        c.push(Scalar::zero());
-    }
-    unsafe {
-        matrix_vec_mod_mult(
-            a as *const _ as *const Scalar,
-            b as *const _ as *const Scalar,
-            c.as_mut_slice() as *mut _ as *mut Scalar,
-            b.len(),
-            device_id,
-        );
-    }
-    c
-}
-
-pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
-    let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
-    unsafe { buf_cpy.copy_from(buf) };
-    return buf_cpy;
-}
-
-pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
-    let rng: Box<dyn RngCore> = match seed {
-        Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
-        None => Box::new(rand::thread_rng()),
-    };
-    rng
-}
-
-fn set_up_device() {
-    // Set up the context, load the module, and create a stream to run kernels in.
-    rustacuda::init(CudaFlags::empty()).unwrap();
-    let device = Device::get_device(0).unwrap();
-    let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
-}
-
-pub fn generate_random_points(
-    count: usize,
-    mut rng: Box<dyn RngCore>,
-) -> Vec<PointAffineNoInfinity> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BLS12_381::rand(&mut rng)).to_xy_strip_z())
-        .collect()
-}
-
-pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BLS12_381::rand(&mut rng)))
-        .collect()
-}
-
-pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
-    (0..count)
-        .map(|_| Scalar::from_ark(Fr_BLS12_381::rand(&mut rng).into_repr()))
-        .collect()
-}
-
-pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalar 
-    let vector = generate_random_points_proj(test_size, get_rng(seed));
-    let mut vector_mut = vector.clone();
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
-pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalars
-    let mut vector_mut = generate_random_scalars(test_size, get_rng(seed));
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
--- a/bls12-381/src/lib.rs
+++ b/bls12-381/src/lib.rs
@@ -1,4 +0,0 @@
-pub mod test_bls12_381;
-pub mod basic_structs;
-pub mod from_cuda;
-pub mod curve_structs;
--- a/bls12-381/src/test_bls12_381.rs
+++ b/bls12-381/src/test_bls12_381.rs
@@ -1,816 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-pub use crate::basic_structs::scalar::ScalarTrait;
-pub use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bls12_381::{Fq as Fq_BLS12_381, Fr as Fr_BLS12_381, G1Affine as G1Affine_BLS12_381, G1Projective as G1Projective_BLS12_381};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-
-impl Scalar {
-    pub fn to_biginteger254(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_biginteger256(ark: BigInteger256) -> Self {
-        Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
-    }
-
-    pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
-        unsafe { transmute(*self) }
-    }
-
-    pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
-        Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
-    }
-
-    pub fn to_ark_transmute(&self) -> Fr_BLS12_381 {
-        unsafe { std::mem::transmute(*self) }
-    }
-
-    pub fn from_ark_transmute(v: &Fr_BLS12_381) -> Scalar {
-        unsafe { std::mem::transmute_copy(v) }
-    }
-
-    pub fn to_ark_mod_p(&self) -> Fr_BLS12_381 {
-        Fr_BLS12_381::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
-    }
-
-    pub fn to_ark_repr(&self) -> Fr_BLS12_381 {
-        Fr_BLS12_381::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
-    }
-
-    pub fn from_ark(v: BigInteger256) -> Scalar {
-        Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
-    }
-
-}
-
-impl Base {
-    pub fn to_ark(&self) -> BigInteger384 {
-        BigInteger384::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_ark(ark: BigInteger384) -> Self {
-        Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
-    }
-}
-
-
-impl Point {
-    pub fn to_ark(&self) -> G1Projective_BLS12_381 {
-        self.to_ark_affine().into_projective()
-    }
-
-    pub fn to_ark_affine(&self) -> G1Affine_BLS12_381 {
-        //TODO: generic conversion
-        use ark_ff::Field;
-        use std::ops::Mul;
-        let proj_x_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BLS12_381::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
-        let aff_x = proj_x_field.mul(inverse_z);
-        let aff_y = proj_y_field.mul(inverse_z);
-        G1Affine_BLS12_381::new(aff_x, aff_y, false)
-    }
-
-    pub fn from_ark(ark: G1Projective_BLS12_381) -> Point {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
-        let z_invsq = z_inv * z_inv;
-        let z_invq3 = z_invsq * z_inv;
-        Point {
-            x: Base::from_ark((ark.x * z_invsq).into_repr()),
-            y: Base::from_ark((ark.y * z_invq3).into_repr()),
-            z: Base::one(),
-        }
-    }
-}
-
-impl PointAffineNoInfinity {
-
-    pub fn to_ark(&self) -> G1Affine_BLS12_381 {
-        G1Affine_BLS12_381::new(Fq_BLS12_381::new(self.x.to_ark()), Fq_BLS12_381::new(self.y.to_ark()), false)
-    }
-
-    pub fn to_ark_repr(&self) -> G1Affine_BLS12_381 {
-        G1Affine_BLS12_381::new(
-            Fq_BLS12_381::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BLS12_381::from_repr(self.y.to_ark()).unwrap(),
-            false,
-        )
-    }
-
-    pub fn from_ark(p: &G1Affine_BLS12_381) -> Self {
-        PointAffineNoInfinity {
-            x: Base::from_ark(p.x.into_repr()),
-            y: Base::from_ark(p.y.into_repr()),
-        }
-    }
-}
-
-impl Point {
-    pub fn to_affine(&self) -> PointAffineNoInfinity {
-        let ark_affine = self.to_ark_affine();
-        PointAffineNoInfinity {
-            x: Base::from_ark(ark_affine.x.into_repr()),
-            y: Base::from_ark(ark_affine.y.into_repr()),
-        }
-    }
-}
-
-
-#[cfg(test)]
-pub(crate) mod tests_bls12_381 {
-    use std::ops::Add;
-    use ark_bls12_381::{Fr, G1Affine, G1Projective};
-    use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
-    use ark_ff::{FftField, Field, Zero, PrimeField};
-    use ark_std::UniformRand;
-    use rustacuda::prelude::{DeviceBuffer, CopyDestination};
-    use crate::curve_structs::{Point, Scalar, Base};
-    use crate::basic_structs::scalar::ScalarTrait;
-    use crate::from_cuda::{generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars, commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch, ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch, set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch, interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars, evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset, mult_matrix_by_vec, mult_sc_vec, multp_vec,evaluate_scalars_batch, evaluate_points_batch, evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch};
-
-    fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
-        let mut rng = ark_std::rand::thread_rng();
-        let mut points_ga: Vec<G1Projective> = Vec::new();
-        for _ in 0..nof_elements {
-            let aff = G1Projective::rand(&mut rng);
-            points_ga.push(aff);
-        }
-        points_ga
-    }
-
-    fn ecntt_arc_naive(
-        points: &Vec<G1Projective>,
-        size: usize,
-        inverse: bool,
-    ) -> Vec<G1Projective> {
-        let mut result: Vec<G1Projective> = Vec::new();
-        for _ in 0..size {
-            result.push(G1Projective::zero());
-        }
-        let rou: Fr;
-        if !inverse {
-            rou = Fr::get_root_of_unity(size).unwrap();
-        } else {
-            rou = Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap();
-        }
-        for k in 0..size {
-            for l in 0..size {
-                let pow: [u64; 1] = [(l * k).try_into().unwrap()];
-                let mul_rou = Fr::pow(&rou, &pow);
-                result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
-            }
-        }
-        if inverse {
-            let size2 = size as u64;
-            for k in 0..size {
-                let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
-                result[k] = result[k].into_affine().mul(multfactor);
-            }
-        }
-        return result;
-    }
-
-    fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
-        let mut eq = true;
-        for i in 0..points.len() {
-            if points2[i].ne(&points[i]) {
-                eq = false;
-                break;
-            }
-        }
-        return eq;
-    }
-
-    fn test_naive_ark_ecntt(size: usize) {
-        let points = random_points_ark_proj(size);
-        let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
-        let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
-        assert!(!check_eq(&result2, &result1));
-        assert!(check_eq(&result2, &points));
-    }
-
-    #[test]
-    fn test_msm() {
-        let test_sizes = [6, 9];
-
-        for pow2 in test_sizes {
-            let count = 1 << pow2;
-            let seed = None; // set Some to provide seed
-            let points = generate_random_points(count, get_rng(seed));
-            let scalars = generate_random_scalars(count, get_rng(seed));
-
-            let msm_result = msm(&points, &scalars, 0);
-
-            let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
-            let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
-
-            let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
-
-            assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
-            assert_eq!(msm_result.to_ark(), msm_result_ark);
-            assert_eq!(
-                msm_result.to_ark_affine(),
-                Point::from_ark(msm_result_ark).to_ark_affine()
-            );
-        }
-    }
-
-    #[test]
-    fn test_batch_msm() {
-        for batch_pow2 in [2, 4] {
-            for pow2 in [4, 6] {
-                let msm_size = 1 << pow2;
-                let batch_size = 1 << batch_pow2;
-                let seed = None; // set Some to provide seed
-                let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
-                let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
-
-                let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
-                let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
-
-                let expected: Vec<_> = point_r_ark
-                    .chunks(msm_size)
-                    .zip(scalars_r_ark.chunks(msm_size))
-                    .map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
-                    .collect();
-
-                let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
-
-                assert_eq!(result, expected);
-            }
-        }
-    }
-
-    #[test]
-    fn test_commit() {
-        let test_size = 1 << 8;
-        let seed = Some(0);
-        let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
-        let mut points = generate_random_points(test_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm(&points, &scalars, 0);
-        let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
-        let mut h_commit_result = Point::zero();
-        d_commit_result.copy_to(&mut h_commit_result).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        assert_ne!(msm_result, Point::zero());
-        assert_ne!(h_commit_result, Point::zero());
-    }
-
-    #[test]
-    fn test_batch_commit() {
-        let batch_size = 4;
-        let test_size = 1 << 12;
-        let seed = Some(0);
-        let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
-        let points = generate_random_points(test_size * batch_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm_batch(&points, &scalars, batch_size, 0);
-        let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
-        let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
-        d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        for h in h_commit_result {
-            assert_ne!(h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_ntt() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 3;
-
-        let scalars = generate_random_scalars(test_size, get_rng(seed));
-
-        let mut ntt_result = scalars.clone();
-        ntt(&mut ntt_result, 0);
-
-        assert_ne!(ntt_result, scalars);
-
-        let mut intt_result = ntt_result.clone();
-
-        intt(&mut intt_result, 0);
-
-        assert_eq!(intt_result, scalars);
-
-        //ECNTT
-        let points_proj = generate_random_points_proj(test_size, get_rng(seed));
-
-        test_naive_ark_ecntt(test_size);
-
-        assert!(points_proj[0].to_ark().into_affine().is_on_curve());
-
-        //naive ark
-        let points_proj_ark = points_proj
-            .iter()
-            .map(|p| p.to_ark())
-            .collect::<Vec<G1Projective>>();
-
-        let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
-
-        let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
-
-        assert_eq!(points_proj_ark, iecntt_result_naive);
-
-        //ingo gpu
-        let mut ecntt_result = points_proj.to_vec();
-        ecntt(&mut ecntt_result, 0);
-
-        assert_ne!(ecntt_result, points_proj);
-
-        let mut iecntt_result = ecntt_result.clone();
-        iecntt(&mut iecntt_result, 0);
-
-        assert_eq!(
-            iecntt_result_naive,
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-        assert_eq!(
-            iecntt_result
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>(),
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-    }
-
-    #[test]
-    fn test_ntt_batch() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 5;
-        let batches = 4;
-
-        let scalars_batch: Vec<Scalar> =
-            generate_random_scalars(test_size * batches, get_rng(seed));
-
-        let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
-
-        for i in 0..batches {
-            scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result = scalars_batch.clone();
-
-        // do batch ntt
-        ntt_batch(&mut ntt_result, test_size, 0);
-
-        let mut ntt_result_vec_of_vec = Vec::new();
-
-        // do ntt for every chunk
-        for i in 0..batches {
-            ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
-            ntt(&mut ntt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the ntt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_vec_of_vec[i],
-                ntt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        // check that ntt output is different from input
-        assert_ne!(ntt_result, scalars_batch);
-
-        let mut intt_result = ntt_result.clone();
-
-        // do batch intt
-        intt_batch(&mut intt_result, test_size, 0);
-
-        let mut intt_result_vec_of_vec = Vec::new();
-
-        // do intt for every chunk
-        for i in 0..batches {
-            intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
-            intt(&mut intt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the intt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_vec_of_vec[i],
-                intt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result, scalars_batch);
-
-        // //ECNTT
-        let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
-
-        let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
-
-        for i in 0..batches {
-            points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result_points = points_proj.clone();
-
-        // do batch ecintt
-        ecntt_batch(&mut ntt_result_points, test_size, 0);
-
-        let mut ntt_result_points_vec_of_vec = Vec::new();
-
-        for i in 0..batches {
-            ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
-            ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
-        }
-
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_points_vec_of_vec[i],
-                ntt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_ne!(ntt_result_points, points_proj);
-
-        let mut intt_result_points = ntt_result_points.clone();
-
-        // do batch ecintt
-        iecntt_batch(&mut intt_result_points, test_size, 0);
-
-        let mut intt_result_points_vec_of_vec = Vec::new();
-
-        // do ecintt for every chunk
-        for i in 0..batches {
-            intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
-            iecntt(&mut intt_result_points_vec_of_vec[i], 0);
-        }
-
-        // check that the ecintt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_points_vec_of_vec[i],
-                intt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result_points, points_proj);
-    }
-
-    #[test]
-    fn test_scalar_interpolation() {
-        let log_test_size = 7;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
-
-        reverse_order_scalars(&mut d_evals);
-        let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
-        intt(&mut evals_mut, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_scalar_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 10;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
-
-        reverse_order_scalars_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
-        intt_batch(&mut evals_mut, test_size, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_point_interpolation() {
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
-
-        reverse_order_points(&mut d_evals);
-        let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
-        iecntt(&mut evals_mut[..], 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
-
-        reverse_order_points_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
-        iecntt_batch(&mut evals_mut[..], test_size, 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation() {
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
-        for i in coeff_size.. (1 << log_test_domain_size) {
-            assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
-        }
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation() {
-        let batch_size = 6;
-        let log_test_domain_size = 8;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation() {
-        let log_test_domain_size = 7;
-        let coeff_size = 1 << 7;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
-        for i in coeff_size..(1 << log_test_domain_size) {
-            assert_eq!(Point::zero(), h_coeffs_domain[i]);
-        }
-        for i in 0..coeff_size {
-            assert_ne!(h_coeffs_domain[i], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation() {
-        let batch_size = 4;
-        let log_test_domain_size = 6;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 5;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-            for i in j * domain_size..(j * domain_size + coeff_size) {
-                assert_ne!(h_coeffs_domain[i], Point::zero());
-            }
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_trivial_coset() {
-        // checks that the evaluations on the subgroup is the same as on the coset generated by 1
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
-        let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_coeffs: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_coeffs[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_evals_coset);
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-        for i in 0..test_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 2;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-        for i in 0..test_size * batch_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    // testing matrix multiplication by comparing the result of FFT with the naive multiplication by the DFT matrix
-    #[test]
-    fn test_matrix_multiplication() {
-        let seed = None; // some value to fix the rng
-        let test_size = 1 << 5;
-        let rou = Fr::get_root_of_unity(test_size).unwrap();
-        let matrix_flattened: Vec<Scalar> = (0..test_size).map(
-            |row_num| { (0..test_size).map( 
-                |col_num| {
-                    let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
-                    Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
-                }).collect::<Vec<Scalar>>()
-            }).flatten().collect::<Vec<_>>();
-        let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
-
-        let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
-        let mut ntt_result = vector.clone();
-        ntt(&mut ntt_result, 0);
-        
-        // we don't use the same roots of unity as arkworks, so the results are permutations
-        // of one another and the only guaranteed fixed scalars are the following ones:
-        assert_eq!(result[0], ntt_result[0]);
-        assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_scalar_mul() {
-        let mut intoo = [Scalar::one(), Scalar::one(), Scalar::zero()];
-        let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        mult_sc_vec(&mut intoo, &expected, 0);
-        assert_eq!(intoo, expected);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_point_mul() {
-        let dummy_one = Point {
-            x: Base::one(),
-            y: Base::one(),
-            z: Base::one(),
-        };
-
-        let mut inout = [dummy_one, dummy_one, Point::zero()];
-        let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        let expected = [dummy_one, Point::zero(), Point::zero()];
-        multp_vec(&mut inout, &scalars, 0);
-        assert_eq!(inout, expected);
-    }
-}
--- a/bn254/Cargo.toml
+++ b/bn254/Cargo.toml
@@ -1,34 +0,0 @@
-[package]
-name = "bn254"
-version = "0.1.0"
-edition = "2021"
-authors = [ "Ingonyama" ]
-
-[dependencies]
-icicle-core = { path = "../icicle-core" }
-
-hex = "*"
-ark-std = "0.3.0"
-ark-ff = "0.3.0"
-ark-poly = "0.3.0"
-ark-ec = { version = "0.3.0", features = [ "parallel" ] }
-ark-bn254 = "0.3.0"
-
-serde = { version = "1.0", features = ["derive"] }
-serde_derive = "1.0"
-serde_cbor = "0.11.2"
-
-rustacuda = "0.1"
-rustacuda_core = "0.1"
-rustacuda_derive = "0.1"
-
-rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
-
-[build-dependencies]
-cc = { version = "1.0", features = ["parallel"] }
-
-[dev-dependencies]
-"criterion" = "0.4.0"
-
-[features]
-g2 = []
--- a/bn254/build.rs
+++ b/bn254/build.rs
@@ -1,36 +0,0 @@
-use std::env;
-
-fn main() {
-    //TODO: check cargo features selected
-    //TODO: can conflict/duplicate with make ?
-
-    println!("cargo:rerun-if-env-changed=CXXFLAGS");
-    println!("cargo:rerun-if-changed=./icicle");
-
-    let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
-    let stream_type = env::var("DEFAULT_STREAM").unwrap_or(String::from("legacy"));
-
-    let mut arch = String::from("-arch=");
-    arch.push_str(&arch_type);
-    let mut stream = String::from("-default-stream=");
-    stream.push_str(&stream_type);
-
-    let mut nvcc = cc::Build::new();
-
-    println!("Compiling icicle library using arch: {}", &arch);
-
-    if cfg!(feature = "g2") {
-        nvcc.define("G2_DEFINED", None);
-    }
-    nvcc.cuda(true);
-    nvcc.define("FEATURE_BN254", None);
-    nvcc.debug(false);
-    nvcc.flag(&arch);
-    nvcc.flag(&stream);
-    nvcc.shared_flag(false);
-    // nvcc.static_flag(true);
-    nvcc.files([
-        "../icicle-cuda/curves/index.cu",
-    ]);
-    nvcc.compile("ingo_icicle"); //TODO: extension??
-}
--- a/bn254/src/basic_structs/field.rs
+++ b/bn254/src/basic_structs/field.rs
@@ -1,4 +0,0 @@
-pub trait Field<const NUM_LIMBS: usize> {
-    const MODOLUS: [u32;NUM_LIMBS];
-    const LIMBS: usize = NUM_LIMBS;
-}
--- a/bn254/src/basic_structs/mod.rs
+++ b/bn254/src/basic_structs/mod.rs
@@ -1,3 +0,0 @@
-pub mod field; 
-pub mod scalar; 
-pub mod point; 
--- a/bn254/src/basic_structs/point.rs
+++ b/bn254/src/basic_structs/point.rs
@@ -1,108 +0,0 @@
-use std::ffi::c_uint;
-
-use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
-
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger256, PrimeField};
-use std::mem::transmute;
-use ark_ff::Field;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-
-use super::scalar::{get_fixed_limbs, self};
-
-
-#[derive(Debug, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointT<BF: scalar::ScalarTrait> {
-    pub x: BF,
-    pub y: BF,
-    pub z: BF,
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> Default for PointT<BF> {
-    fn default() -> Self {
-        PointT::zero()
-    }
-}
-
-impl<BF: DeviceCopy + scalar::ScalarTrait> PointT<BF> {
-    pub fn zero() -> Self {
-        PointT {
-            x: BF::zero(),
-            y: BF::one(),
-            z: BF::zero(),
-        }
-    }
-
-    pub fn infinity() -> Self {
-        Self::zero()
-    }
-}
-
-#[derive(Debug, PartialEq, Clone, Copy, DeviceCopy)]
-#[repr(C)]
-pub struct PointAffineNoInfinityT<BF> {
-    pub x: BF,
-    pub y: BF,
-}
-
-impl<BF: scalar::ScalarTrait> Default for PointAffineNoInfinityT<BF> {
-    fn default() -> Self {
-        PointAffineNoInfinityT {
-            x: BF::zero(),
-            y: BF::zero(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointAffineNoInfinityT<BF> {
-    ///From u32 limbs x,y
-    pub fn from_limbs(x: &[u32], y: &[u32]) -> Self {
-        PointAffineNoInfinityT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y)
-        }
-    }
-
-    pub fn limbs(&self) -> Vec<u32> {
-        [self.x.limbs(), self.y.limbs()].concat()
-    }
-
-    pub fn to_projective(&self) -> PointT<BF> {
-        PointT {
-            x: self.x,
-            y: self.y,
-            z: BF::one(),
-        }
-    }
-}
-
-impl<BF: Copy + scalar::ScalarTrait> PointT<BF>  {
-    pub fn from_limbs(x: &[u32], y: &[u32], z: &[u32]) -> Self {
-        PointT {
-            x: BF::from_limbs(x),
-            y: BF::from_limbs(y),
-            z: BF::from_limbs(z)
-        }
-    }
-
-    pub fn from_xy_limbs(value: &[u32]) -> PointT<BF> {
-        let l = value.len();
-        assert_eq!(l, 3 * BF::base_limbs(), "length must be 3 * {}", BF::base_limbs());
-        PointT {
-            x: BF::from_limbs(value[..BF::base_limbs()].try_into().unwrap()),
-            y: BF::from_limbs(value[BF::base_limbs()..BF::base_limbs() * 2].try_into().unwrap()),
-            z: BF::from_limbs(value[BF::base_limbs() * 2..].try_into().unwrap())
-        }
-    }
-
-    pub fn to_xy_strip_z(&self) -> PointAffineNoInfinityT<BF> {
-        PointAffineNoInfinityT {
-            x: self.x,
-            y: self.y,
-        }
-    }
-}
--- a/bn254/src/basic_structs/scalar.rs
+++ b/bn254/src/basic_structs/scalar.rs
@@ -1,102 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_core::DeviceCopy;
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination};
-
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use super::field::{Field, self};
-
-pub fn get_fixed_limbs<const NUM_LIMBS: usize>(val: &[u32]) -> [u32; NUM_LIMBS] {
-    match val.len() {
-        n if n < NUM_LIMBS => {
-            let mut padded: [u32; NUM_LIMBS] = [0; NUM_LIMBS];
-            padded[..val.len()].copy_from_slice(&val);
-            padded
-        }
-        n if n == NUM_LIMBS => val.try_into().unwrap(),
-        _ => panic!("slice has too many elements"),
-    }
-}
-
-pub trait ScalarTrait{
-    fn base_limbs() -> usize;
-    fn zero() -> Self;
-    fn from_limbs(value: &[u32]) -> Self;
-    fn one() -> Self;
-    fn to_bytes_le(&self) -> Vec<u8>;
-    fn limbs(&self) -> &[u32];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy)]
-#[repr(C)]
-pub struct ScalarT<M, const NUM_LIMBS: usize> {
-    pub(crate) phantom: PhantomData<M>,
-    pub(crate) value : [u32; NUM_LIMBS]
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarTrait for ScalarT<M, NUM_LIMBS>
-where
-    M: Field<NUM_LIMBS>,
-{
-
-    fn base_limbs() -> usize {
-        return NUM_LIMBS; 
-    }
-
-    fn zero() -> Self {
-        ScalarT {
-            value: [0u32; NUM_LIMBS],
-            phantom: PhantomData,
-        }
-    }
-
-    fn from_limbs(value: &[u32]) -> Self {
-        Self {
-            value: get_fixed_limbs(value),
-            phantom: PhantomData,
-        }
-    }
-
-    fn one() -> Self {
-        let mut s = [0u32; NUM_LIMBS];
-        s[0] = 1;
-        ScalarT { value: s, phantom: PhantomData }
-    }
-
-    fn to_bytes_le(&self) -> Vec<u8> {
-        self.value
-            .iter()
-            .map(|s| s.to_le_bytes().to_vec())
-            .flatten()
-            .collect::<Vec<_>>()
-    }
-
-    fn limbs(&self) -> &[u32] {
-        &self.value
-    }
-}
-
-impl<M, const NUM_LIMBS: usize> ScalarT<M, NUM_LIMBS> where M: field::Field<NUM_LIMBS>{
-    pub fn from_limbs_le(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-        Self::from_limbs(value)
-     }
- 
-    pub fn from_limbs_be(value: &[u32]) -> ScalarT<M,NUM_LIMBS> {
-         let mut value = value.to_vec();
-         value.reverse();
-         Self::from_limbs_le(&value)
-     }
- 
-     // Additional Functions
-     pub fn add(&self, other:ScalarT<M, NUM_LIMBS>) -> ScalarT<M,NUM_LIMBS>{  // overload + 
-         return ScalarT{value: [self.value[0] + other.value[0];NUM_LIMBS], phantom: PhantomData }; 
-     }
-}
--- a/bn254/src/curve_structs.rs
+++ b/bn254/src/curve_structs.rs
@@ -1,62 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda_derive::DeviceCopy;
-use std::mem::transmute;
-use rustacuda::prelude::*;
-use rustacuda_core::DevicePointer;
-use rustacuda::memory::{DeviceBox, CopyDestination, DeviceCopy};
-
-use std::marker::PhantomData;
-use std::convert::TryInto;
-
-use crate::basic_structs::point::{PointT, PointAffineNoInfinityT};
-use crate::basic_structs::scalar::ScalarT;
-use crate::basic_structs::field::Field;
-
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct ScalarField;
-impl Field<8> for ScalarField {
-    const MODOLUS: [u32; 8] = [0x0;8];
-}
-
-#[derive(Debug, PartialEq, Clone, Copy,DeviceCopy)]
-#[repr(C)]
-pub struct BaseField;
-impl Field<8> for BaseField {
-    const MODOLUS: [u32; 8] = [0x0;8];
-}
-
-
-pub type Scalar = ScalarT<ScalarField,8>;
-impl Default for Scalar {
-    fn default() -> Self {
-        Self{value: [0x0;ScalarField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Scalar{}
-
-
-pub type Base = ScalarT<BaseField,8>;
-impl Default for Base {
-    fn default() -> Self {
-        Self{value: [0x0;BaseField::LIMBS], phantom: PhantomData }
-    }
-}
-
-unsafe impl DeviceCopy for Base{}
-
-pub type Point = PointT<Base>;
-pub type PointAffineNoInfinity = PointAffineNoInfinityT<Base>;
-
-extern "C" {
-    fn eq(point1: *const Point, point2: *const Point) -> c_uint;
-}
-
-impl PartialEq for Point {
-    fn eq(&self, other: &Self) -> bool {
-        unsafe { eq(self, other) != 0 }
-    }
-}
--- a/bn254/src/from_cuda.rs
+++ b/bn254/src/from_cuda.rs
@@ -1,797 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-use crate::basic_structs::scalar::ScalarTrait;
-use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-extern "C" {
-    fn msm_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn msm_batch_cuda(
-        out: *mut Point,
-        points: *const PointAffineNoInfinity,
-        scalars: *const Scalar,
-        batch_size: usize,
-        msm_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn commit_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_scalars: DevicePointer<Scalar>,
-        d_points: DevicePointer<PointAffineNoInfinity>,
-        count: usize,
-        batch_size: usize,
-        device_id: usize,
-    ) -> c_uint;
-
-    fn build_domain_cuda(domain_size: usize, logn: usize, inverse: bool, device_id: usize) -> DevicePointer<Scalar>;
-
-    fn ntt_cuda(inout: *mut Scalar, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ecntt_cuda(inout: *mut Point, n: usize, inverse: bool, device_id: usize) -> c_int;
-
-    fn ntt_batch_cuda(
-        inout: *mut Scalar,
-        arr_size: usize,
-        n: usize,
-        inverse: bool,
-    ) -> c_int;
-
-    fn ecntt_batch_cuda(inout: *mut Point, arr_size: usize, n: usize, inverse: bool) -> c_int;
-
-    fn interpolate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>, 
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_evaluations: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn interpolate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_evaluations: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_scalars_on_coset_batch_cuda(
-        d_out: DevicePointer<Scalar>,
-        d_coefficients: DevicePointer<Scalar>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn evaluate_points_on_coset_batch_cuda(
-        d_out: DevicePointer<Point>,
-        d_coefficients: DevicePointer<Point>,
-        d_domain: DevicePointer<Scalar>,
-        domain_size: usize,
-        n: usize,
-        batch_size: usize,
-        coset_powers: DevicePointer<Scalar>,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_scalars_batch_cuda(
-        d_arr: DevicePointer<Scalar>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn reverse_order_points_batch_cuda(
-        d_arr: DevicePointer<Point>,
-        n: usize,
-        batch_size: usize,
-        device_id: usize
-    ) -> c_int;
-
-    fn vec_mod_mult_point(
-        inout: *mut Point,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn vec_mod_mult_scalar(
-        inout: *mut Scalar,
-        scalars: *const Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-
-    fn matrix_vec_mod_mult(
-        matrix_flattened: *const Scalar,
-        input: *const Scalar,
-        output: *mut Scalar,
-        n_elements: usize,
-        device_id: usize,
-    ) -> c_int;
-}
-
-pub fn msm(points: &[PointAffineNoInfinity], scalars: &[Scalar], device_id: usize) -> Point {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-    let mut ret = Point::zero();
-    unsafe {
-        msm_cuda(
-            &mut ret as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            scalars.len(),
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn msm_batch(
-    points: &[PointAffineNoInfinity],
-    scalars: &[Scalar],
-    batch_size: usize,
-    device_id: usize,
-) -> Vec<Point> {
-    let count = points.len();
-    if count != scalars.len() {
-        todo!("variable length")
-    }
-
-    let mut ret = vec![Point::zero(); batch_size];
-
-    unsafe {
-        msm_batch_cuda(
-            &mut ret[0] as *mut _ as *mut Point,
-            points as *const _ as *const PointAffineNoInfinity,
-            scalars as *const _ as *const Scalar,
-            batch_size,
-            count / batch_size,
-            device_id,
-        )
-    };
-
-    ret
-}
-
-pub fn commit(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-) -> DeviceBox<Point> {
-    let mut res = DeviceBox::new(&Point::zero()).unwrap();
-    unsafe {
-        commit_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len(),
-            0,
-        );
-    }
-    return res;
-}
-
-pub fn commit_batch(
-    points: &mut DeviceBuffer<PointAffineNoInfinity>,
-    scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(batch_size).unwrap() };
-    unsafe {
-        commit_batch_cuda(
-            res.as_device_ptr(),
-            scalars.as_device_ptr(),
-            points.as_device_ptr(),
-            scalars.len() / batch_size,
-            batch_size,
-            0,
-        );
-    }
-    return res;
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal(values: &mut [Scalar], device_id: usize, inverse: bool) -> i32 {
-    let ret_code = unsafe {
-        ntt_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    };
-    ret_code
-}
-
-pub fn ntt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, false);
-}
-
-pub fn intt(values: &mut [Scalar], device_id: usize) {
-    ntt_internal(values, device_id, true);
-}
-
-/// Compute an in-place NTT on the input data.
-fn ntt_internal_batch(
-    values: &mut [Scalar],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ntt_batch_cuda(
-            values as *mut _ as *mut Scalar,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ntt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, false);
-}
-
-pub fn intt_batch(values: &mut [Scalar], batch_size: usize, device_id: usize) {
-    ntt_internal_batch(values, 0, batch_size, true);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal(values: &mut [Point], inverse: bool, device_id: usize) -> i32 {
-    unsafe {
-        ecntt_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            inverse,
-            device_id,
-        )
-    }
-}
-
-pub fn ecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, false, device_id);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt(values: &mut [Point], device_id: usize) {
-    ecntt_internal(values, true, device_id);
-}
-
-/// Compute an in-place ECNTT on the input data.
-fn ecntt_internal_batch(
-    values: &mut [Point],
-    device_id: usize,
-    batch_size: usize,
-    inverse: bool,
-) -> i32 {
-    unsafe {
-        ecntt_batch_cuda(
-            values as *mut _ as *mut Point,
-            values.len(),
-            batch_size,
-            inverse,
-        )
-    }
-}
-
-pub fn ecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, false);
-}
-
-/// Compute an in-place iECNTT on the input data.
-pub fn iecntt_batch(values: &mut [Point], batch_size: usize, device_id: usize) {
-    ecntt_internal_batch(values, 0, batch_size, true);
-}
-
-pub fn build_domain(domain_size: usize, logn: usize, inverse: bool) -> DeviceBuffer<Scalar> {
-    unsafe {
-        DeviceBuffer::from_raw_parts(build_domain_cuda(
-            domain_size,
-            logn,
-            inverse,
-            0
-        ), domain_size)
-    }
-}
-
-
-pub fn reverse_order_scalars(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-) {
-    unsafe { reverse_order_scalars_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_scalars_batch(
-    d_scalars: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_scalars_batch_cuda(
-        d_scalars.as_device_ptr(),
-        d_scalars.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn reverse_order_points(
-    d_points: &mut DeviceBuffer<Point>,
-) {
-    unsafe { reverse_order_points_cuda(
-        d_points.as_device_ptr(),
-        d_points.len(),
-        0
-    ); }
-}
-
-pub fn reverse_order_points_batch(
-    d_points: &mut DeviceBuffer<Point>,
-    batch_size: usize,
-) {
-    unsafe { reverse_order_points_batch_cuda(
-        d_points.as_device_ptr(),
-        d_points.len() / batch_size,
-        batch_size,
-        0
-    ); }
-}
-
-pub fn interpolate_scalars(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_scalars_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_scalars_batch(
-    d_evaluations: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_scalars_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe { interpolate_points_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        0
-    ) };
-    return res;
-}
-
-pub fn interpolate_points_batch(
-    d_evaluations: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe { interpolate_points_batch_cuda(
-        res.as_device_ptr(),
-        d_evaluations.as_device_ptr(),
-        d_domain.as_device_ptr(),
-        d_domain.len(),
-        batch_size,
-        0
-    ) };
-    return res;
-}
-
-pub fn evaluate_scalars(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_scalars_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Scalar>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Scalar> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_scalars_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len()).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len(),
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn evaluate_points_on_coset_batch(
-    d_coefficients: &mut DeviceBuffer<Point>,
-    d_domain: &mut DeviceBuffer<Scalar>,
-    batch_size: usize,
-    coset_powers: &mut DeviceBuffer<Scalar>,
-) -> DeviceBuffer<Point> {
-    let mut res = unsafe { DeviceBuffer::uninitialized(d_domain.len() * batch_size).unwrap() };
-    unsafe {
-        evaluate_points_on_coset_batch_cuda(
-            res.as_device_ptr(),
-            d_coefficients.as_device_ptr(),
-            d_domain.as_device_ptr(),
-            d_domain.len(),
-            d_coefficients.len() / batch_size,
-            batch_size,
-            coset_powers.as_device_ptr(),
-            0
-        );
-    }
-    return res;
-}
-
-pub fn multp_vec(a: &mut [Point], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_point(
-            a as *mut _ as *mut Point,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-pub fn mult_sc_vec(a: &mut [Scalar], b: &[Scalar], device_id: usize) {
-    assert_eq!(a.len(), b.len());
-    unsafe {
-        vec_mod_mult_scalar(
-            a as *mut _ as *mut Scalar,
-            b as *const _ as *const Scalar,
-            a.len(),
-            device_id,
-        );
-    }
-}
-
-// Multiply a matrix by a scalar:
-//  `a` - flattenned matrix;
-//  `b` - vector to multiply `a` by;
-pub fn mult_matrix_by_vec(a: &[Scalar], b: &[Scalar], device_id: usize) -> Vec<Scalar> {
-    let mut c = Vec::with_capacity(b.len());
-    for i in 0..b.len() {
-        c.push(Scalar::zero());
-    }
-    unsafe {
-        matrix_vec_mod_mult(
-            a as *const _ as *const Scalar,
-            b as *const _ as *const Scalar,
-            c.as_mut_slice() as *mut _ as *mut Scalar,
-            b.len(),
-            device_id,
-        );
-    }
-    c
-}
-
-pub fn clone_buffer<T: DeviceCopy>(buf: &mut DeviceBuffer<T>) -> DeviceBuffer<T> {
-    let mut buf_cpy = unsafe { DeviceBuffer::uninitialized(buf.len()).unwrap() };
-    unsafe { buf_cpy.copy_from(buf) };
-    return buf_cpy;
-}
-
-pub fn get_rng(seed: Option<u64>) -> Box<dyn RngCore> {
-    let rng: Box<dyn RngCore> = match seed {
-        Some(seed) => Box::new(StdRng::seed_from_u64(seed)),
-        None => Box::new(rand::thread_rng()),
-    };
-    rng
-}
-
-fn set_up_device() {
-    // Set up the context, load the module, and create a stream to run kernels in.
-    rustacuda::init(CudaFlags::empty()).unwrap();
-    let device = Device::get_device(0).unwrap();
-    let _ctx = Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device).unwrap();
-}
-
-pub fn generate_random_points(
-    count: usize,
-    mut rng: Box<dyn RngCore>,
-) -> Vec<PointAffineNoInfinity> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z())
-        .collect()
-}
-
-pub fn generate_random_points_proj(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point> {
-    (0..count)
-        .map(|_| Point::from_ark(G1Projective_BN254::rand(&mut rng)))
-        .collect()
-}
-
-pub fn generate_random_scalars(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Scalar> {
-    (0..count)
-        .map(|_| Scalar::from_ark(Fr_BN254::rand(&mut rng).into_repr()))
-        .collect()
-}
-
-pub fn set_up_points(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Point>, DeviceBuffer<Point>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalar 
-    let vector = generate_random_points_proj(test_size, get_rng(seed));
-    let mut vector_mut = vector.clone();
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
-pub fn set_up_scalars(test_size: usize, log_domain_size: usize, inverse: bool) -> (Vec<Scalar>, DeviceBuffer<Scalar>, DeviceBuffer<Scalar>) {
-    set_up_device();
-
-    let d_domain = build_domain(1 << log_domain_size, log_domain_size, inverse);
-
-    let seed = Some(0); // fix the rng to get two equal scalars
-    let mut vector_mut = generate_random_scalars(test_size, get_rng(seed));
-
-    let mut d_vector = DeviceBuffer::from_slice(&vector_mut[..]).unwrap();
-    (vector_mut, d_vector, d_domain)
-}
-
--- a/bn254/src/lib.rs
+++ b/bn254/src/lib.rs
@@ -1,4 +0,0 @@
-pub mod test_bn254;
-pub mod basic_structs;
-pub mod from_cuda;
-pub mod curve_structs;
--- a/bn254/src/test_bn254.rs
+++ b/bn254/src/test_bn254.rs
@@ -1,816 +0,0 @@
-use std::ffi::{c_int, c_uint};
-use ark_std::UniformRand;
-use rand::{rngs::StdRng, RngCore, SeedableRng};
-use rustacuda::CudaFlags;
-use rustacuda::memory::DeviceBox;
-use rustacuda::prelude::{DeviceBuffer, Device, ContextFlags, Context};
-use rustacuda_core::DevicePointer;
-use std::mem::transmute;
-pub use crate::basic_structs::scalar::ScalarTrait;
-pub use crate::curve_structs::*;
-use icicle_core::utils::{u32_vec_to_u64_vec, u64_vec_to_u32_vec};
-use std::marker::PhantomData;
-use std::convert::TryInto;
-use ark_bn254::{Fq as Fq_BN254, Fr as Fr_BN254, G1Affine as G1Affine_BN254, G1Projective as G1Projective_BN254};
-use ark_ec::AffineCurve;
-use ark_ff::{BigInteger384, BigInteger256, PrimeField};
-use rustacuda::memory::{CopyDestination, DeviceCopy};
-
-
-impl Scalar {
-    pub fn to_biginteger254(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_biginteger256(ark: BigInteger256) -> Self {
-        Self{ value: u64_vec_to_u32_vec(&ark.0).try_into().unwrap(), phantom : PhantomData}
-    }
-
-    pub fn to_biginteger256_transmute(&self) -> BigInteger256 {
-        unsafe { transmute(*self) }
-    }
-
-    pub fn from_biginteger_transmute(v: BigInteger256) -> Scalar {
-        Scalar{ value: unsafe{ transmute(v)}, phantom : PhantomData }
-    }
-
-    pub fn to_ark_transmute(&self) -> Fr_BN254 {
-        unsafe { std::mem::transmute(*self) }
-    }
-
-    pub fn from_ark_transmute(v: &Fr_BN254) -> Scalar {
-        unsafe { std::mem::transmute_copy(v) }
-    }
-
-    pub fn to_ark_mod_p(&self) -> Fr_BN254 {
-        Fr_BN254::new(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap()))
-    }
-
-    pub fn to_ark_repr(&self) -> Fr_BN254 {
-        Fr_BN254::from_repr(BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())).unwrap()
-    }
-
-    pub fn from_ark(v: BigInteger256) -> Scalar {
-        Self { value : u64_vec_to_u32_vec(&v.0).try_into().unwrap(), phantom: PhantomData}
-    }
-
-}
-
-impl Base {
-    pub fn to_ark(&self) -> BigInteger256 {
-        BigInteger256::new(u32_vec_to_u64_vec(&self.limbs()).try_into().unwrap())
-    }
-
-    pub fn from_ark(ark: BigInteger256) -> Self {
-        Self::from_limbs(&u64_vec_to_u32_vec(&ark.0))
-    }
-}
-
-
-impl Point {
-    pub fn to_ark(&self) -> G1Projective_BN254 {
-        self.to_ark_affine().into_projective()
-    }
-
-    pub fn to_ark_affine(&self) -> G1Affine_BN254 {
-        //TODO: generic conversion
-        use ark_ff::Field;
-        use std::ops::Mul;
-        let proj_x_field = Fq_BN254::from_le_bytes_mod_order(&self.x.to_bytes_le());
-        let proj_y_field = Fq_BN254::from_le_bytes_mod_order(&self.y.to_bytes_le());
-        let proj_z_field = Fq_BN254::from_le_bytes_mod_order(&self.z.to_bytes_le());
-        let inverse_z = proj_z_field.inverse().unwrap();
-        let aff_x = proj_x_field.mul(inverse_z);
-        let aff_y = proj_y_field.mul(inverse_z);
-        G1Affine_BN254::new(aff_x, aff_y, false)
-    }
-
-    pub fn from_ark(ark: G1Projective_BN254) -> Point {
-        use ark_ff::Field;
-        let z_inv = ark.z.inverse().unwrap();
-        let z_invsq = z_inv * z_inv;
-        let z_invq3 = z_invsq * z_inv;
-        Point {
-            x: Base::from_ark((ark.x * z_invsq).into_repr()),
-            y: Base::from_ark((ark.y * z_invq3).into_repr()),
-            z: Base::one(),
-        }
-    }
-}
-
-impl PointAffineNoInfinity {
-
-    pub fn to_ark(&self) -> G1Affine_BN254 {
-        G1Affine_BN254::new(Fq_BN254::new(self.x.to_ark()), Fq_BN254::new(self.y.to_ark()), false)
-    }
-
-    pub fn to_ark_repr(&self) -> G1Affine_BN254 {
-        G1Affine_BN254::new(
-            Fq_BN254::from_repr(self.x.to_ark()).unwrap(),
-            Fq_BN254::from_repr(self.y.to_ark()).unwrap(),
-            false,
-        )
-    }
-
-    pub fn from_ark(p: &G1Affine_BN254) -> Self {
-        PointAffineNoInfinity {
-            x: Base::from_ark(p.x.into_repr()),
-            y: Base::from_ark(p.y.into_repr()),
-        }
-    }
-}
-
-impl Point {
-    pub fn to_affine(&self) -> PointAffineNoInfinity {
-        let ark_affine = self.to_ark_affine();
-        PointAffineNoInfinity {
-            x: Base::from_ark(ark_affine.x.into_repr()),
-            y: Base::from_ark(ark_affine.y.into_repr()),
-        }
-    }
-}
-
-
-#[cfg(test)]
-pub(crate) mod tests_bn254 {
-    use std::ops::Add;
-    use ark_bn254::{Fr, G1Affine, G1Projective};
-    use ark_ec::{msm::VariableBaseMSM, AffineCurve, ProjectiveCurve};
-    use ark_ff::{FftField, Field, Zero, PrimeField};
-    use ark_std::UniformRand;
-    use rustacuda::prelude::{DeviceBuffer, CopyDestination};
-    use crate::curve_structs::{Point, Scalar, Base};
-    use crate::basic_structs::scalar::ScalarTrait;
-    use crate::from_cuda::{generate_random_points, get_rng, generate_random_scalars, msm, msm_batch, set_up_scalars, commit, commit_batch, ntt, intt, generate_random_points_proj, ecntt, iecntt, ntt_batch, ecntt_batch, iecntt_batch, intt_batch, reverse_order_scalars_batch, interpolate_scalars_batch, set_up_points, reverse_order_points, interpolate_points, reverse_order_points_batch, interpolate_points_batch, evaluate_scalars, interpolate_scalars, reverse_order_scalars, evaluate_points, build_domain, evaluate_scalars_on_coset, evaluate_points_on_coset, mult_matrix_by_vec, mult_sc_vec, multp_vec,evaluate_scalars_batch, evaluate_points_batch, evaluate_scalars_on_coset_batch, evaluate_points_on_coset_batch};
-
-    fn random_points_ark_proj(nof_elements: usize) -> Vec<G1Projective> {
-        let mut rng = ark_std::rand::thread_rng();
-        let mut points_ga: Vec<G1Projective> = Vec::new();
-        for _ in 0..nof_elements {
-            let aff = G1Projective::rand(&mut rng);
-            points_ga.push(aff);
-        }
-        points_ga
-    }
-
-    fn ecntt_arc_naive(
-        points: &Vec<G1Projective>,
-        size: usize,
-        inverse: bool,
-    ) -> Vec<G1Projective> {
-        let mut result: Vec<G1Projective> = Vec::new();
-        for _ in 0..size {
-            result.push(G1Projective::zero());
-        }
-        let rou: Fr;
-        if !inverse {
-            rou = Fr::get_root_of_unity(size).unwrap();
-        } else {
-            rou = Fr::inverse(&Fr::get_root_of_unity(size).unwrap()).unwrap();
-        }
-        for k in 0..size {
-            for l in 0..size {
-                let pow: [u64; 1] = [(l * k).try_into().unwrap()];
-                let mul_rou = Fr::pow(&rou, &pow);
-                result[k] = result[k].add(points[l].into_affine().mul(mul_rou));
-            }
-        }
-        if inverse {
-            let size2 = size as u64;
-            for k in 0..size {
-                let multfactor = Fr::inverse(&Fr::from(size2)).unwrap();
-                result[k] = result[k].into_affine().mul(multfactor);
-            }
-        }
-        return result;
-    }
-
-    fn check_eq(points: &Vec<G1Projective>, points2: &Vec<G1Projective>) -> bool {
-        let mut eq = true;
-        for i in 0..points.len() {
-            if points2[i].ne(&points[i]) {
-                eq = false;
-                break;
-            }
-        }
-        return eq;
-    }
-
-    fn test_naive_ark_ecntt(size: usize) {
-        let points = random_points_ark_proj(size);
-        let result1: Vec<G1Projective> = ecntt_arc_naive(&points, size, false);
-        let result2: Vec<G1Projective> = ecntt_arc_naive(&result1, size, true);
-        assert!(!check_eq(&result2, &result1));
-        assert!(check_eq(&result2, &points));
-    }
-
-    #[test]
-    fn test_msm() {
-        let test_sizes = [6, 9];
-
-        for pow2 in test_sizes {
-            let count = 1 << pow2;
-            let seed = None; // set Some to provide seed
-            let points = generate_random_points(count, get_rng(seed));
-            let scalars = generate_random_scalars(count, get_rng(seed));
-
-            let msm_result = msm(&points, &scalars, 0);
-
-            let point_r_ark: Vec<_> = points.iter().map(|x| x.to_ark_repr()).collect();
-            let scalars_r_ark: Vec<_> = scalars.iter().map(|x| x.to_ark()).collect();
-
-            let msm_result_ark = VariableBaseMSM::multi_scalar_mul(&point_r_ark, &scalars_r_ark);
-
-            assert_eq!(msm_result.to_ark_affine(), msm_result_ark);
-            assert_eq!(msm_result.to_ark(), msm_result_ark);
-            assert_eq!(
-                msm_result.to_ark_affine(),
-                Point::from_ark(msm_result_ark).to_ark_affine()
-            );
-        }
-    }
-
-    #[test]
-    fn test_batch_msm() {
-        for batch_pow2 in [2, 4] {
-            for pow2 in [4, 6] {
-                let msm_size = 1 << pow2;
-                let batch_size = 1 << batch_pow2;
-                let seed = None; // set Some to provide seed
-                let points_batch = generate_random_points(msm_size * batch_size, get_rng(seed));
-                let scalars_batch = generate_random_scalars(msm_size * batch_size, get_rng(seed));
-
-                let point_r_ark: Vec<_> = points_batch.iter().map(|x| x.to_ark_repr()).collect();
-                let scalars_r_ark: Vec<_> = scalars_batch.iter().map(|x| x.to_ark()).collect();
-
-                let expected: Vec<_> = point_r_ark
-                    .chunks(msm_size)
-                    .zip(scalars_r_ark.chunks(msm_size))
-                    .map(|p| Point::from_ark(VariableBaseMSM::multi_scalar_mul(p.0, p.1)))
-                    .collect();
-
-                let result = msm_batch(&points_batch, &scalars_batch, batch_size, 0);
-
-                assert_eq!(result, expected);
-            }
-        }
-    }
-
-    #[test]
-    fn test_commit() {
-        let test_size = 1 << 8;
-        let seed = Some(0);
-        let (mut scalars, mut d_scalars, _) = set_up_scalars(test_size, 0, false);
-        let mut points = generate_random_points(test_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm(&points, &scalars, 0);
-        let mut d_commit_result = commit(&mut d_points, &mut d_scalars);
-        let mut h_commit_result = Point::zero();
-        d_commit_result.copy_to(&mut h_commit_result).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        assert_ne!(msm_result, Point::zero());
-        assert_ne!(h_commit_result, Point::zero());
-    }
-
-    #[test]
-    fn test_batch_commit() {
-        let batch_size = 4;
-        let test_size = 1 << 12;
-        let seed = Some(0);
-        let (scalars, mut d_scalars, _) = set_up_scalars(test_size * batch_size, 0, false);
-        let points = generate_random_points(test_size * batch_size, get_rng(seed));
-        let mut d_points = DeviceBuffer::from_slice(&points[..]).unwrap();
-
-        let msm_result = msm_batch(&points, &scalars, batch_size, 0);
-        let mut d_commit_result = commit_batch(&mut d_points, &mut d_scalars, batch_size);
-        let mut h_commit_result: Vec<Point> = (0..batch_size).map(|_| Point::zero()).collect();
-        d_commit_result.copy_to(&mut h_commit_result[..]).unwrap();
-
-        assert_eq!(msm_result, h_commit_result);
-        for h in h_commit_result {
-            assert_ne!(h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_ntt() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 3;
-
-        let scalars = generate_random_scalars(test_size, get_rng(seed));
-
-        let mut ntt_result = scalars.clone();
-        ntt(&mut ntt_result, 0);
-
-        assert_ne!(ntt_result, scalars);
-
-        let mut intt_result = ntt_result.clone();
-
-        intt(&mut intt_result, 0);
-
-        assert_eq!(intt_result, scalars);
-
-        //ECNTT
-        let points_proj = generate_random_points_proj(test_size, get_rng(seed));
-
-        test_naive_ark_ecntt(test_size);
-
-        assert!(points_proj[0].to_ark().into_affine().is_on_curve());
-
-        //naive ark
-        let points_proj_ark = points_proj
-            .iter()
-            .map(|p| p.to_ark())
-            .collect::<Vec<G1Projective>>();
-
-        let ecntt_result_naive = ecntt_arc_naive(&points_proj_ark, points_proj_ark.len(), false);
-
-        let iecntt_result_naive = ecntt_arc_naive(&ecntt_result_naive, points_proj_ark.len(), true);
-
-        assert_eq!(points_proj_ark, iecntt_result_naive);
-
-        //ingo gpu
-        let mut ecntt_result = points_proj.to_vec();
-        ecntt(&mut ecntt_result, 0);
-
-        assert_ne!(ecntt_result, points_proj);
-
-        let mut iecntt_result = ecntt_result.clone();
-        iecntt(&mut iecntt_result, 0);
-
-        assert_eq!(
-            iecntt_result_naive,
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-        assert_eq!(
-            iecntt_result
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>(),
-            points_proj
-                .iter()
-                .map(|p| p.to_ark_affine())
-                .collect::<Vec<G1Affine>>()
-        );
-    }
-
-    #[test]
-    fn test_ntt_batch() {
-        //NTT
-        let seed = None; //some value to fix the rng
-        let test_size = 1 << 5;
-        let batches = 4;
-
-        let scalars_batch: Vec<Scalar> =
-            generate_random_scalars(test_size * batches, get_rng(seed));
-
-        let mut scalar_vec_of_vec: Vec<Vec<Scalar>> = Vec::new();
-
-        for i in 0..batches {
-            scalar_vec_of_vec.push(scalars_batch[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result = scalars_batch.clone();
-
-        // do batch ntt
-        ntt_batch(&mut ntt_result, test_size, 0);
-
-        let mut ntt_result_vec_of_vec = Vec::new();
-
-        // do ntt for every chunk
-        for i in 0..batches {
-            ntt_result_vec_of_vec.push(scalar_vec_of_vec[i].clone());
-            ntt(&mut ntt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the ntt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_vec_of_vec[i],
-                ntt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        // check that ntt output is different from input
-        assert_ne!(ntt_result, scalars_batch);
-
-        let mut intt_result = ntt_result.clone();
-
-        // do batch intt
-        intt_batch(&mut intt_result, test_size, 0);
-
-        let mut intt_result_vec_of_vec = Vec::new();
-
-        // do intt for every chunk
-        for i in 0..batches {
-            intt_result_vec_of_vec.push(ntt_result_vec_of_vec[i].clone());
-            intt(&mut intt_result_vec_of_vec[i], 0);
-        }
-
-        // check that the intt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_vec_of_vec[i],
-                intt_result[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result, scalars_batch);
-
-        // //ECNTT
-        let points_proj = generate_random_points_proj(test_size * batches, get_rng(seed));
-
-        let mut points_vec_of_vec: Vec<Vec<Point>> = Vec::new();
-
-        for i in 0..batches {
-            points_vec_of_vec.push(points_proj[i * test_size..(i + 1) * test_size].to_vec());
-        }
-
-        let mut ntt_result_points = points_proj.clone();
-
-        // do batch ecintt
-        ecntt_batch(&mut ntt_result_points, test_size, 0);
-
-        let mut ntt_result_points_vec_of_vec = Vec::new();
-
-        for i in 0..batches {
-            ntt_result_points_vec_of_vec.push(points_vec_of_vec[i].clone());
-            ecntt(&mut ntt_result_points_vec_of_vec[i], 0);
-        }
-
-        for i in 0..batches {
-            assert_eq!(
-                ntt_result_points_vec_of_vec[i],
-                ntt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_ne!(ntt_result_points, points_proj);
-
-        let mut intt_result_points = ntt_result_points.clone();
-
-        // do batch ecintt
-        iecntt_batch(&mut intt_result_points, test_size, 0);
-
-        let mut intt_result_points_vec_of_vec = Vec::new();
-
-        // do ecintt for every chunk
-        for i in 0..batches {
-            intt_result_points_vec_of_vec.push(ntt_result_points_vec_of_vec[i].clone());
-            iecntt(&mut intt_result_points_vec_of_vec[i], 0);
-        }
-
-        // check that the ecintt of each vec of scalars is equal to the intt of the specific batch
-        for i in 0..batches {
-            assert_eq!(
-                intt_result_points_vec_of_vec[i],
-                intt_result_points[i * test_size..(i + 1) * test_size]
-            );
-        }
-
-        assert_eq!(intt_result_points, points_proj);
-    }
-
-    #[test]
-    fn test_scalar_interpolation() {
-        let log_test_size = 7;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size, log_test_size, true);
-
-        reverse_order_scalars(&mut d_evals);
-        let mut d_coeffs = interpolate_scalars(&mut d_evals, &mut d_domain);
-        intt(&mut evals_mut, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_scalar_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 10;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, true);
-
-        reverse_order_scalars_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size);
-        intt_batch(&mut evals_mut, test_size, 0);
-        let mut h_coeffs: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-
-        assert_eq!(h_coeffs, evals_mut);
-    }
-
-    #[test]
-    fn test_point_interpolation() {
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size, log_test_size, true);
-
-        reverse_order_points(&mut d_evals);
-        let mut d_coeffs = interpolate_points(&mut d_evals, &mut d_domain);
-        iecntt(&mut evals_mut[..], 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_interpolation() {
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (mut evals_mut, mut d_evals, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, true);
-
-        reverse_order_points_batch(&mut d_evals, batch_size);
-        let mut d_coeffs = interpolate_points_batch(&mut d_evals, &mut d_domain, batch_size);
-        iecntt_batch(&mut evals_mut[..], test_size, 0);
-        let mut h_coeffs: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs.copy_to(&mut h_coeffs[..]).unwrap();
-        
-        assert_eq!(h_coeffs, *evals_mut);
-        for h in h_coeffs.iter() {
-            assert_ne!(*h, Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation() {
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_scalars(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_coeffs_domain[..coeff_size]);
-        for i in coeff_size.. (1 << log_test_domain_size) {
-            assert_eq!(Scalar::zero(), h_coeffs_domain[i]);
-        }
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation() {
-        let batch_size = 6;
-        let log_test_domain_size = 8;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 6;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_scalars_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Scalar> = (0..domain_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..j * domain_size + coeff_size]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Scalar::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation() {
-        let log_test_domain_size = 7;
-        let coeff_size = 1 << 7;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut d_coeffs_domain = interpolate_points(&mut d_evals, &mut d_domain_inv);
-        let mut h_coeffs_domain: Vec<Point> = (0..1 << log_test_domain_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        assert_eq!(h_coeffs[..], h_coeffs_domain[..coeff_size]);
-        for i in coeff_size..(1 << log_test_domain_size) {
-            assert_eq!(Point::zero(), h_coeffs_domain[i]);
-        }
-        for i in 0..coeff_size {
-            assert_ne!(h_coeffs_domain[i], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation() {
-        let batch_size = 4;
-        let log_test_domain_size = 6;
-        let domain_size = 1 << log_test_domain_size;
-        let coeff_size = 1 << 5;
-        let (h_coeffs, mut d_coeffs, mut d_domain) = set_up_points(coeff_size * batch_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_points(0, log_test_domain_size, true);
-
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut d_coeffs_domain = interpolate_points_batch(&mut d_evals, &mut d_domain_inv, batch_size);
-        let mut h_coeffs_domain: Vec<Point> = (0..domain_size * batch_size).map(|_| Point::zero()).collect();
-        d_coeffs_domain.copy_to(&mut h_coeffs_domain[..]).unwrap();
-
-        for j in 0..batch_size {
-            assert_eq!(h_coeffs[j * coeff_size..(j + 1) * coeff_size], h_coeffs_domain[j * domain_size..(j * domain_size + coeff_size)]);
-            for i in coeff_size..domain_size {
-                assert_eq!(Point::zero(), h_coeffs_domain[j * domain_size + i]);
-            }
-            for i in j * domain_size..(j * domain_size + coeff_size) {
-                assert_ne!(h_coeffs_domain[i], Point::zero());
-            }
-        }
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_trivial_coset() {
-        // checks that the evaluations on the subgroup is the same as on the coset generated by 1
-        let log_test_domain_size = 8;
-        let coeff_size = 1 << 6;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(coeff_size, log_test_domain_size, false);
-        let (_, _, mut d_domain_inv) = set_up_scalars(coeff_size, log_test_domain_size, true);
-        let mut d_trivial_coset_powers = build_domain(1 << log_test_domain_size, 0, false);
-
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_coeffs: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_coeffs[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_trivial_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..1 << log_test_domain_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_coeffs, h_evals_coset);
-    }
-
-    #[test]
-    fn test_scalar_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-    }
-
-    #[test]
-    fn test_scalar_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 4;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_scalars(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_scalars(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_scalars_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Scalar> = (0..2 * test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_scalars_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_scalars_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Scalar> = (0..test_size * batch_size).map(|_| Scalar::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-    }
-
-    #[test]
-    fn test_point_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let log_test_size = 8;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points(&mut d_coeffs, &mut d_large_domain);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points(&mut d_coeffs, &mut d_domain);
-        let mut h_evals: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset(&mut d_coeffs, &mut d_domain, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        assert_eq!(h_evals[..], h_evals_large[..test_size]);
-        assert_eq!(h_evals_coset[..], h_evals_large[test_size..2 * test_size]);
-        for i in 0..test_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    #[test]
-    fn test_point_batch_evaluation_on_coset() {
-        // checks that evaluating a polynomial on a subgroup and its coset is the same as evaluating on a 2x larger subgroup 
-        let batch_size = 2;
-        let log_test_size = 6;
-        let test_size = 1 << log_test_size;
-        let (_, mut d_coeffs, mut d_domain) = set_up_points(test_size * batch_size, log_test_size, false);
-        let (_, _, mut d_large_domain) = set_up_points(0, log_test_size + 1, false);
-        let mut d_coset_powers = build_domain(test_size, log_test_size + 1, false);
-
-        let mut d_evals_large = evaluate_points_batch(&mut d_coeffs, &mut d_large_domain, batch_size);
-        let mut h_evals_large: Vec<Point> = (0..2 * test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_large.copy_to(&mut h_evals_large[..]).unwrap();
-        let mut d_evals = evaluate_points_batch(&mut d_coeffs, &mut d_domain, batch_size);
-        let mut h_evals: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals.copy_to(&mut h_evals[..]).unwrap();
-        let mut d_evals_coset = evaluate_points_on_coset_batch(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers);
-        let mut h_evals_coset: Vec<Point> = (0..test_size * batch_size).map(|_| Point::zero()).collect();
-        d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap();
-
-        for i in 0..batch_size {
-            assert_eq!(h_evals_large[2 * i * test_size..(2 * i + 1) * test_size], h_evals[i * test_size..(i + 1) * test_size]);
-            assert_eq!(h_evals_large[(2 * i + 1) * test_size..(2 * i + 2) * test_size], h_evals_coset[i * test_size..(i + 1) * test_size]);
-        }
-        for i in 0..test_size * batch_size {
-            assert_ne!(h_evals[i], Point::zero());
-            assert_ne!(h_evals_coset[i], Point::zero());
-            assert_ne!(h_evals_large[2 * i], Point::zero());
-            assert_ne!(h_evals_large[2 * i + 1], Point::zero());
-        }
-    }
-
-    // testing matrix multiplication by comparing the result of FFT with the naive multiplication by the DFT matrix
-    #[test]
-    fn test_matrix_multiplication() {
-        let seed = None; // some value to fix the rng
-        let test_size = 1 << 5;
-        let rou = Fr::get_root_of_unity(test_size).unwrap();
-        let matrix_flattened: Vec<Scalar> = (0..test_size).map(
-            |row_num| { (0..test_size).map( 
-                |col_num| {
-                    let pow: [u64; 1] = [(row_num * col_num).try_into().unwrap()];
-                    Scalar::from_ark(Fr::pow(&rou, &pow).into_repr())
-                }).collect::<Vec<Scalar>>()
-            }).flatten().collect::<Vec<_>>();
-        let vector: Vec<Scalar> = generate_random_scalars(test_size, get_rng(seed));
-
-        let result = mult_matrix_by_vec(&matrix_flattened, &vector, 0);
-        let mut ntt_result = vector.clone();
-        ntt(&mut ntt_result, 0);
-        
-        // we don't use the same roots of unity as arkworks, so the results are permutations
-        // of one another and the only guaranteed fixed scalars are the following ones:
-        assert_eq!(result[0], ntt_result[0]);
-        assert_eq!(result[test_size >> 1], ntt_result[test_size >> 1]);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_scalar_mul() {
-        let mut intoo = [Scalar::one(), Scalar::one(), Scalar::zero()];
-        let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        mult_sc_vec(&mut intoo, &expected, 0);
-        assert_eq!(intoo, expected);
-    }
-
-    #[test]
-    #[allow(non_snake_case)]
-    fn test_vec_point_mul() {
-        let dummy_one = Point {
-            x: Base::one(),
-            y: Base::one(),
-            z: Base::one(),
-        };
-
-        let mut inout = [dummy_one, dummy_one, Point::zero()];
-        let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
-        let expected = [dummy_one, Point::zero(), Point::zero()];
-        multp_vec(&mut inout, &scalars, 0);
-        assert_eq!(inout, expected);
-    }
-}
--- a/curve_parameters/bls12_377.json
+++ b/curve_parameters/bls12_377.json
@@ -1,13 +0,0 @@
-{
-    "curve_name" : "bls12_377",
-    "modolus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041,
-    "bit_count_p" : 253,
-    "limb_p" :  8,
-    "ntt_size" : 32,
-    "modolus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
-    "bit_count_q" : 377,
-    "limb_q" : 12,
-    "weierstrass_b" : 1,
-    "gen_x" : 81937999373150964239938255573465948239988671502647976594219695644855304257327692006745978603320413799295628339695,
-    "gen_y" : 241266749859715473739788878240585681733927191168601896383759122102112907357779751001206799952863815012735208165030
-}
--- a/curve_parameters/bls12_381.json
+++ b/curve_parameters/bls12_381.json
@@ -1,13 +0,0 @@
-{
-    "curve_name" : "bls12_381",
-    "modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
-    "bit_count_p" : 255,
-    "limb_p" :  8,
-    "ntt_size" : 32,
-    "modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
-    "bit_count_q" : 381,
-    "limb_q" : 12,
-    "weierstrass_b" : 4,
-    "gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
-    "gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
-}
--- a/curve_parameters/bn254.json
+++ b/curve_parameters/bn254.json
@@ -1,13 +0,0 @@
-{
-    "curve_name" : "bn254",
-    "modolus_p" : 21888242871839275222246405745257275088548364400416034343698204186575808495617,
-    "bit_count_p" : 254,
-    "limb_p" :  8,
-    "ntt_size" : 16,
-    "modolus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583,
-    "bit_count_q" : 254,
-    "limb_q" : 8,
-    "weierstrass_b" : 3,
-    "gen_x" : 1,
-    "gen_y" : 2
-}
--- a/curve_parameters/new_curve_script.py
+++ b/curve_parameters/new_curve_script.py
@@ -1,203 +0,0 @@
-import json
-import math
-import os
-from sympy.ntheory import isprime, primitive_root
-import subprocess
-import random 
-import sys
-
-data = None
-with open(sys.argv[1]) as json_file:
-    data = json.load(json_file)
-
-curve_name = data["curve_name"]
-modolus_p = data["modolus_p"]
-bit_count_p = data["bit_count_p"]
-limb_p =  data["limb_p"]
-ntt_size = data["ntt_size"]
-modolus_q = data["modolus_q"]
-bit_count_q = data["bit_count_q"] 
-limb_q = data["limb_q"]
-weierstrass_b = data["weierstrass_b"]
-gen_x = data["gen_x"]
-gen_y = data["gen_y"]
-
-
-def to_hex(val, length):
-    x = str(hex(val))[2:]
-    if len(x) % 8 != 0:
-        x = "0" * (8-len(x) % 8) + x
-    if len(x) != length:
-        x = "0" * (length-len(x)) + x
-    n = 8
-    chunks = [x[i:i+n] for i in range(0, len(x), n)][::-1]
-    s = ""
-    for c in chunks:
-        s += "0x" + c + ", "
-    return s
-
-
-def get_root_of_unity(order: int) -> int:
-    assert (modolus_p - 1) % order == 0
-    return pow(5, (modolus_p - 1) // order, modolus_p)
-
-def create_field_parameters_struct(modulus, modulus_bits_count,limbs,ntt,size,name):
-    s = " struct "+name+"{\n"
-    s += "   static constexpr unsigned limbs_count = " + str(limbs)+";\n"
-    s += "   static constexpr storage<limbs_count> modulus = {"+to_hex(modulus,8*limbs)[:-2]+"};\n"
-    s += "   static constexpr storage<limbs_count> modulus_2 = {"+to_hex(modulus*2,8*limbs)[:-2]+"};\n"   
-    s += "   static constexpr storage<limbs_count> modulus_4 = {"+to_hex(modulus*4,8*limbs)[:-2]+"};\n"
-    s += "   static constexpr storage<2*limbs_count> modulus_wide = {"+to_hex(modulus,8*limbs*2)[:-2]+"};\n"
-    s += "   static constexpr storage<2*limbs_count> modulus_sqared = {"+to_hex(modulus*modulus,8*limbs)[:-2]+"};\n"  
-    s += "   static constexpr storage<2*limbs_count> modulus_sqared_2 = {"+to_hex(modulus*modulus*2,8*limbs)[:-2]+"};\n"   
-    s += "   static constexpr storage<2*limbs_count> modulus_sqared_4 = {"+to_hex(modulus*modulus*2*2,8*limbs)[:-2]+"};\n"   
-    s += "   static constexpr unsigned modulus_bits_count = "+str(modulus_bits_count)+";\n"
-    m = int(math.floor(int(pow(2,2*modulus_bits_count) // modulus)))
-    s += "   static constexpr storage<limbs_count> m = {"+ to_hex(m,8*limbs)[:-2] +"};\n"
-    s += "   static constexpr storage<limbs_count> one = {"+ to_hex(1,8*limbs)[:-2] +"};\n"
-    s += "   static constexpr storage<limbs_count> zero = {"+ to_hex(0,8*limbs)[:-2] +"};\n"
-
-    if ntt:
-        for k in range(size):
-            omega = get_root_of_unity(int(pow(2,k+1)))
-            s += "   static constexpr storage<limbs_count> omega"+str(k+1)+"= {"+ to_hex(omega,8*limbs)[:-2]+"};\n"
-        for k in range(size):
-            omega = get_root_of_unity(int(pow(2,k+1)))
-            s += "   static constexpr storage<limbs_count> omega_inv"+str(k+1)+"= {"+ to_hex(pow(omega, -1, modulus),8*limbs)[:-2]+"};\n"
-        for k in range(size):
-            s += "   static constexpr storage<limbs_count> inv"+str(k+1)+"= {"+ to_hex(pow(int(pow(2,k+1)), -1, modulus),8*limbs)[:-2]+"};\n"  
-    s+=" };\n"   
-    return s
-
-def create_gen():
-    s = " struct group_generator {\n"
-    s += "  static constexpr storage<fq_config::limbs_count> generator_x = {"+to_hex(gen_x,8*limb_q)[:-2]+ "};\n"
-    s += "  static constexpr storage<fq_config::limbs_count> generator_y = {"+to_hex(gen_y,8*limb_q)[:-2]+ "};\n"
-    s+=" };\n" 
-    return s
-
-def get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b):
-    file_content = ""
-    file_content += "#pragma once\n#include \"../../utils/storage.cuh\"\n"
-    file_content += "namespace PARAMS_"+curve_name.upper()+"{\n"
-    file_content += create_field_parameters_struct(modolus_p,bit_count_p,limb_p,True,ntt_size,"fp_config")
-    file_content += create_field_parameters_struct(modolus_q,bit_count_q,limb_q,False,0,"fq_config")
-    file_content += " static constexpr unsigned weierstrass_b = " + str(weierstrass_b)+ ";\n"
-    file_content += create_gen()
-    file_content+="}\n"
-    return file_content
-
-
-# Create Cuda interface
-
-newpath = "./icicle-cuda/curves/"+curve_name 
-if not os.path.exists(newpath):
-    os.makedirs(newpath)
-
-fc = get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b)
-text_file = open("./icicle-cuda/curves/"+curve_name+"/params.cuh", "w")
-n = text_file.write(fc)
-text_file.close()
-
-with open("./icicle-cuda/curves/curve_template/lde.cu", "r") as lde_file:
-    content = lde_file.read()
-    content = content.replace("CURVE_NAME_U",curve_name.upper())
-    content = content.replace("CURVE_NAME_L",curve_name.lower())
-    text_file = open("./icicle-cuda/curves/"+curve_name+"/lde.cu", "w")
-    n = text_file.write(content)
-    text_file.close()
-    
-with open("./icicle-cuda/curves/curve_template/msm.cu", "r") as msm_file:
-    content = msm_file.read()
-    content = content.replace("CURVE_NAME_U",curve_name.upper())
-    content = content.replace("CURVE_NAME_L",curve_name.lower())
-    text_file = open("./icicle-cuda/curves/"+curve_name+"/msm.cu", "w")
-    n = text_file.write(content)
-    text_file.close()
-
-with open("./icicle-cuda/curves/curve_template/ve_mod_mult.cu", "r") as ve_mod_mult_file:
-    content = ve_mod_mult_file.read()
-    content = content.replace("CURVE_NAME_U",curve_name.upper())
-    content = content.replace("CURVE_NAME_L",curve_name.lower())
-    text_file = open("./icicle-cuda/curves/"+curve_name+"/ve_mod_mult.cu", "w")
-    n = text_file.write(content)
-    text_file.close()
-    
-
-namespace = '#include "params.cuh"\n'+'''namespace CURVE_NAME_U {
-    typedef Field<PARAMS_CURVE_NAME_U::fp_config> scalar_field_t;\
-    typedef scalar_field_t scalar_t;\
-    typedef Field<PARAMS_CURVE_NAME_U::fq_config> point_field_t;
-    typedef Projective<point_field_t, scalar_field_t, PARAMS_CURVE_NAME_U::group_generator, PARAMS_CURVE_NAME_U::weierstrass_b> projective_t;
-    typedef Affine<point_field_t> affine_t;
-}'''
-
-with open('./icicle-cuda/curves/'+curve_name+'/curve_config.cuh', 'w') as f:
-    f.write(namespace.replace("CURVE_NAME_U",curve_name.upper()))
-    
-    
-eq = '''
-#include <cuda.h>\n
-#include "curve_config.cuh"\n
-#include "../../primitives/projective.cuh"\n
-extern "C" bool eq_CURVE_NAME_L(CURVE_NAME_U::projective_t *point1, CURVE_NAME_U::projective_t *point2)
-{
-    return (*point1 == *point2);
-}'''
-
-with open('./icicle-cuda/curves/'+curve_name+'/projective.cu', 'w') as f:
-    f.write(eq.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
-
-supported_operations = '''
-#include "projective.cu"
-#include "lde.cu"
-#include "msm.cu"
-#include "ve_mod_mult.cu"
-'''
-
-with open('./icicle-cuda/curves/'+curve_name+'/supported_operations.cu', 'w') as f:
-    f.write(supported_operations.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
-    
-with open('./icicle-cuda/curves/index.cu', 'a') as f:
-    f.write('\n#include "'+curve_name.lower()+'/supported_operations.cu"')
-    
-
-
-# Create Rust interface and tests
-
-if limb_p == limb_q: 
-    with open("./src/curve_templates/curve_same_limbs.rs", "r") as curve_file:
-        content = curve_file.read()
-        content = content.replace("CURVE_NAME_U",curve_name.upper())
-        content = content.replace("CURVE_NAME_L",curve_name.lower())
-        content = content.replace("_limbs_p",str(limb_p * 8 * 4))
-        content = content.replace("limbs_p",str(limb_p))
-        text_file = open("./src/curves/"+curve_name+".rs", "w")
-        n = text_file.write(content)
-        text_file.close()
-else:
-    with open("./src/curve_templates/curve_different_limbs.rs", "r") as curve_file:
-        content = curve_file.read()
-        content = content.replace("CURVE_NAME_U",curve_name.upper())
-        content = content.replace("CURVE_NAME_L",curve_name.lower())
-        content = content.replace("_limbs_p",str(limb_p * 8 * 4))
-        content = content.replace("limbs_p",str(limb_p))
-        content = content.replace("_limbs_q",str(limb_q * 8 * 4))
-        content = content.replace("limbs_q",str(limb_q))
-        text_file = open("./src/curves/"+curve_name+".rs", "w")
-        n = text_file.write(content)
-        text_file.close()
-
-with open("./src/curve_templates/test.rs", "r") as test_file:
-    content = test_file.read()
-    content = content.replace("CURVE_NAME_U",curve_name.upper())
-    content = content.replace("CURVE_NAME_L",curve_name.lower())
-    text_file = open("./src/test_"+curve_name+".rs", "w")
-    n = text_file.write(content)
-    text_file.close()
-    
-with open('./src/curves/mod.rs', 'a') as f:
-    f.write('\n pub mod ' + curve_name + ';')
-
-with open('./src/lib.rs', 'a') as f:
-    f.write('\npub mod ' + curve_name + ';')
--- a/examples/ZKContainer.md
+++ b/examples/ZKContainer.md
@@ -0,0 +1,23 @@
+# ZKContainer
+
+We recommend running Icicle examples in a [ZKContainer](https://ingonyama.com/blog/Immanuel-ZKDC), where all the required dependencies are already preinstalled.
+To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
+
+In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`. 
+
+- File `Dockerfile` specifies how to build an image of a ZKContainer. 
+- File `devcontainer.json` enables running ZKContainer from Visual Studio Code.
+
+## Running ZKContainer from shell
+
+```sh
+docker build -t icicle-example-poseidon -f .devcontainer/Dockerfile .
+```
+
+To run the example interactively, start the container
+
+```sh
+docker run -it --rm --gpus all -v .:/icicle-example icicle-example-poseidon
+```
+
+Inside the container, run the commands for building the library for whichever [build system](../README.md#build-systems) you choose to use. 
--- a/examples/c++/msm/.devcontainer/Dockerfile
+++ b/examples/c++/msm/.devcontainer/Dockerfile
@@ -0,0 +1,25 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use the specified base image
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    curl \
+    build-essential \
+    git \
+    libboost-all-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /opt/icicle
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
+
+
+
--- a/examples/c++/msm/.devcontainer/devcontainer.json
+++ b/examples/c++/msm/.devcontainer/devcontainer.json
@@ -0,0 +1,21 @@
+{
+    "name": "Icicle Examples: msm",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-vscode.cmake-tools",
+				"ms-python.python"
+			]
+		}
+	}
+}
--- a/examples/c++/msm/CMakeLists.txt
+++ b/examples/c++/msm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Build script for the Icicle MSM example: compiles example.cu into the `example` executable.
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+
+# Search the CUDA 12.0 stub directory first (as before), then the version-independent
+# /usr/local/cuda location so that other toolkit versions are found as well.
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+# example.cu does not include <nvml.h> directly, so a missing NVML library should not abort
+# configuration: linking a NOTFOUND variable is a hard CMake error.
+if (NVML_LIBRARY)
+  target_link_libraries(example ${NVML_LIBRARY})
+else ()
+  message(WARNING "nvidia-ml (NVML) library not found; building the example without linking it")
+endif ()
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/msm/README.md
+++ b/examples/c++/msm/README.md
@@ -0,0 +1,52 @@
+# Icicle example: Multi-Scalar Multiplication (MSM)
+
+## Best-Practices
+
+We recommend running our examples in [ZK-containers](../../ZKContainer.md) to save your time and mental energy.
+
+## Key-Takeaway
+
+`Icicle` provides the CUDA C++ template function `MSM` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
+
+## Concise Usage Explanation
+
+1. Select the curve
+2. Include an MSM template
+3. Configure MSM
+4. Call the template  
+
+```c++
+#define CURVE_ID 1
+#include "appUtils/msm/msm.cu"
+...
+msm::MSMConfig config = {...};
+...
+msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, size, config, &result);
+```
+
+In this example we use `BN254` curve (`CURVE_ID=1`). The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
+
+**Parameters:**
+
+The configuration is passed to the kernel as a structure of type `msm::MSMConfig`. Some of the most important fields are listed below:
+
+- `are_scalars_on_device`, `are_points_on_device`, `are_results_on_device`: location of the data
+
+- `is_async`: blocking vs. non-blocking kernel call
+
+- `large_bucket_factor`: distinguishes between large and normal bucket sizes. If the scalar distribution is heavily skewed towards a few values, those values can be processed separately from the rest. The ideal value varies by circuit (it depends on the distribution of scalars); start with 10 and adjust to see whether performance improves.
+
+## Running the example
+
+- `cd` to your example directory
+- compile with  `./compile.sh`
+- run with `./run.sh`
+
+## What's in the example
+
+1. Define the parameters of MSM
+2. Generate random inputs on-host
+3. Configure and execute MSM using on-host data
+4. Copy inputs on-device
+5. Configure and execute MSM using on-device data
+6. Repeat the above steps for G2 points
--- a/examples/c++/msm/compile.sh
+++ b/examples/c++/msm/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -0,0 +1,180 @@
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+
+#define G2_DEFINED
+#define CURVE_ID 1
+// include MSM template
+#include "appUtils/msm/msm.cu"
+using namespace curve_config;
+
+// MSM example entry point.
+// Part I runs msm::MSM on G1 points, first with host-resident inputs and then with
+// device-resident inputs; Part II repeats both runs for G2 points using the same scalars.
+// Each run is timed with a pair of CUDA events recorded on the configuration's stream.
+// Always returns 0; CUDA return codes are not checked in this example.
+int main(int argc, char* argv[])
+{
+  std::cout << "Icicle example: Multi-Scalar Multiplication (MSM)" << std::endl;
+  std::cout << "Example parameters" << std::endl;
+  int batch_size = 1;
+  std::cout << "Batch size: " << batch_size << std::endl;
+  unsigned msm_size = 1048576;
+  std::cout << "MSM size: " << msm_size << std::endl;
+  // Total number of scalar/point pairs across the whole batch
+  int N = batch_size * msm_size;
+
+  std::cout << "Part I: use G1 points" << std::endl;
+  
+  std::cout << "Generating random inputs on-host" << std::endl;
+  scalar_t* scalars = new scalar_t[N];
+  affine_t* points = new affine_t[N];
+  projective_t result;
+  scalar_t::RandHostMany(scalars, N);
+  projective_t::RandHostManyAffine(points, N);
+
+  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
+  // auto config = msm::DefaultMSMConfig();
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  msm::MSMConfig config = {
+    ctx,   // ctx
+    0,     // points_size
+    1,     // precompute_factor
+    0,     // c
+    0,     // bitsize
+    10,    // large_bucket_factor
+    1,     // batch_size
+    false, // are_scalars_on_device
+    false, // are_scalars_montgomery_form
+    false, // are_points_on_device
+    false, // are_points_montgomery_form
+    false, // are_results_on_device
+    false, // is_big_triangle
+    false, // is_async
+  };
+  config.batch_size = batch_size;
+  
+  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
+  // Create two events to time the MSM kernel
+  cudaStream_t stream = config.ctx.stream;
+  cudaEvent_t start, stop;
+  float time;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  // Record the start event on the stream
+  cudaEventRecord(start, stream);
+  // Execute the MSM kernel
+  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, &result);
+  // Record the stop event on the stream
+  cudaEventRecord(stop, stream);
+  // Wait for the stop event to complete
+  cudaEventSynchronize(stop);
+  // Calculate the elapsed time between the start and stop events
+  cudaEventElapsedTime(&time, start, stop);
+  // Destroy the events
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  // Print the elapsed time (cudaEventElapsedTime reports milliseconds)
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  // Print the result
+  std::cout << projective_t::to_affine(result) << std::endl;
+
+  std::cout << "Copying inputs on-device" << std::endl;
+  scalar_t* scalars_d;
+  affine_t* points_d;
+  projective_t* result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
+  cudaMalloc(&points_d, sizeof(affine_t) * N);
+  cudaMalloc(&result_d, sizeof(projective_t));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
+
+  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
+  config.are_results_on_device = true;
+  config.are_scalars_on_device = true;
+  config.are_points_on_device = true;
+
+  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+  // Create two events to time the MSM kernel
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  // Record the start event on the stream
+  cudaEventRecord(start, stream);
+  // Execute the MSM kernel
+  msm::MSM<scalar_t, affine_t, projective_t>(scalars_d, points_d, msm_size, config, result_d);
+  // Record the stop event on the stream
+  cudaEventRecord(stop, stream);
+  // Wait for the stop event to complete
+  cudaEventSynchronize(stop);
+  // Calculate the elapsed time between the start and stop events
+  cudaEventElapsedTime(&time, start, stop);
+  // Destroy the events
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  // Print the elapsed time
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  // Copy the result back to the host
+  cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
+  // Print the result
+  std::cout << projective_t::to_affine(result) << std::endl;
+  // Free the device memory
+  cudaFree(scalars_d);
+  cudaFree(points_d);
+  cudaFree(result_d);
+  // Free the host memory, keep scalars for G2 example
+  delete[] points;
+
+  std::cout << "Part II: use G2 points" << std::endl;
+
+  std::cout << "Generating random inputs on-host" << std::endl;
+  // use the same scalars
+  g2_affine_t* g2_points = new g2_affine_t[N];
+  g2_projective_t::RandHostManyAffine(g2_points, N);
+
+  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
+  config.are_results_on_device = false;
+  config.are_scalars_on_device = false;
+  config.are_points_on_device = false;
+  g2_projective_t g2_result;
+
+  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
+  // Time the G2 MSM the same way as in Part I
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, stream);
+  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, g2_points, msm_size, config, &g2_result);
+  cudaEventRecord(stop, stream);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+
+  std::cout << "Copying inputs on-device" << std::endl;
+  g2_affine_t* g2_points_d;
+  g2_projective_t* g2_result_d;
+  // scalars_d was freed at the end of Part I, so allocate and upload the scalars again
+  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
+  cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
+  cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
+
+  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
+  config.are_results_on_device = true;
+  config.are_scalars_on_device = true;
+  config.are_points_on_device = true;
+
+  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, stream);
+  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars_d, g2_points_d, msm_size, config, g2_result_d);
+  cudaEventRecord(stop, stream);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&time, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  // Copy the G2 result back to the host before printing it
+  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
+
+  // Release device and host buffers used by Part II
+  cudaFree(scalars_d);
+  cudaFree(g2_points_d);
+  cudaFree(g2_result_d);
+  delete[] g2_points;
+  delete[] scalars;
+  // NOTE(review): stream comes from the default device context; confirm it is a stream
+  // this program is allowed to destroy.
+  cudaStreamDestroy(stream);
+  return 0;
+}
--- a/examples/c++/msm/run.sh
+++ b/examples/c++/msm/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/c++/multiply/.devcontainer/Dockerfile
+++ b/examples/c++/multiply/.devcontainer/Dockerfile
@@ -0,0 +1,23 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use NVIDIA base image
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    nsight-systems-12.2 \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /icicle
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
--- a/examples/c++/multiply/.devcontainer/devcontainer.json
+++ b/examples/c++/multiply/.devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "name": "Icicle Examples - Multiply",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-example,type=bind",
+    "workspaceFolder": "/icicle-example",
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-vscode.cmake-tools",
+				"ms-azuretools.vscode-docker",
+				"ms-vscode.cpptools-extension-pack"
+			]
+		}
+	}
+}
--- a/examples/c++/multiply/CMakeLists.txt
+++ b/examples/c++/multiply/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
--- a/examples/c++/multiply/README.md
+++ b/examples/c++/multiply/README.md
@@ -0,0 +1,41 @@
+# Icicle example: Multiplication
+
+## Best-Practices
+
+We recommend running our examples in [ZK-containers](../../ZKContainer.md) to save your time and mental energy.
+
+## Key-Takeaway
+
+`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm).
+
+## Concise Usage Explanation
+
+Define a `CURVE_ID` and include curve configuration header:
+
+```c++
+#define CURVE_ID 1
+#include "curves/curve_config.cuh"
+```
+
+The values of `CURVE_ID` for the different curves are listed in the above header. Multiplication is accelerated for elements of both the scalar field (`scalar_t`) and the point field (`point_field_t`).
+
+```c++
+using namespace curve_config;
+scalar_t a;
+point_field_t b;
+```
+
+## Running the example
+
+- `cd` to your example directory
+- compile with `./compile.sh`
+- run with `./run.sh`
+
+## What's in the example
+
+1. Define the parameters for the example such as vector size 
+2. Generate random vectors on-host
+3. Copy them on-device
+4. Execute element-wise vector multiplication on-device
+5. Copy results on-host
+
--- a/examples/c++/multiply/compile.sh
+++ b/examples/c++/multiply/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+#include <nvml.h>
+
+#define CURVE_ID 1
+#include "curves/curve_config.cuh"
+#include "utils/device_context.cuh"
+#include "utils/vec_ops.cu"
+
+using namespace curve_config;
+
+// select scalar or point field
+//typedef scalar_t T;
+typedef point_field_t T;
+
+// Thin wrapper around vec_ops::Mul for vectors that already reside on the device
+// (is_on_device = true) and are not in Montgomery form (is_montgomery = false).
+//   vec_b, vec_a - input vectors (forwarded to Mul as vec_a, vec_b)
+//   vec_result   - output vector
+//   n_elments    - number of elements in each vector
+//   ctx          - device context forwarded to vec_ops::Mul
+// Returns 0 on success and 1 if vec_ops::Mul reports a CUDA error; the error text
+// is written to std::cerr.
+int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
+{
+  const bool is_on_device = true;
+  const bool is_montgomery = false;
+  cudaError_t err =  vec_ops::Mul<T,T>(vec_a, vec_b, n_elments, is_on_device, is_montgomery, ctx, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    // Report the failure with a non-zero status so callers can tell it apart from success
+    return 1;
+  }
+  return 0;
+}
+
+// Vector-multiplication benchmark.
+// Queries GPU name, power and temperature through NVML, fills two host vectors with random
+// field elements, copies them to the device, runs a warm-up loop followed by a timed loop of
+// vector_mult calls, and reports throughput in Giga Multiplications Per Second.
+// Returns 0 on success and 1 if a device allocation or host-to-device copy fails.
+int main(int argc, char** argv)
+{
+  const unsigned vector_size = 1 << 15;
+  const unsigned repetitions = 1 << 15;
+
+  cudaError_t err;
+  nvmlInit();
+  nvmlDevice_t device;
+  nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
+  std::cout << "Icicle-Examples: vector multiplications" << std::endl;
+  char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+  if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
+    std::cout << "GPU Model: " << name << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU model name." << std::endl;
+  }
+  unsigned power_limit;
+  nvmlDeviceGetPowerManagementLimit(device, &power_limit);
+
+  std::cout << "Vector size: " << vector_size << std::endl;
+  std::cout << "Repetitions: " << repetitions << std::endl;
+  // Power readings are scaled by 1e-3 before being printed in W
+  std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
+
+  unsigned int baseline_power;
+  nvmlDeviceGetPowerUsage(device, &baseline_power);
+  std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
+  unsigned baseline_temperature;
+  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
+    std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU temperature." << std::endl;
+  }
+
+  // host data
+  T* host_in1 = (T*)malloc(vector_size * sizeof(T));
+  T* host_in2 = (T*)malloc(vector_size * sizeof(T));
+  std::cout << "Initializing vectors with random data" << std::endl;
+  T::RandHostMany(host_in1, vector_size);
+  T::RandHostMany(host_in2, vector_size);
+  // device data
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  T* device_in1;
+  T* device_in2;
+  T* device_out;
+
+  // Every failure below exits with a non-zero status so that scripts can detect it
+  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  // copy from host to device
+  err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+
+  err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 1;
+  }
+  
+  std::cout << "Starting warm-up" << std::endl;
+  // Warm-up loop
+  for (unsigned i = 0; i < repetitions; i++) {
+    vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
+  }
+  // Make sure no warm-up work is still queued when the timed region starts
+  cudaDeviceSynchronize();
+
+  std::cout << "Starting benchmarking" << std::endl;
+  unsigned power_before;
+  nvmlDeviceGetPowerUsage(device, &power_before);
+  std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
+  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
+            << " %" << std::endl;
+  unsigned temperature_before;
+  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
+    std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU temperature." << std::endl;
+  }
+  auto start_time = std::chrono::high_resolution_clock::now();
+  // Benchmark loop
+  for (unsigned i = 0; i < repetitions; i++) {
+    vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
+  }
+  // Wait for all queued device work before stopping the clock; if vec_ops::Mul returns
+  // before the GPU has finished, the host timer would otherwise under-report the runtime
+  cudaDeviceSynchronize();
+  auto end_time = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+  std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
+  unsigned power_after;
+  nvmlDeviceGetPowerUsage(device, &power_after);
+  std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
+  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
+            << " %" << std::endl;
+  unsigned temperature_after;
+  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
+    std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU temperature." << std::endl;
+  }
+
+  // Report performance in GMPS: Giga Multiplications Per Second
+  // (total multiplications scaled by 1e-9, divided by the elapsed time in seconds)
+  double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
+  std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
+
+  // Optional: validate multiplication
+  T* host_out = (T*)malloc(vector_size * sizeof(T));
+
+  cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+
+  // validate multiplication here...
+
+  // clean up and exit
+  free(host_in1); 
+  free(host_in2);
+  free(host_out);
+  cudaFree(device_in1);
+  cudaFree(device_in2);
+  cudaFree(device_out);
+  nvmlShutdown();
+  return 0;
+}
--- a/examples/c++/multiply/run.sh
+++ b/examples/c++/multiply/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/c++/ntt/.devcontainer/Dockerfile
+++ b/examples/c++/ntt/.devcontainer/Dockerfile
@@ -0,0 +1,25 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use the specified base image
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    curl \
+    build-essential \
+    git \
+    libboost-all-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /icicle
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
+
+
+
--- a/examples/c++/ntt/.devcontainer/devcontainer.json
+++ b/examples/c++/ntt/.devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
+{
+    "name": "Icicle Examples: ntt",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+                "ms-vscode.cmake-tools",
+                "ms-python.python",
+                "ms-vscode.cpptools"
+            ]
+		}
+	}
+}
--- a/examples/c++/ntt/CMakeLists.txt
+++ b/examples/c++/ntt/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Build script for this example: compiles example.cu into the `example` executable.
+cmake_minimum_required(VERSION 3.18)
+# Require C++17 for both the host (CXX) and CUDA compilers.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+# Before CMake 3.24 the architecture list is taken from the CUDA_ARCH variable.
+# NOTE(review): CUDA_ARCH is not set in this file; presumably it is passed with -DCUDA_ARCH=... - confirm.
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+# Extra nvcc flags for all configurations.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+# Clear the Release-specific CUDA flags; Debug builds add device debug info and disable optimization.
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+
+# Locate the NVML library (nvidia-ml), additionally searching the CUDA 12.0 stubs directory.
+# NOTE(review): example.cu in this directory makes no direct NVML calls - confirm this link is still needed.
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
--- a/examples/c++/ntt/README.md
+++ b/examples/c++/ntt/README.md
@@ -0,0 +1,32 @@
+# Icicle example: Number Theoretic Transform (NTT)
+
+## Best-Practices
+
+We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save you time and mental energy.
+
+## Key-Takeaway
+
+`Icicle` provides the CUDA C++ template function `NTT` for the [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), the finite-field analogue of the Discrete Fourier Transform.
+
+## Concise Usage Explanation
+
+```c++
+// Select the curve
+#define CURVE_ID 1
+// Include NTT template
+#include "appUtils/ntt/ntt.cu"
+using namespace curve_config;
+// Configure NTT
+ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
+// Call NTT
+ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
+```
+
+## Running the example
+
+- `cd` to your example directory
+- compile with  `./compile.sh`
+- run with `./run.sh`
+
+
+
--- a/examples/c++/ntt/compile.sh
+++ b/examples/c++/ntt/compile.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
+
+
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -0,0 +1,102 @@
+#include <chrono>
+#include <iostream>
+
+// select the curve
+#define CURVE_ID 1
+// include NTT template
+#include "appUtils/ntt/ntt.cu"
+#include "appUtils/large_ntt/kernel_ntt.cu"
+using namespace curve_config;
+
+// Operate on scalars
+typedef scalar_t S;
+typedef scalar_t E;
+
+// Prints the first n entries of elements to stdout, one "index: value" pair per line.
+void print_elements(const unsigned n, E* elements)
+{
+  unsigned idx = 0;
+  while (idx < n) {
+    std::cout << idx << ": " << elements[idx] << std::endl;
+    ++idx;
+  }
+}
+
+// Fills two NTT inputs stored back-to-back in elements:
+//   [0, ntt_size)             all ones                (lowest harmonic)
+//   [ntt_size, 2 * ntt_size)  alternating +1 and -1   (highest harmonic)
+// nof_ntts is accepted but not read; exactly two inputs are always written.
+void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
+{
+  E* lowest = elements;
+  E* highest = elements + ntt_size;
+  // Lowest harmonics: a constant signal
+  for (unsigned k = 0; k < ntt_size; ++k) {
+    lowest[k] = E::one();
+  }
+  // Highest harmonics: the sign flips on every sample
+  for (unsigned k = 0; k < ntt_size; k += 2) {
+    highest[k] = E::one();
+    highest[k + 1] = E::neg(scalar_t::one());
+  }
+}
+
+// Checks one output element per transform against the expected amplitude (ntt_size converted
+// to a field element): element 0 of the first NTT and element ntt_size / 2 of the second.
+// Prints one message per check and returns the number of failed checks.
+// nof_ntts is accepted but not read.
+int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
+{
+  E expected = E::from((uint32_t)ntt_size);
+  // Lowest harmonics
+  const bool lowest_failed = elements[0] != expected;
+  std::cout << (lowest_failed ? "Error in lowest harmonics 0! " : "Validated lowest harmonics") << std::endl;
+  // Highest harmonics
+  const bool highest_failed = elements[ntt_size + ntt_size / 2] != expected;
+  std::cout << (highest_failed ? "Error in highest harmonics! " : "Validated highest harmonics") << std::endl;
+  return (lowest_failed ? 1 : 0) + (highest_failed ? 1 : 0);
+}
+
+// Runs a batch of two forward NTTs of size 2^20 on host-resident data and validates one
+// known output element per transform.
+// Returns 0 when the NTT call succeeds and both validation checks pass, 1 otherwise.
+int main(int argc, char* argv[])
+{
+  std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
+  std::cout << "Example parameters" << std::endl;
+  const unsigned log_ntt_size = 20;
+  std::cout << "Log2(NTT size): " << log_ntt_size << std::endl;
+  const unsigned ntt_size = 1 << log_ntt_size;
+  std::cout << "NTT size: " << ntt_size << std::endl;
+  const unsigned nof_ntts = 2;
+  std::cout << "Number of NTTs: " << nof_ntts << std::endl;
+  const unsigned batch_size = nof_ntts * ntt_size;
+
+  std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
+  E* input = (E*)malloc(sizeof(E) * batch_size);
+  E* output = (E*)malloc(sizeof(E) * batch_size);
+  // Bail out before touching the buffers if either host allocation failed.
+  if (input == nullptr || output == nullptr) {
+    std::cerr << "Failed to allocate host buffers" << std::endl;
+    free(input);
+    free(output);
+    return 1;
+  }
+  initialize_input(ntt_size, nof_ntts, input);
+
+  std::cout << "Running NTT with on-host data" << std::endl;
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  // Create a device context
+  auto ctx = device_context::get_default_device_context();
+  // the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
+  S rou = S{{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1}};
+  ntt::InitDomain(rou, ctx);
+  // Create an NTTConfig instance
+  ntt::NTTConfig<S> config = ntt::DefaultNTTConfig<S>();
+  config.batch_size = nof_ntts;
+  config.ctx.stream = stream;
+  auto begin0 = std::chrono::high_resolution_clock::now();
+  cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
+  auto end0 = std::chrono::high_resolution_clock::now();
+  auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
+
+  // Only report timing and validate when the NTT call itself succeeded; otherwise the
+  // output buffer holds no meaningful data.
+  int nof_errors = 0;
+  if (err != cudaSuccess) {
+    std::cerr << "NTT failed: " << cudaGetErrorString(err) << std::endl;
+    nof_errors = 1;
+  } else {
+    printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
+    nof_errors = validate_output(ntt_size, nof_ntts, output);
+  }
+
+  cudaStreamDestroy(stream);
+  free(input);
+  free(output);
+  // Surface failures through the exit code so scripts and CI can detect them.
+  return nof_errors == 0 ? 0 : 1;
+}
--- a/examples/c++/ntt/run.sh
+++ b/examples/c++/ntt/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/c++/polynomial_multiplication/.devcontainer/Dockerfile
+++ b/examples/c++/polynomial_multiplication/.devcontainer/Dockerfile
@@ -0,0 +1,25 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use the specified base image
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    curl \
+    build-essential \
+    git \
+    libboost-all-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /icicle
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
+
+
+
--- a/examples/c++/polynomial_multiplication/.devcontainer/devcontainer.json
+++ b/examples/c++/polynomial_multiplication/.devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
+{
+    "name": "Icicle Examples: polynomial multiplication",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+    ],
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-vscode.cmake-tools",
+                "ms-python.python",
+                "ms-vscode.cpptools"
+            ]
+        }
+    }
+}
--- a/examples/c++/polynomial_multiplication/CMakeLists.txt
+++ b/examples/c++/polynomial_multiplication/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Build script for this example: compiles example.cu into the `example` executable.
+cmake_minimum_required(VERSION 3.18)
+# Require C++17 for both the host (CXX) and CUDA compilers.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+# Before CMake 3.24 the architecture list is taken from the CUDA_ARCH variable.
+# NOTE(review): CUDA_ARCH is not set in this file; presumably it is passed with -DCUDA_ARCH=... - confirm.
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+# Extra nvcc flags for all configurations.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+# Clear the Release-specific CUDA flags; Debug builds add device debug info and disable optimization.
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+
+# Locate the NVML library (nvidia-ml), additionally searching the CUDA 12.0 stubs directory.
+# NOTE(review): example.cu in this directory makes no direct NVML calls - confirm this link is still needed.
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
--- a/examples/c++/polynomial_multiplication/compile.sh
+++ b/examples/c++/polynomial_multiplication/compile.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
+
+
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -0,0 +1,114 @@
+#define CURVE_ID BLS12_381
+
+#include <chrono>
+#include <iostream>
+#include <vector>
+
+#include "curves/curve_config.cuh"
+#include "appUtils/ntt/ntt.cu"
+#include "appUtils/large_ntt/kernel_ntt.cu"
+#include "utils/vec_ops.cu"
+#include "utils/error_handler.cuh"
+#include <memory>
+
+typedef curve_config::scalar_t test_scalar;
+typedef curve_config::scalar_t test_data;
+
+// Fills res[0 .. count) with sample data: the first 1000 entries come from
+// test_data::rand_host(), and every later entry repeats the value 1000 positions earlier.
+void random_samples(test_data* res, uint32_t count)
+{
+  // The index has the same unsigned type as count, so the comparison is not mixed-sign
+  // and the index cannot overflow for large counts.
+  for (uint32_t i = 0; i < count; i++)
+    res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
+}
+
+// Fills res[0 .. count) with an arithmetic progression: res[0] is zero and each following
+// entry adds the constant step test_scalar::one() * test_scalar::omega(4).
+void incremental_values(test_scalar* res, uint32_t count)
+{
+  if (count == 0) { return; }
+  // The step does not depend on the index, so compute it once outside the loop.
+  const test_scalar step = test_scalar::one() * test_scalar::omega(4);
+  res[0] = test_scalar::zero();
+  // The index has the same unsigned type as count, so the comparison is not mixed-sign.
+  for (uint32_t i = 1; i < count; i++) {
+    res[i] = res[i - 1] + step;
+  }
+}
+
+// Calculating polynomial multiplication A*B via NTT, pointwise multiplication and INTT
+// (1) allocate A,B on CPU. Randomize first half, zero second half
+// (2) allocate NttAGpu, NttBGpu on GPU
+// (3) calc NTT for A and for B from cpu to GPU
+// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
+// (5) INTT MulGpu inplace
+
+// Benchmarks polynomial multiplication via NTT -> pointwise multiplication -> inverse NTT.
+// An optional first command-line argument selects the NTT algorithm (non-zero forces radix-2).
+// Runs one warmup pass, then prints the average time over 20 timed iterations.
+// Returns 0 on success, or the failing CUDA status code.
+int main(int argc, char** argv)
+{
+  cudaEvent_t start, stop;
+  float measured_time;
+
+  int NTT_LOG_SIZE = 23;
+  int NTT_SIZE = 1 << NTT_LOG_SIZE;
+
+  CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
+
+  // init domain
+  auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
+  ntt_config.ordering = ntt::Ordering::kNN; // TODO: use NR for forward and RN for backward
+  ntt_config.is_force_radix2 = (argc > 1) ? atoi(argv[1]) : false;
+
+  const char* ntt_alg_str = ntt_config.is_force_radix2 ? "Radix-2" : "Mixed-Radix";
+  std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";
+
+  CHK_IF_RETURN(cudaEventCreate(&start));
+  CHK_IF_RETURN(cudaEventCreate(&stop));
+
+  const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
+  ntt::InitDomain(basic_root, ntt_config.ctx);
+
+  // (1) cpu allocation
+  auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
+  auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
+  random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
+  random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros
+
+  test_data *GpuA, *GpuB, *MulGpu;
+
+  // Runs `iterations` full multiplications between the start/stop events and, when `print`
+  // is set, reports the average time per iteration. Returns the CUDA status of the run.
+  auto benchmark = [&](bool print, int iterations = 1) {
+    // start recording
+    CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));
+
+    for (int iter = 0; iter < iterations; ++iter) {
+      // (2) gpu input allocation
+      CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
+      CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
+
+      // (3) NTT for A,B from cpu to gpu
+      ntt_config.are_inputs_on_device = false;
+      ntt_config.are_outputs_on_device = true;
+      CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
+      CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
+
+      // (4) multiply A,B
+      CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
+      CHK_IF_RETURN(
+        vec_ops::Mul(GpuA, GpuB, NTT_SIZE, true /*=is_on_device*/, false /*=is_montgomery*/, ntt_config.ctx, MulGpu));
+
+      // (5) INTT (in place)
+      ntt_config.are_inputs_on_device = true;
+      ntt_config.are_outputs_on_device = true;
+      CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
+
+      CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
+      CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
+      CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
+    }
+
+    CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
+    CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
+    CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));
+
+    if (print) { std::cout << measured_time / iterations << " MS" << std::endl; }
+
+    return CHK_LAST();
+  };
+
+  // Propagate the benchmark status so a failed run is not reported as success.
+  auto warmup_status = benchmark(false); // warmup
+  CHK_IF_RETURN(warmup_status);
+  auto timed_status = benchmark(true, 20);
+  CHK_IF_RETURN(timed_status);
+
+  CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
+
+  // Release the timing events created above.
+  CHK_IF_RETURN(cudaEventDestroy(start));
+  CHK_IF_RETURN(cudaEventDestroy(stop));
+
+  return 0;
+}
--- a/examples/c++/polynomial_multiplication/run.sh
+++ b/examples/c++/polynomial_multiplication/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+./build/example 1 # radix2
+./build/example 0 # mixed-radix
--- a/examples/rust/msm/.devcontainer/Dockerfile
+++ b/examples/rust/msm/.devcontainer/Dockerfile
@@ -0,0 +1,27 @@
+# Use the specified base image
+#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    llvm \
+    clang \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Copy the content of the local directory to the working directory
+COPY . .
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
--- a/examples/rust/msm/.devcontainer/devcontainer.json
+++ b/examples/rust/msm/.devcontainer/devcontainer.json
@@ -0,0 +1,23 @@
+{
+    "name": "Icicle Examples: rust msm",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+                "ms-vscode.cmake-tools",
+                "ms-azuretools.vscode-docker",
+                "rust-lang.rust-analyzer",
+                "vadimcn.vscode-lldb"
+            ]
+		}
+	}
+}
--- a/examples/rust/msm/Cargo.toml
+++ b/examples/rust/msm/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "msm"
+version = "1.0.0"
+edition = "2018"
+
+[dependencies]
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = [ "g2" ] }
+icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
+ark-bn254 = { version = "0.4.0", optional = true}
+ark-bls12-377 = { version = "0.4.0", optional = true}
+ark-ec = { version = "0.4.0", optional = true}
+clap = { version = "4.4.12", features = ["derive"] }
+
+[features]
+arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
+profile = []
+g2 = []
--- a/examples/rust/msm/README.md
+++ b/examples/rust/msm/README.md
@@ -0,0 +1,56 @@
+# ICICLE example: MultiScalar Multiplication (MSM) in Rust
+
+`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
+
+## Best Practices
+
+To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
+
+## Usage
+
+```rust
+msm(
+  /* Scalars input vector */ &scalars, 
+  /* Points input vector */ &points, 
+  /* MSMConfig reference */ &cfg, 
+  /* Projective point result */ &mut msm_results.as_slice()
+).unwrap();
+```
+In this example we use `BN254` curve. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` uses affine coordinates, and `result` uses projective coordinates.
+
+## What's in the example
+
+1. Define the size of MSM.
+2. Generate random inputs on-host
+3. Configure MSM
+4. Execute MSM on-device
+5. Move the result to host
+
+Running the example:
+```sh
+cargo run --release
+```
+
+You can add the `--features arkworks,profile` flag to measure the run times of both ICICLE and arkworks.
+
+> [!NOTE]
+> The default sizes are 2^19 - 2^23. You can change this by passing the `--lower-bound-log-size <size> --upper-bound-log-size <size>` options (short forms `-l` and `-u`). To change the size range to 2^21 - 2^24, run the example like this:
+> ```sh
+> cargo run --release -- -l 21 -u 24
+> ```
+
+## Benchmarks
+
+These benchmarks were run on a 16 core 24 thread i9-12900k CPU and an RTX 3090 Ti GPU
+
+### Single BN254 MSM
+| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
+|--------------|------|------|------|------|------|
+| ICICLE | 10 ms | 11 ms | 21 ms | 39 ms | 77 ms |
+| Arkworks | 284 ms | 540 ms | 1,152 ms | 2,320 ms | 4,491 ms |
+
+### Single BLS12377 MSM
+| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
+|--------------|------|------|------|------|------|
+| ICICLE | 9 ms | 14 ms | 25 ms | 48 ms | 93 ms |
+| Arkworks | 490 ms | 918 ms | 1,861 ms | 3,624 ms | 7,191 ms |
--- a/examples/rust/msm/src/main.rs
+++ b/examples/rust/msm/src/main.rs
@@ -0,0 +1,192 @@
+use icicle_bn254::curve::{
+    CurveCfg,
+    ScalarCfg,
+    G1Projective,
+    G2CurveCfg,
+    G2Projective 
+};
+
+use icicle_bls12_377::curve::{
+    CurveCfg as BLS12377CurveCfg,
+    ScalarCfg as BLS12377ScalarCfg,
+    G1Projective as BLS12377G1Projective
+};
+
+use icicle_cuda_runtime::{
+    stream::CudaStream,
+    memory::HostOrDeviceSlice
+};
+
+use icicle_core::{
+    msm,
+    curve::Curve,
+    traits::GenerateRandom
+};
+
+#[cfg(feature = "arkworks")]
+use icicle_core::traits::ArkConvertible;
+
+#[cfg(feature = "arkworks")]
+use ark_bn254::{
+    G1Projective as Bn254ArkG1Projective,
+    G1Affine as Bn254G1Affine,
+    Fr as Bn254Fr
+};
+#[cfg(feature = "arkworks")]
+use ark_bls12_377::{
+    G1Projective as Bls12377ArkG1Projective,
+    G1Affine as Bls12377G1Affine,
+    Fr as Bls12377Fr
+};
+#[cfg(feature = "arkworks")]
+use ark_ec::scalar_mul::variable_base::VariableBaseMSM;
+
+#[cfg(feature = "profile")]
+use std::time::Instant;
+
+use clap::Parser;
+
+// Command-line options for the example. Both bounds are log2 of the MSM size, and the
+// size loop in `main` includes both ends of the range.
+#[derive(Parser, Debug)]
+struct Args {
+    /// Lower bound (inclusive) of MSM sizes to run for
+    #[arg(short, long, default_value_t = 19)]
+    lower_bound_log_size: u8,
+
+    /// Upper bound (inclusive) of MSM sizes to run for
+    #[arg(short, long, default_value_t = 23)]
+    upper_bound_log_size: u8,
+}
+
+// Runs a G1 and a G2 MSM on bn254 and a G1 MSM on bls12377 for every size 2^i with
+// i in lower_bound..=upper_bound, printing each result.
+// With the "arkworks" feature the G1 results are also compared against arkworks;
+// with the "profile" feature elapsed times are printed.
+fn main() {
+    let args = Args::parse();
+    let lower_bound = args.lower_bound_log_size;
+    let upper_bound = args.upper_bound_log_size;
+    println!("Running Icicle Examples: Rust MSM");
+    // Inputs are generated once at the largest size; each iteration uses a prefix of them.
+    let upper_size = 1 << (upper_bound);
+    println!("Generating random inputs on host for bn254...");
+    let upper_points = CurveCfg::generate_random_affine_points(upper_size);
+    let g2_upper_points = G2CurveCfg::generate_random_affine_points(upper_size);
+    let upper_scalars = ScalarCfg::generate_random(upper_size);
+
+    println!("Generating random inputs on host for bls12377...");
+    let upper_points_bls12377 = BLS12377CurveCfg::generate_random_affine_points(upper_size);
+    let upper_scalars_bls12377 = BLS12377ScalarCfg::generate_random(upper_size);
+
+    for i in lower_bound..=upper_bound {
+        let log_size = i;
+        let size = 1 << log_size;
+        println!("---------------------- MSM size 2^{}={} ------------------------", log_size, size);
+        // Setting Bn254 points and scalars
+        let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
+        let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
+        let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
+
+        // Setting bls12377 points and scalars
+        let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec());
+        let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
+
+        println!("Configuring bn254 MSM...");
+        let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let stream = CudaStream::create().unwrap();
+        let g2_stream = CudaStream::create().unwrap();
+        let mut cfg = msm::get_default_msm_config::<CurveCfg>();
+        let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
+        cfg.ctx.stream = &stream;
+        g2_cfg.ctx.stream = &g2_stream;
+        cfg.is_async = true;
+        g2_cfg.is_async = true;
+
+        println!("Configuring bls12377 MSM...");
+        let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let stream_bls12377 = CudaStream::create().unwrap();
+        let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
+        cfg_bls12377.ctx.stream = &stream_bls12377;
+        cfg_bls12377.is_async = true;
+
+        // NOTE(review): is_async is set on every config and the streams are only synchronized
+        // further down, so the "profile" timings below may not cover the full device run - confirm.
+        println!("Executing bn254 MSM on device...");
+        #[cfg(feature = "profile")]
+        let start = Instant::now();
+        msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+        #[cfg(feature = "profile")]
+        println!("ICICLE BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+        msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
+
+        println!("Executing bls12377 MSM on device...");
+        #[cfg(feature = "profile")]
+        let start = Instant::now();
+        msm::msm(&scalars_bls12377, &points_bls12377, &cfg_bls12377, &mut msm_results_bls12377).unwrap();
+        #[cfg(feature = "profile")]
+        println!("ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+
+        println!("Moving results to host..");
+        let mut msm_host_result = vec![G1Projective::zero(); 1];
+        let mut g2_msm_host_result = vec![G2Projective::zero(); 1];
+        let mut msm_host_result_bls12377 = vec![BLS12377G1Projective::zero(); 1];
+
+        // Wait for each stream before copying its result back to the host.
+        stream
+            .synchronize()
+            .unwrap();
+        g2_stream
+            .synchronize()
+            .unwrap();
+        msm_results
+            .copy_to_host(&mut msm_host_result[..])
+            .unwrap();
+        g2_msm_results
+            .copy_to_host(&mut g2_msm_host_result[..])
+            .unwrap();
+        println!("bn254 result: {:#?}", msm_host_result);
+        println!("G2 bn254 result: {:#?}", g2_msm_host_result);
+
+        stream_bls12377
+            .synchronize()
+            .unwrap();
+        msm_results_bls12377
+            .copy_to_host(&mut msm_host_result_bls12377[..])
+            .unwrap();
+        println!("bls12377 result: {:#?}", msm_host_result_bls12377);
+
+        #[cfg(feature = "arkworks")]
+        {
+            println!("Checking against arkworks...");
+            let ark_points: Vec<Bn254G1Affine> = points.as_slice().iter().map(|&point| point.to_ark()).collect();
+            let ark_scalars: Vec<Bn254Fr> = scalars.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
+
+            let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377.as_slice().iter().map(|point| point.to_ark()).collect();
+            let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
+
+            #[cfg(feature = "profile")]
+            let start = Instant::now();
+            let bn254_ark_msm_res = Bn254ArkG1Projective::msm(&ark_points, &ark_scalars).unwrap();
+            println!("Arkworks Bn254 result: {:#?}", bn254_ark_msm_res);
+            #[cfg(feature = "profile")]
+            println!("Ark BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+
+            #[cfg(feature = "profile")]
+            let start = Instant::now();
+            let bls12377_ark_msm_res = Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
+            println!("Arkworks Bls12377 result: {:#?}", bls12377_ark_msm_res);
+            #[cfg(feature = "profile")]
+            println!("Ark BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+
+            let bn254_icicle_msm_res_as_ark = msm_host_result[0].to_ark();
+            let bls12377_icicle_msm_res_as_ark = msm_host_result_bls12377[0].to_ark();
+
+            println!("Bn254 MSM is correct: {}", bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark));
+            println!("Bls12377 MSM is correct: {}", bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark));
+        }
+
+        println!("Cleaning up bn254...");
+        stream
+            .destroy()
+            .unwrap();
+        // The G2 stream is created alongside `stream`; release it explicitly as well.
+        g2_stream
+            .destroy()
+            .unwrap();
+        println!("Cleaning up bls12377...");
+        stream_bls12377
+            .destroy()
+            .unwrap();
+        println!();
+    }
+}
--- a/examples/rust/ntt/.devcontainer/Dockerfile
+++ b/examples/rust/ntt/.devcontainer/Dockerfile
@@ -0,0 +1,27 @@
+# Use the specified base image
+#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    llvm \
+    clang \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Set the working directory in the container
+WORKDIR /icicle-example
+
+# Copy the content of the local directory to the working directory
+COPY . .
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
--- a/examples/rust/ntt/.devcontainer/devcontainer.json
+++ b/examples/rust/ntt/.devcontainer/devcontainer.json
@@ -0,0 +1,23 @@
+{
+    "name": "Icicle Examples: rust ntt",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+                "ms-vscode.cmake-tools",
+                "ms-azuretools.vscode-docker",
+                "rust-lang.rust-analyzer",
+                "vadimcn.vscode-lldb"
+            ]
+		}
+	}
+}
--- a/examples/rust/ntt/Cargo.toml
+++ b/examples/rust/ntt/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "ntt"
+version = "1.0.0"
+edition = "2018"
+
+[dependencies]
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
+icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
+
+ark-ff = { version = "0.4.0" }
+ark-poly = "0.4.0"
+ark-std = "0.4.0"
+ark-bn254 = { version = "0.4.0" }
+ark-bls12-377 = { version = "0.4.0" }
+clap = { version = "4.4.12", features = ["derive"] }
+
+[features]
+profile = []
--- a/examples/rust/ntt/README.md
+++ b/examples/rust/ntt/README.md
@@ -0,0 +1,65 @@
+# ICICLE example: Number Theoretic Transform (NTT) in Rust
+
+## Key-Takeaway
+
+`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
+
+## Best Practices
+
+To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
+
+## Usage
+
+```rust
+ntt::ntt(
+  /* input slice */ scalars.as_slice(),
+  /* NTT Direction */ ntt::NTTDir::kForward,
+  /* NTT Configuration */ &cfg,
+  /* output slice */ ntt_results.as_slice()
+).unwrap();
+```
+
+In this example we use the `BN254` and `BLS12377` fields.
+
+## What's in this example
+
+1. Define the size of NTT.
+2. Generate random inputs on-host
+3. Set up the domain.
+4. Configure NTT
+5. Execute NTT on-device
+6. Move the result to host
+7. Compare results with arkworks
+
+Running the example:
+
+```sh
+cargo run --release
+```
+
+You can add the `--features profile` flag to measure the run times of both ICICLE and arkworks.
+
+> [!NOTE]
+> The default size is 2^20. You can change this by passing the `--size <size>` option. To change the size to 2^23, run the example like this:
+
+```sh
+cargo run --release -- -s 23
+```
+
+## Benchmarks
+
+These benchmarks were run on a 16 core 24 thread i9-12900k CPU and an RTX 3090 Ti GPU
+
+### Single BN254 NTT
+
+| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
+|--------------|------|------|------|------|------|
+| ICICLE | 1.263 ms | 2.986 ms | 4.651 ms | 9.308 ms | 18.618 ms |
+| Arkworks | 138 ms | 290 ms | 611 ms | 1,295 ms | 2,715 ms |
+
+### Single BLS12377 NTT
+
+| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
+|--------------|------|------|------|------|------|
+| ICICLE | 1.272 ms | 2.893 ms | 4.728 ms | 9.211 ms | 18.319 ms |
+| Arkworks | 135 ms | 286 ms | 605 ms | 1,279 ms | 2,682 ms |
--- a/examples/rust/ntt/src/main.rs
+++ b/examples/rust/ntt/src/main.rs
@@ -0,0 +1,157 @@
+use icicle_bn254::curve::{
+    ScalarCfg,
+    ScalarField,
+};
+
+use icicle_bls12_377::curve::{
+    ScalarCfg as BLS12377ScalarCfg,
+    ScalarField as BLS12377ScalarField
+};
+
+use icicle_cuda_runtime::{
+    stream::CudaStream,
+    memory::HostOrDeviceSlice,
+    device_context::get_default_device_context
+};
+
+use icicle_core::{
+    ntt::{self, NTT},
+    traits::{GenerateRandom, FieldImpl}
+};
+
+use icicle_core::traits::ArkConvertible;
+
+use ark_bn254::Fr as Bn254Fr;
+use ark_bls12_377::Fr as Bls12377Fr;
+use ark_ff::FftField;
+use ark_poly::{EvaluationDomain, Radix2EvaluationDomain};
+use ark_std::cmp::{Ord, Ordering};
+use std::convert::TryInto;
+
+#[cfg(feature = "profile")]
+use std::time::Instant;
+
+use clap::Parser;
+
+// Command-line arguments of the example, parsed by clap's derive API.
+// NOTE: `///` doc comments on this struct or its fields are picked up by clap
+// as help text, so explanatory notes here deliberately use plain `//`.
+#[derive(Parser, Debug)]
+struct Args {
+    /// Size of NTT to run (20 for 2^20)
+    // Exposed as `-s` / `--size`; holds log2 of the NTT length and defaults to 20.
+    #[arg(short, long, default_value_t = 20)]
+    size: u8,
+}
+
+// Runs a forward NTT of size 2^`--size` over the BN254 and BLS12-377 scalar
+// fields on the device and checks both results against arkworks' radix-2 FFT.
+//
+// Both NTTs are launched asynchronously, each on its own CUDA stream. When the
+// `profile` feature is enabled the corresponding stream is synchronized before
+// the timer is read, so the reported time covers the actual execution instead
+// of only the asynchronous launch.
+fn main() {
+    let args = Args::parse();
+    println!("Running Icicle Examples: Rust NTT");
+    let log_size = args.size;
+    let size = 1 << log_size;
+    println!("---------------------- NTT size 2^{}={} ------------------------", log_size, size);
+    // Setting Bn254 points and scalars
+    println!("Generating random inputs on host for bn254...");
+    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
+    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
+
+    // Setting bls12377 points and scalars
+    println!("Generating random inputs on host for bls12377...");
+    let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
+    let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
+
+    println!("Setting up bn254 Domain...");
+    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
+    let ctx = get_default_device_context();
+    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+
+    println!("Configuring bn254 NTT...");
+    let stream = CudaStream::create().unwrap();
+    let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
+    cfg.ctx.stream = &stream;
+    cfg.is_async = true;
+
+    println!("Setting up bls12377 Domain...");
+    let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
+    // reusing ctx from above
+    BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+
+    println!("Configuring bls12377 NTT...");
+    let stream_bls12377 = CudaStream::create().unwrap();
+    let mut cfg_bls12377 = ntt::get_default_ntt_config::<BLS12377ScalarField>();
+    cfg_bls12377.ctx.stream = &stream_bls12377;
+    cfg_bls12377.is_async = true;
+
+    println!("Executing bn254 NTT on device...");
+    #[cfg(feature = "profile")]
+    let start = Instant::now();
+    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
+    // The call above is asynchronous (`is_async = true`); wait for the stream
+    // so that the measurement includes the NTT itself, not just its launch.
+    #[cfg(feature = "profile")]
+    stream.synchronize().unwrap();
+    #[cfg(feature = "profile")]
+    println!("ICICLE BN254 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
+
+    println!("Executing bls12377 NTT on device...");
+    #[cfg(feature = "profile")]
+    let start = Instant::now();
+    ntt::ntt(&scalars_bls12377, ntt::NTTDir::kForward, &cfg_bls12377, &mut ntt_results_bls12377).unwrap();
+    // Same reasoning as for bn254: synchronize before reading the timer.
+    #[cfg(feature = "profile")]
+    stream_bls12377.synchronize().unwrap();
+    #[cfg(feature = "profile")]
+    println!("ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
+
+    println!("Moving results to host..");
+    // The results may only be copied once the asynchronous work has finished.
+    stream
+        .synchronize()
+        .unwrap();
+    let mut host_bn254_results = vec![ScalarField::zero(); size];
+    ntt_results
+        .copy_to_host(&mut host_bn254_results[..])
+        .unwrap();
+
+    stream_bls12377
+        .synchronize()
+        .unwrap();
+    let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
+    ntt_results_bls12377
+        .copy_to_host(&mut host_bls12377_results[..])
+        .unwrap();
+
+    println!("Checking against arkworks...");
+    let mut ark_scalars: Vec<Bn254Fr> = scalars.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
+    let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
+
+    let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
+    let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
+
+    #[cfg(feature = "profile")]
+    let start = Instant::now();
+    bn254_domain.fft_in_place(&mut ark_scalars);
+    #[cfg(feature = "profile")]
+    println!("Ark BN254 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+
+    #[cfg(feature = "profile")]
+    let start = Instant::now();
+    bls12_377_domain.fft_in_place(&mut ark_scalars_bls12377);
+    #[cfg(feature = "profile")]
+    println!("Ark BLS12377 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
+
+    // Element-wise comparison of the device results with the arkworks reference.
+    host_bn254_results
+        .iter()
+        .zip(ark_scalars.iter())
+        .for_each(|(icicle_scalar, &ark_scalar)| {
+            assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
+        });
+    println!("Bn254 NTT is correct");
+
+    host_bls12377_results
+        .iter()
+        .zip(ark_scalars_bls12377.iter())
+        .for_each(|(icicle_scalar, &ark_scalar)| {
+            assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
+        });
+
+    println!("Bls12377 NTT is correct");
+
+    println!("Cleaning up bn254...");
+    stream
+        .destroy()
+        .unwrap();
+    println!("Cleaning up bls12377...");
+    stream_bls12377
+        .destroy()
+        .unwrap();
+    println!();
+}
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,17 @@
+module github.com/ingonyama-zk/icicle
+
+go 1.20
+
+require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/kr/pretty v0.1.0 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
+	gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
+
+require (
+	github.com/consensys/bavard v0.1.13
+	github.com/stretchr/testify v1.8.3
+	rsc.io/tmplfunc v0.0.3 // indirect
+)
--- a/go.sum
+++ b/go.sum
@@ -0,0 +1,20 @@
+github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ=
+github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
+github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU=
+rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA=
--- a/goicicle/Makefile
+++ b/goicicle/Makefile
@@ -0,0 +1,34 @@
+# Builds one CUDA shared library per supported curve for the Go bindings.
+CUDA_ROOT_DIR = /usr/local/cuda
+NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
+CFLAGS = -Xcompiler -fPIC -std=c++17
+LDFLAGS = -shared
+FEATURES = -DG2_DEFINED
+
+TARGET_BN254 = libbn254.so
+TARGET_BW6761 = libbw6761.so
+TARGET_BLS12_381 = libbls12_381.so
+TARGET_BLS12_377 = libbls12_377.so
+
+VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761
+
+SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu
+SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu
+SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu
+SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu
+
+# `all` and `clean` do not produce files of those names; declaring them phony
+# keeps them working even if such files ever appear in this directory.
+.PHONY: all clean
+
+all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
+
+# Each library lists its sources as prerequisites so it is rebuilt when one of
+# them changes; `$^` expands to exactly those prefixed source paths.
+$(TARGET_BN254): $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254))
+	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+$(TARGET_BW6761): $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761))
+	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+$(TARGET_BLS12_381): $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381))
+	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+$(TARGET_BLS12_377): $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377))
+	$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+clean:
+	rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
--- a/goicicle/README.md
+++ b/goicicle/README.md
@@ -0,0 +1,82 @@
+# Golang Bindings
+
+To build the shared library:
+
+To build shared libraries for all supported curves.
+
+```
+make all
+```
+
+If you wish to build for a specific curve, for example bn254.
+
+```
+make libbn254.so
+```
+
+The currently supported options are `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so`. The resulting `.so` files are the compiled shared libraries for each curve.
+
+Finally to allow your system to find the shared libraries
+
+```
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_shared_libs>
+```
+
+## Running golang tests
+
+To run the tests for curve bn254.
+
+```
+go test ./goicicle/curves/bn254 -count=1
+```
+
+## Cleaning up
+
+If you want to remove the compiled files
+
+```
+make clean
+```
+
+This will remove all shared libraries generated from the `make` file.
+
+# How do Golang bindings work?
+
+The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
+
+1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6761.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
+
+2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries:
+
+```go
+/*
+#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6761
+#include "icicle.h" // make sure you use the correct header file(s)
+*/
+import "C"
+
+func main() {
+    // Now you can call the C functions from the ICICLE libraries.
+    // Note that C function calls are prefixed with 'C.' in Go code.
+}
+```
+
+Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
+
+# Common issues
+
+### Cannot find shared library
+
+In some cases you may encounter the following error, despite exporting the correct `LD_LIBRARY_PATH`.
+
+```
+/usr/local/go/pkg/tool/linux_amd64/link: running gcc failed: exit status 1
+/usr/bin/ld: cannot find -lbn254: No such file or directory
+/usr/bin/ld: cannot find -lbn254: No such file or directory
+/usr/bin/ld: cannot find -lbn254: No such file or directory
+/usr/bin/ld: cannot find -lbn254: No such file or directory
+/usr/bin/ld: cannot find -lbn254: No such file or directory
+collect2: error: ld returned 1 exit status
+```
+
+This is normally fixed by exporting the path to the shared library location in the following way: `export CGO_LDFLAGS="-L/<path_to_shared_lib>/"`
--- a/goicicle/curves/bls12377/g1.go
+++ b/goicicle/curves/bls12377/g1.go
@@ -0,0 +1,328 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bls12377
+
+import (
+	"unsafe"
+
+	"encoding/binary"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
+// #include "projective.h"
+// #include "ve_mod_mult.h"
+import "C"
+
+// SCALAR_SIZE is the number of uint32 limbs in a G1ScalarField (8 * 32 = 256 bits).
+const SCALAR_SIZE = 8
+
+// BASE_SIZE is the number of uint32 limbs in a G1BaseField (12 * 32 = 384 bits).
+const BASE_SIZE = 12
+
+// G1ScalarField holds a scalar field element as SCALAR_SIZE uint32 limbs.
+// Pointers to it are cast to *C.BLS12_377_scalar_t elsewhere in this file,
+// so its memory layout is assumed to match the C type - TODO confirm.
+type G1ScalarField struct {
+	S [SCALAR_SIZE]uint32
+}
+
+// G1BaseField holds a base field element as BASE_SIZE uint32 limbs.
+type G1BaseField struct {
+	S [BASE_SIZE]uint32
+}
+
+/*
+ * BaseField Constructors
+ */
+
+func (f *G1BaseField) SetZero() *G1BaseField {
+	var S [BASE_SIZE]uint32
+	f.S = S
+
+	return f
+}
+
+func (f *G1BaseField) SetOne() *G1BaseField {
+	var S [BASE_SIZE]uint32
+
+	S[0] = 1
+
+	f.S = S
+	return f
+}
+
+// FromAffine hands p (as the output) and affine (as the input) to the C
+// function projective_from_affine_bls12_377 and returns p.
+// Both pointers are reinterpreted as their C counterparts without copying.
+func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
+	C.projective_from_affine_bls12_377(
+		(*C.BLS12_377_projective_t)(unsafe.Pointer(p)),
+		(*C.BLS12_377_affine_t)(unsafe.Pointer(affine)),
+	)
+
+	return p
+}
+
+// FromLimbs overwrites the limbs of f with the given array and returns f.
+func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
+	// Arrays are values in Go, so assignment copies all BASE_SIZE limbs.
+	f.S = limbs
+	return f
+}
+
+/*
+ * BaseField methods
+ */
+
+// Limbs returns a copy of the limbs of f.
+func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
+	limbs := f.S
+	return limbs
+}
+
+// ToBytesLe serializes f limb by limb, each limb as 4 little-endian bytes,
+// starting with limb 0.
+func (f *G1BaseField) ToBytesLe() []byte {
+	out := make([]byte, 4*len(f.S))
+	for idx := 0; idx < len(f.S); idx++ {
+		binary.LittleEndian.PutUint32(out[4*idx:], f.S[idx])
+	}
+	return out
+}
+
+/*
+ * ScalarField methods
+ */
+
+// Random passes p to the C function random_scalar_bls12_377, which is
+// expected to overwrite it with a random scalar, and returns p.
+func (p *G1ScalarField) Random() *G1ScalarField {
+	C.random_scalar_bls12_377((*C.BLS12_377_scalar_t)(unsafe.Pointer(p)))
+	return p
+}
+
+func (f *G1ScalarField) SetZero() *G1ScalarField {
+	var S [SCALAR_SIZE]uint32
+	f.S = S
+
+	return f
+}
+
+func (f *G1ScalarField) SetOne() *G1ScalarField {
+	var S [SCALAR_SIZE]uint32
+	S[0] = 1
+	f.S = S
+
+	return f
+}
+
+// Eq reports whether a and b have identical limbs.
+func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
+	// Go compares arrays element by element.
+	return a.S == b.S
+}
+
+/*
+ * ScalarField methods
+ */
+
+// Limbs returns a copy of the limbs of f.
+func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
+	limbs := f.S
+	return limbs
+}
+
+// ToBytesLe serializes f limb by limb, each limb as 4 little-endian bytes,
+// starting with limb 0.
+func (f *G1ScalarField) ToBytesLe() []byte {
+	out := make([]byte, 4*len(f.S))
+	for idx := 0; idx < len(f.S); idx++ {
+		binary.LittleEndian.PutUint32(out[4*idx:], f.S[idx])
+	}
+	return out
+}
+
+/*
+ * PointBLS12_377
+ */
+
+// G1ProjectivePoint is a G1 point given by three base field coordinates.
+// Pointers to it are cast to *C.BLS12_377_projective_t in this file, so the
+// field order X, Y, Z is assumed to match the C layout - TODO confirm.
+type G1ProjectivePoint struct {
+	X, Y, Z G1BaseField
+}
+
+func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
+	var yOne G1BaseField
+	yOne.SetOne()
+
+	var xZero G1BaseField
+	xZero.SetZero()
+
+	var zZero G1BaseField
+	zZero.SetZero()
+
+	f.X = xZero
+	f.Y = yOne
+	f.Z = zZero
+
+	return f
+}
+
+// Eq returns the result of the C function eq_bls12_377 applied to p and
+// pCompare.
+func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
+	// Go has no direct conversion between unrelated pointer types, so both
+	// points are reinterpreted through unsafe.Pointer. The Go and C structs
+	// must therefore share the same memory layout.
+	lhs := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
+	rhs := (*C.BLS12_377_projective_t)(unsafe.Pointer(pCompare))
+
+	equal := C.eq_bls12_377(lhs, rhs)
+	return bool(equal)
+}
+
+// IsOnCurve returns the result of the C function
+// projective_is_on_curve_bls12_377 applied to p.
+func (p *G1ProjectivePoint) IsOnCurve() bool {
+	return bool(C.projective_is_on_curve_bls12_377((*C.BLS12_377_projective_t)(unsafe.Pointer(p))))
+}
+
+// Random passes p to the C function random_projective_bls12_377, which is
+// expected to overwrite it with a random point, and returns p.
+func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
+	C.random_projective_bls12_377((*C.BLS12_377_projective_t)(unsafe.Pointer(p)))
+	return p
+}
+
+// StripZ returns a new G1PointAffine carrying the X and Y coordinates of p
+// unchanged. The Z coordinate is dropped, not divided out; use
+// G1PointAffine.FromProjective for a conversion done by the C library.
+func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
+	stripped := new(G1PointAffine)
+	stripped.X, stripped.Y = p.X, p.Y
+	return stripped
+}
+
+func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
+	var _x G1BaseField
+	var _y G1BaseField
+	var _z G1BaseField
+
+	_x.FromLimbs(GetFixedLimbs(x))
+	_y.FromLimbs(GetFixedLimbs(y))
+	_z.FromLimbs(GetFixedLimbs(z))
+
+	p.X = _x
+	p.Y = _y
+	p.Z = _z
+
+	return p
+}
+
+/*
+ * PointAffineNoInfinityBLS12_377
+ */
+
+// G1PointAffine is a G1 point given by two base field coordinates.
+// Pointers to it are cast to *C.BLS12_377_affine_t in this file, so the
+// field order X, Y is assumed to match the C layout - TODO confirm.
+type G1PointAffine struct {
+	X, Y G1BaseField
+}
+
+// FromProjective hands p (as the output) and projective (as the input) to the
+// C function projective_to_affine_bls12_377 and returns p.
+func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
+	C.projective_to_affine_bls12_377(
+		(*C.BLS12_377_affine_t)(unsafe.Pointer(p)),
+		(*C.BLS12_377_projective_t)(unsafe.Pointer(projective)),
+	)
+
+	return p
+}
+
+func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
+	var Z G1BaseField
+	Z.SetOne()
+
+	return &G1ProjectivePoint{
+		X: p.X,
+		Y: p.Y,
+		Z: Z,
+	}
+}
+
+func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
+	var _x G1BaseField
+	var _y G1BaseField
+
+	_x.FromLimbs(GetFixedLimbs(X))
+	_y.FromLimbs(GetFixedLimbs(Y))
+
+	p.X = _x
+	p.Y = _y
+
+	return p
+}
+
+/*
+ * Multiplication
+ */
+
+// MultiplyVec passes the points in a and the scalars in b, together with the
+// element count and deviceID, to the C function vec_mod_mult_point_bls12_377.
+// NOTE(review): presumably a[i] is multiplied by b[i] with the result written
+// back into a - confirm against the C implementation.
+//
+// It panics if a and b have different lengths and does nothing when both are
+// empty.
+func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
+	if len(a) != len(b) {
+		panic("a and b have different lengths")
+	}
+
+	// Nothing to multiply; taking &a[0] of an empty slice would panic.
+	if len(a) == 0 {
+		return
+	}
+
+	pointsC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&a[0]))
+	scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
+	deviceIdC := C.size_t(deviceID)
+	nElementsC := C.size_t(len(a))
+
+	C.vec_mod_mult_point_bls12_377(pointsC, scalarsC, nElementsC, deviceIdC)
+}
+
+// MultiplyScalar passes the scalars in a and b, together with the element
+// count and deviceID, to the C function vec_mod_mult_scalar_bls12_377.
+// NOTE(review): presumably a[i] is multiplied by b[i] with the result written
+// back into a - confirm against the C implementation.
+//
+// It panics if a and b have different lengths and does nothing when both are
+// empty.
+func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
+	if len(a) != len(b) {
+		panic("a and b have different lengths")
+	}
+
+	// Nothing to multiply; taking &a[0] of an empty slice would panic.
+	if len(a) == 0 {
+		return
+	}
+
+	aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
+	bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
+	deviceIdC := C.size_t(deviceID)
+	nElementsC := C.size_t(len(a))
+
+	C.vec_mod_mult_scalar_bls12_377(aC, bC, nElementsC, deviceIdC)
+}
+
+// Multiply a matrix by a scalar:
+//
+//	`a` - flattenned matrix;
+//	`b` - vector to multiply `a` by;
+func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
+	c := make([]G1ScalarField, len(b))
+	for i := range c {
+		var p G1ScalarField
+		p.SetZero()
+
+		c[i] = p
+	}
+
+	aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
+	bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
+	cC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&c[0]))
+	deviceIdC := C.size_t(deviceID)
+	nElementsC := C.size_t(len(a))
+
+	C.matrix_vec_mod_mult_bls12_377(aC, bC, cC, nElementsC, deviceIdC)
+}
+
+/*
+ * Utils
+ */
+
+func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
+	if len(*slice) <= BASE_SIZE {
+		limbs := [BASE_SIZE]uint32{}
+		copy(limbs[:len(*slice)], *slice)
+		return limbs
+	}
+
+	panic("slice has too many elements")
+}
--- a/goicicle/curves/bls12377/g1_test.go
+++ b/goicicle/curves/bls12377/g1_test.go
@@ -0,0 +1,198 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bls12377
+
+import (
+	"encoding/binary"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestNewFieldBLS12_377One checks that SetOne yields limbs {1, 0, ..., 0}.
+func TestNewFieldBLS12_377One(t *testing.T) {
+	var oneField G1BaseField
+	oneField.SetOne()
+
+	// The base field has BASE_SIZE limbs; an array of any other length has a
+	// different type and can never compare equal.
+	rawOneField := [BASE_SIZE]uint32{0x1}
+
+	assert.Equal(t, rawOneField, oneField.S)
+}
+
+// TestNewFieldBLS12_377Zero checks that SetZero yields all-zero limbs.
+func TestNewFieldBLS12_377Zero(t *testing.T) {
+	var zeroField G1BaseField
+	zeroField.SetZero()
+
+	// The base field has BASE_SIZE limbs; an array of any other length has a
+	// different type and can never compare equal.
+	rawZeroField := [BASE_SIZE]uint32{}
+
+	assert.Equal(t, rawZeroField, zeroField.S)
+}
+
+// TestFieldBLS12_377ToBytesLe checks that ToBytesLe emits every limb of a base
+// field element as 4 little-endian bytes.
+func TestFieldBLS12_377ToBytesLe(t *testing.T) {
+	var p G1ProjectivePoint
+	p.Random()
+
+	expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
+	for i, v := range p.X.S {
+		binary.LittleEndian.PutUint32(expected[i*4:], v)
+	}
+
+	assert.Equal(t, expected, p.X.ToBytesLe())
+	// A base field element has BASE_SIZE limbs of 4 bytes each.
+	assert.Equal(t, BASE_SIZE*4, len(p.X.ToBytesLe()))
+}
+
+// TestNewPointBLS12_377Zero checks that SetZero on a projective point yields
+// the coordinates (0, 1, 0).
+func TestNewPointBLS12_377Zero(t *testing.T) {
+	var origin G1ProjectivePoint
+	origin.SetZero()
+
+	var one, zero G1BaseField
+	one.SetOne()
+	zero.SetZero()
+
+	assert.Equal(t, origin.X, zero)
+	assert.Equal(t, origin.Y, one)
+	assert.Equal(t, origin.Z, zero)
+}
+
+// TestFromProjectiveToAffine round-trips a random point through the affine
+// representation and checks that it stays on the curve and compares equal.
+func TestFromProjectiveToAffine(t *testing.T) {
+	var original G1ProjectivePoint
+	original.Random()
+
+	var viaAffine G1PointAffine
+	viaAffine.FromProjective(&original)
+
+	var restored G1ProjectivePoint
+	restored.FromAffine(&viaAffine)
+
+	assert.True(t, original.IsOnCurve())
+	assert.True(t, restored.IsOnCurve())
+	assert.True(t, original.Eq(&restored))
+}
+
+// TestBLS12_377Eq checks that a point equals itself and that two independently
+// generated random points are reported as different.
+func TestBLS12_377Eq(t *testing.T) {
+	var first, second G1ProjectivePoint
+	first.Random()
+	second.Random()
+
+	assert.True(t, first.Eq(&first))
+	assert.False(t, first.Eq(&second))
+}
+
+// TestBLS12_377StripZ checks that StripZ returns a G1PointAffine carrying the
+// original X and Y coordinates.
+func TestBLS12_377StripZ(t *testing.T) {
+	var source G1ProjectivePoint
+	source.Random()
+
+	stripped := source.StripZ()
+
+	assert.IsType(t, G1PointAffine{}, *stripped)
+	assert.Equal(t, source.X, stripped.X)
+	assert.Equal(t, source.Y, stripped.Y)
+}
+
+// TestPointBLS12_377fromLimbs rebuilds a random point from its own limbs and
+// checks that the copy equals the original.
+func TestPointBLS12_377fromLimbs(t *testing.T) {
+	var p G1ProjectivePoint
+	p.Random()
+
+	// Limbs returns arrays; FromLimbs wants pointers to slices.
+	xArr, yArr, zArr := p.X.Limbs(), p.Y.Limbs(), p.Z.Limbs()
+	xs, ys, zs := xArr[:], yArr[:], zArr[:]
+
+	var rebuilt G1ProjectivePoint
+	rebuilt.FromLimbs(&xs, &ys, &zs)
+
+	assert.Equal(t, rebuilt, p)
+}
+
+// TestNewPointAffineNoInfinityBLS12_377Zero checks that the zero value of
+// G1PointAffine has both coordinates equal to a zeroed base field element.
+func TestNewPointAffineNoInfinityBLS12_377Zero(t *testing.T) {
+	var point G1PointAffine
+
+	var zero G1BaseField
+	zero.SetZero()
+
+	assert.Equal(t, point.X, zero)
+	assert.Equal(t, point.Y, zero)
+}
+
+// TestPointAffineNoInfinityBLS12_377FromLimbs checks that
+// G1PointAffine.FromLimbs stores the given limbs in X and Y.
+func TestPointAffineNoInfinityBLS12_377FromLimbs(t *testing.T) {
+	xLimbs := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
+	yLimbs := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
+	xs, ys := xLimbs[:], yLimbs[:]
+
+	// Point under test, built from slices.
+	var got G1PointAffine
+	got.FromLimbs(&xs, &ys)
+
+	// Reference point, built coordinate by coordinate from the arrays.
+	var want G1PointAffine
+	want.X.FromLimbs(xLimbs)
+	want.Y.FromLimbs(yLimbs)
+
+	assert.Equal(t, want, got)
+}
+
+// TestGetFixedLimbs checks that GetFixedLimbs zero-pads slices of up to
+// BASE_SIZE limbs and panics on longer ones.
+func TestGetFixedLimbs(t *testing.T) {
+	t.Run("case of valid input of length less than BASE_SIZE", func(t *testing.T) {
+		slice := []uint32{1, 2, 3, 4, 5, 6, 7}
+		// Limbs beyond the input stay zero.
+		expected := [BASE_SIZE]uint32{1, 2, 3, 4, 5, 6, 7}
+
+		result := GetFixedLimbs(&slice)
+		assert.Equal(t, expected, result)
+	})
+
+	t.Run("case of valid input of length BASE_SIZE", func(t *testing.T) {
+		slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
+		expected := [BASE_SIZE]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
+
+		result := GetFixedLimbs(&slice)
+		assert.Equal(t, expected, result)
+	})
+
+	t.Run("case of empty input", func(t *testing.T) {
+		slice := []uint32{}
+		expected := [BASE_SIZE]uint32{}
+
+		result := GetFixedLimbs(&slice)
+		assert.Equal(t, expected, result)
+	})
+
+	t.Run("case of input length greater than BASE_SIZE", func(t *testing.T) {
+		// One element more than fits, so GetFixedLimbs must panic.
+		slice := make([]uint32, BASE_SIZE+1)
+
+		defer func() {
+			if r := recover(); r == nil {
+				t.Errorf("the code did not panic")
+			}
+		}()
+
+		GetFixedLimbs(&slice)
+	})
+}
--- a/goicicle/curves/bls12377/g2.go
+++ b/goicicle/curves/bls12377/g2.go
@@ -0,0 +1,102 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bls12377
+
+import (
+	"encoding/binary"
+	"unsafe"
+)
+
+// #cgo CFLAGS: -I./include/
+// #cgo CFLAGS: -I/usr/local/cuda/include
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
+// #include "projective.h"
+// #include "ve_mod_mult.h"
+import "C"
+
+// G2 extension field
+
+// G2Element is one component of the G2 extension field, stored as six uint64
+// limbs (6 * 64 = 384 bits).
+type G2Element [6]uint64
+
+// ExtentionField (spelling kept, it is part of the exported API) is an
+// extension field element made of two G2Element components.
+type ExtentionField struct {
+	A0, A1 G2Element
+}
+
+// G2PointAffine is a G2 point given by two extension field coordinates.
+// Pointers to it are cast to *C.BLS12_377_g2_affine_t in this file, so its
+// layout is assumed to match the C type - TODO confirm.
+type G2PointAffine struct {
+	X, Y ExtentionField
+}
+
+// G2Point is a G2 point given by three extension field coordinates.
+// Pointers to it are cast to *C.BLS12_377_g2_projective_t in this file, so its
+// layout is assumed to match the C type - TODO confirm.
+type G2Point struct {
+	X, Y, Z ExtentionField
+}
+
+// Random passes p to the C function random_g2_projective_bls12_377, which is
+// expected to overwrite it with a random point, and returns p.
+func (p *G2Point) Random() *G2Point {
+	C.random_g2_projective_bls12_377((*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p)))
+	return p
+}
+
+// FromAffine hands p (as the output) and affine (as the input) to the C
+// function g2_projective_from_affine_bls12_377 and returns p.
+func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
+	C.g2_projective_from_affine_bls12_377(
+		(*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p)),
+		(*C.BLS12_377_g2_affine_t)(unsafe.Pointer(affine)),
+	)
+
+	return p
+}
+
+// Eq returns the result of the C function eq_g2_bls12_377 applied to p and
+// pCompare.
+func (p *G2Point) Eq(pCompare *G2Point) bool {
+	// Go has no direct conversion between unrelated pointer types, so both
+	// points are reinterpreted as *C.BLS12_377_g2_projective_t through
+	// unsafe.Pointer. The Go and C structs must share the same memory layout.
+	lhs := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
+	rhs := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(pCompare))
+
+	equal := C.eq_g2_bls12_377(lhs, rhs)
+	return bool(equal)
+}
+
+// ToBytesLe serializes f limb by limb, each limb as 8 little-endian bytes,
+// starting with limb 0.
+func (f *G2Element) ToBytesLe() []byte {
+	out := make([]byte, 0, len(f)*8)
+	var limb [8]byte // scratch space for one uint64
+	for i := range f {
+		binary.LittleEndian.PutUint64(limb[:], f[i])
+		out = append(out, limb[:]...)
+	}
+	return out
+}
+
+// FromProjective hands p (as the output) and projective (as the input) to the
+// C function g2_projective_to_affine_bls12_377 and returns p.
+func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
+	C.g2_projective_to_affine_bls12_377(
+		(*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p)),
+		(*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective)),
+	)
+
+	return p
+}
+
+// IsOnCurve returns the result of the C function
+// g2_projective_is_on_curve_bls12_377 applied to p.
+func (p *G2Point) IsOnCurve() bool {
+	// No data is copied: the Go pointer is only reinterpreted as the C type.
+	return bool(C.g2_projective_is_on_curve_bls12_377((*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))))
+}
--- a/goicicle/curves/bls12377/g2_test.go
+++ b/goicicle/curves/bls12377/g2_test.go
@@ -0,0 +1,79 @@
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+package bls12377
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestG2Eqg2 checks that a random G2 point compares equal to itself.
+func TestG2Eqg2(t *testing.T) {
+	var sample G2Point
+	sample.Random()
+
+	assert.True(t, sample.Eq(&sample))
+}
+
+// TestG2FromProjectiveToAffine round-trips a random G2 point through the
+// affine representation and checks that it stays on the curve and compares
+// equal.
+func TestG2FromProjectiveToAffine(t *testing.T) {
+	var original G2Point
+	original.Random()
+
+	var viaAffine G2PointAffine
+	viaAffine.FromProjective(&original)
+
+	var restored G2Point
+	restored.FromAffine(&viaAffine)
+
+	assert.True(t, original.IsOnCurve())
+	assert.True(t, restored.IsOnCurve())
+	assert.True(t, original.Eq(&restored))
+}
+
+// TestG2Eqg2NotEqual checks that two independently generated random G2 points
+// are reported as different.
+func TestG2Eqg2NotEqual(t *testing.T) {
+	var first, second G2Point
+	first.Random()
+	second.Random()
+
+	assert.False(t, first.Eq(&second))
+}
+
+// TestG2ToBytes checks the little-endian serialization of a G2Element.
+func TestG2ToBytes(t *testing.T) {
+	element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
+	bytes := element.ToBytesLe()
+
+	expected := []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1}
+	// G2Element has six limbs; the two left unset in the literal above are
+	// zero and serialize as 16 trailing zero bytes.
+	expected = append(expected, make([]byte, 16)...)
+
+	assert.Equal(t, expected, bytes)
+}
+
+// TestG2ShouldConvertToProjective converts a random G2 point to affine and
+// back, then checks that the result is on the curve and equals the original.
+func TestG2ShouldConvertToProjective(t *testing.T) {
+	fmt.Print() // this prevents the test from hanging. TODO: figure out why
+
+	var original G2Point
+	original.Random()
+
+	var viaAffine G2PointAffine
+	viaAffine.FromProjective(&original)
+
+	var restored G2Point
+	restored.FromAffine(&viaAffine)
+
+	assert.True(t, restored.IsOnCurve())
+	assert.True(t, original.Eq(&restored))
+}
--- a/goicicle/curves/bls12377/include/msm.h
+++ b/goicicle/curves/bls12377/include/msm.h
@@ -0,0 +1,98 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdbool.h>
+// msm.h
+//
+// C declarations of the BLS12-377 MSM entry points used by the Go bindings
+// through cgo. Only prototypes are given here; the point and scalar types are
+// opaque and are accessed exclusively through pointers.
+
+#ifndef _BLS12_377_MSM_H
+#define _BLS12_377_MSM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete declaration of BLS12_377 projective and affine structs
+typedef struct BLS12_377_projective_t BLS12_377_projective_t;
+typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
+typedef struct BLS12_377_affine_t BLS12_377_affine_t;
+typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
+typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
+typedef cudaStream_t CudaStream_t;
+
+// G1 variants. The int return value is presumably a status code - TODO confirm
+// against the implementation.
+int msm_cuda_bls12_377(
+  BLS12_377_projective_t* out, BLS12_377_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
+
+int msm_batch_cuda_bls12_377(
+  BLS12_377_projective_t* out,
+  BLS12_377_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+
+// The commit_* functions take d_-prefixed pointers, which presumably refer to
+// device memory - TODO confirm.
+int commit_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+
+int commit_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id);
+
+// G2 variants of the functions above, operating on the g2 point types.
+int msm_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* out,
+  BLS12_377_g2_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t count,
+  size_t device_id);
+int msm_batch_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* out,
+  BLS12_377_g2_affine_t* points,
+  BLS12_377_scalar_t* scalars,
+  size_t batch_size,
+  size_t msm_size,
+  size_t device_id);
+int commit_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_g2_affine_t* d_points,
+  size_t count,
+  unsigned large_bucket_factor,
+  size_t device_id);
+// NOTE(review): unlike commit_batch_cuda_bls12_377, this declaration takes an
+// additional cudaStream_t parameter - confirm it matches the implementation.
+int commit_batch_g2_cuda_bls12_377(
+  BLS12_377_g2_projective_t* d_out,
+  BLS12_377_scalar_t* d_scalars,
+  BLS12_377_g2_affine_t* d_points,
+  size_t count,
+  size_t batch_size,
+  size_t device_id,
+  cudaStream_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BLS12_377_MSM_H */
--- a/goicicle/curves/bls12377/include/ntt.h
+++ b/goicicle/curves/bls12377/include/ntt.h
@@ -0,0 +1,195 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <stdbool.h>
+// ntt.h
+
+#ifndef _BLS12_377_NTT_H
+#define _BLS12_377_NTT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Incomplete declaration of BLS12_377 projective and affine structs
+typedef struct BLS12_377_projective_t BLS12_377_projective_t; // used by the ecntt, interpolate/evaluate points and reverse_order points declarations
+typedef struct BLS12_377_affine_t BLS12_377_affine_t; // used only by the *_montgomery_aff_points declarations in this header
+typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
+
+typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t; // g2 types are used only by the *_g2_cuda_* Montgomery declarations at the end of this header
+typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
+
+int ntt_cuda_bls12_377(BLS12_377_scalar_t* arr, uint32_t n, bool inverse, size_t device_id); // single non-const buffer, no separate output: presumably transformed in place — confirm
+int ntt_batch_cuda_bls12_377( // NOTE(review): uint32_t is used, but this header includes only <cuda.h> and <stdbool.h>, not <stdint.h>
+  BLS12_377_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+
+int ecntt_cuda_bls12_377(BLS12_377_projective_t* arr, uint32_t n, bool inverse, size_t device_id); // same shape as ntt_cuda_bls12_377, over projective points instead of scalars
+int ecntt_batch_cuda_bls12_377( // same shape as ntt_batch_cuda_bls12_377, over projective points instead of scalars
+  BLS12_377_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
+
+BLS12_377_scalar_t* // returns a pointer, unlike the int-returning declarations around it; NOTE(review): ownership/freeing is not visible here — TODO confirm
+build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); // stream is a plain size_t here, not a cudaStream_t — presumably a handle cast to an integer; confirm
+int interpolate_scalars_cuda_bls12_377( // NOTE(review): d_domain is presumably the table returned by build_domain_cuda_bls12_377 — confirm
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned device_id, // NOTE(review): unsigned here, but size_t in the other interpolate_* declarations — confirm against the definition
+  size_t stream);
+int interpolate_scalars_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_cuda_bls12_377( // point variant: d_out/d_evaluations are projective points, d_domain stays scalar
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int interpolate_points_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_on_coset_cuda_bls12_377( // coset variant: takes an extra coset_powers buffer (its contents/length are not stated in this header)
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int interpolate_scalars_batch_on_coset_cuda_bls12_377( // naming: "batch_on_coset" here vs "on_coset_batch" in the evaluate_* declarations
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_evaluations,
+  BLS12_377_scalar_t* d_domain,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_cuda_bls12_377( // unlike interpolate_*, the evaluate_* declarations take both domain_size and n
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size, // NOTE(review): the required relationship between domain_size and n is not stated here — TODO confirm
+  unsigned n,
+  unsigned device_id, // NOTE(review): unsigned here, but size_t in most other evaluate_* declarations — confirm against the definition
+  size_t stream);
+int evaluate_scalars_batch_cuda_bls12_377(
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_cuda_bls12_377( // point variant: d_out/d_coefficients are projective points, d_domain stays scalar
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  size_t device_id,
+  size_t stream);
+int evaluate_scalars_on_coset_cuda_bls12_377( // coset variant: takes an extra coset_powers buffer (its contents/length are not stated in this header)
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  unsigned device_id, // NOTE(review): unsigned here, but size_t in the other on_coset declarations — confirm against the definition
+  size_t stream);
+int evaluate_scalars_on_coset_batch_cuda_bls12_377( // naming: "on_coset_batch" here vs "batch_on_coset" in interpolate_scalars_batch_on_coset_cuda_bls12_377
+  BLS12_377_scalar_t* d_out,
+  BLS12_377_scalar_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int evaluate_points_on_coset_batch_cuda_bls12_377(
+  BLS12_377_projective_t* d_out,
+  BLS12_377_projective_t* d_coefficients,
+  BLS12_377_scalar_t* d_domain,
+  unsigned domain_size,
+  unsigned n,
+  unsigned batch_size,
+  BLS12_377_scalar_t* coset_powers,
+  size_t device_id,
+  size_t stream);
+int reverse_order_scalars_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, size_t device_id, size_t stream); // n is a signed int here; the ntt/interpolate/evaluate declarations use uint32_t/unsigned sizes
+int reverse_order_scalars_batch_cuda_bls12_377( // NOTE(review): the exact reordering (presumably bit-reversal) is not stated in this header — confirm
+  BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int reverse_order_points_cuda_bls12_377(BLS12_377_projective_t* arr, int n, size_t device_id, size_t stream); // same shape as the scalar variant, over projective points
+int reverse_order_points_batch_cuda_bls12_377( // single non-const arr and no output buffer: presumably reordered in place — confirm
+  BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
+int add_scalars_cuda_bls12_377( // no device_id parameter, unlike the declarations above; only a stream (as size_t) is passed
+  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
+int sub_scalars_cuda_bls12_377( // NOTE(review): operand order is presumably d_in1 - d_in2 — confirm against the implementation
+  BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
+int to_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream); // single d_inout buffer of n elements, no separate output
+int from_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream); // same signature as the to_montgomery counterpart
+
+// points g1: to_/from_montgomery declarations for projective and affine G1 points
+int to_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream); // single d_inout buffer of n points; no device_id parameter
+int from_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
+int to_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
+
+// points g2: the same four declarations for the G2 point types
+int to_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
+int to_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
+int from_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BLS12_377_NTT_H */
--- a/goicicle/curves/bls12377/include/projective.h
+++ b/goicicle/curves/bls12377/include/projective.h
@@ -0,0 +1,50 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <stdbool.h>
+// projective.h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct BLS12_377_projective_t BLS12_377_projective_t; // NOTE(review): unlike msm.h and ntt.h, this header has no #ifndef include guard, so these typedefs repeat on every inclusion
+typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
+typedef struct BLS12_377_affine_t BLS12_377_affine_t;
+typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
+typedef struct BLS12_377_scalar_t BLS12_377_scalar_t; // forward declarations only: the struct bodies are not defined in this header
+
+bool projective_is_on_curve_bls12_377(BLS12_377_projective_t* point1); // returns bool, unlike the int-returning declarations below
+
+int random_scalar_bls12_377(BLS12_377_scalar_t* out); // NOTE(review): value is presumably written through out and the int is a status code — confirm
+int random_projective_bls12_377(BLS12_377_projective_t* out); // same shape as random_scalar_bls12_377, for a G1 projective point
+BLS12_377_projective_t* projective_zero_bls12_377(void); // (void) makes this a real prototype: in C, empty () leaves the arguments unspecified and unchecked
+int projective_to_affine_bls12_377(BLS12_377_affine_t* out, BLS12_377_projective_t* point1); // destination pointer comes first, source second
+int projective_from_affine_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* point1);
+
+int random_g2_projective_bls12_377(BLS12_377_g2_projective_t* out); // G2 set mirrors the G1 declarations above; no g2 zero or g2 random-scalar declaration is present
+int g2_projective_to_affine_bls12_377(BLS12_377_g2_affine_t* out, BLS12_377_g2_projective_t* point1);
+int g2_projective_from_affine_bls12_377(BLS12_377_g2_projective_t* out, BLS12_377_g2_affine_t* point1);
+bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t* point1);
+
+bool eq_bls12_377(BLS12_377_projective_t* point1, BLS12_377_projective_t* point2); // compares two G1 projective points; NOTE(review): presumably point equality, not coordinate-wise — confirm
+bool eq_g2_bls12_377(BLS12_377_g2_projective_t* point1, BLS12_377_g2_projective_t* point2); // G2 counterpart of eq_bls12_377
+
+#ifdef __cplusplus
+}
+#endif
--- a/Show More
+++ b/Show More