Compare commits

..

1 Commits

Author SHA1 Message Date
stas
30d4558560 answers Roman's comments 2024-02-14 12:24:03 -05:00
386 changed files with 12105 additions and 42810 deletions

View File

@@ -1,5 +1,3 @@
inout
crate
lmit
mut
uint

View File

@@ -1,7 +1,5 @@
golang:
- wrappers/golang/**/*.go'
- wrappers/golang/**/*.h'
- wrappers/golang/**/*.tmpl'
- goicicle/**/*.go'
- go.mod
rust:
- wrappers/rust

View File

@@ -15,6 +15,6 @@ jobs:
- uses: codespell-project/actions-codespell@v2
with:
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
skip: ./**/target,./**/build,./docs/*.js,./docs/*.json
skip: ./**/target,./**/build
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file
ignore_words_file: .codespellignore

View File

@@ -1,46 +0,0 @@
name: Deploy to GitHub Pages
on:
push:
branches:
- main
paths:
- 'docs/*'
permissions:
contents: write
jobs:
deploy:
name: Deploy to GitHub Pages
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
path: 'repo'
- uses: actions/setup-node@v3
with:
node-version: 18
cache: npm
cache-dependency-path: ./repo/docs/package-lock.json
- name: Install dependencies
run: npm install --frozen-lockfile
working-directory: ./repo/docs
- name: Build website
run: npm run build
working-directory: ./repo/docs
- name: Copy CNAME to build directory
run: echo "dev.ingonyama.com" > ./build/CNAME
working-directory: ./repo/docs
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./build
user_name: github-actions[bot]
user_email: 41898282+github-actions[bot]@users.noreply.github.com
working-directory: ./repo/docs

View File

@@ -17,13 +17,9 @@ on:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-examples:
runs-on: [self-hosted, Linux, X64, icicle, examples]
runs-on: [self-hosted, Linux, X64, icicle] # ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2

View File

@@ -80,22 +80,18 @@ jobs:
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
build-golang-linux:
name: Build Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/golang
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh ${{ matrix.curve }} ON
# TODO: Re-enable once Golang bindings for v1+ is finished
# build-golang-linux:
# name: Build Golang on Linux
# runs-on: [self-hosted, Linux, X64, icicle]
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Build CUDA libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make all
# working-directory: ./goicicle
# TODO: Add once Golang make file supports building for Windows
# build-golang-windows:

View File

@@ -75,25 +75,20 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest
test-golang-linux:
name: Test Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# builds all curves with g2 ON
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh all ON
- name: Run Golang Tests
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test --tags=g2 ./... -count=1 -timeout 60m
# TODO: Re-enable once Golang bindings for v1+ is finished
# test-golang-linux:
# name: Test Golang on Linux
# runs-on: [self-hosted, Linux, X64, icicle]
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Build CUDA libs
# working-directory: ./goicicle
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make libbn254.so
# - name: Run Golang Tests
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: |
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/goicicle
# go test ./goicicle/curves/bn254 -count=1

View File

@@ -1,29 +0,0 @@
name: Test Deploy to GitHub Pages
on:
pull_request:
branches:
- main
paths:
- 'docs/*'
jobs:
test-deploy:
name: Test deployment of docs website
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
path: 'repo'
- uses: actions/setup-node@v3
with:
node-version: 18
cache: npm
cache-dependency-path: ./repo/docs/package-lock.json
- name: Install dependencies
run: npm install --frozen-lockfile
working-directory: ./repo/docs
- name: Test build website
run: npm run build
working-directory: ./repo/docs

2
.gitignore vendored
View File

@@ -17,5 +17,3 @@
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt

View File

@@ -43,12 +43,12 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP.
- [GCC](https://gcc.gnu.org/install/download.html) version 9, latest version is recommended.
- Any Nvidia GPU (which supports CUDA Toolkit version 12.0 or above).
> [!NOTE]
> It is possible to use CUDA 11 for cards which don't support CUDA 12, however we don't officially support this version and in the future there may be issues.
> [!NOTE]
> It is possible to use CUDA 11 for cards which dont support CUDA 12, however we dont officially support this version and in the future there may be issues.
### Accessing Hardware
If you don't have access to an Nvidia GPU we have some options for you.
If you don't have access to a Nvidia GPU we have some options for you.
Checkout [Google Colab](https://colab.google/). Google Colab offers a free [T4 GPU](https://www.nvidia.com/en-us/data-center/tesla-t4/) instance and ICICLE can be used with it, reference this guide for setting up your [Google Colab workplace][GOOGLE-COLAB-ICICLE].
@@ -71,7 +71,7 @@ Running ICICLE via Rust bindings is highly recommended and simple:
- Clone this repo
- go to our [Rust bindings][ICICLE-RUST]
- Enter a [curve](./wrappers/rust/icicle-curves) implementation
- run `cargo build --release` to build or `cargo test` to build and execute tests
- run `cargo build --release` to build or `cargo test -- --test-threads=1` to build and execute tests
In case you want to compile and run the core ICICLE C++ tests, just follow these steps:
- Clone this repo
@@ -114,7 +114,6 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
## Help & Support
@@ -143,10 +142,10 @@ See [LICENSE-MIT][LMIT] for details.
[GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
[ICICLE-CORE]: ./icicle/
[ICICLE-RUST]: ./wrappers/rust/
[ICICLE-GO]: ./wrappers/golang/
[ICICLE-GO]: ./goicicle/
[ICICLE-CORE-README]: ./icicle/README.md
[ICICLE-RUST-README]: ./wrappers/rust/README.md
[ICICLE-GO-README]: ./wrappers/golang/README.md
[ICICLE-GO-README]: ./goicicle/README.md
[documentation]: https://dev.ingonyama.com/icicle/overview
[examples]: ./examples/

View File

@@ -1 +0,0 @@
ICICLE

17
docs/.gitignore vendored
View File

@@ -1,17 +0,0 @@
.docusaurus/
node_modules/
yarn.lock
.DS_Store
# tex build artifacts
.aux
.bbl
.bcf
.blg
.fdb_latexmk
.fls
.log
.out
.xml
.gz
.toc

View File

@@ -1,17 +0,0 @@
.docusaurus/
node_modules/
yarn.lock
.DS_Store
# tex build artifacts
.aux
.bbl
.bcf
.blg
.fdb_latexmk
.fls
.log
.out
.xml
.gz
.toc

View File

@@ -1,10 +0,0 @@
{
"semi": false,
"singleQuote": true,
"trailingComma": "es5",
"printWidth": 80,
"tabWidth": 2,
"useTabs": false,
"proseWrap": "preserve",
"endOfLine": "lf"
}

View File

@@ -1 +0,0 @@
dev.ingonyama.com

View File

@@ -1,39 +0,0 @@
# Website
This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator.
### Installation
```
$ npm i
```
### Local Development
```
$ npm start
```
This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
### Build
```
$ npm run build
```
This command generates static content into the `build` directory and can be served using any static contents hosting service.
### Deployment
Using SSH:
```
$ USE_SSH=true npm run deploy
```
Not using SSH:
```
$ GIT_USER=<Your GitHub username> npm run deploy
```

View File

@@ -1,3 +0,0 @@
module.exports = {
presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
};

View File

@@ -1,12 +0,0 @@
# ZKContainer
We found that developing ZK provers with ICICLE gives developers the ability to scale ZK provers across many machines and many GPUs. To make this possible we developed the ZKContainer.
## What is a ZKContainer?
A ZKContainer is a standardized, optimized and secure docker container that we configured with ICICLE applications in mind. A developer using our ZKContainer can deploy an ICICLE application on a single machine or on a thousand GPU machines in a data center with minimal concerns regarding compatibility.
ZKContainer has been used by Ingonyama clients to achieve scalability across large data centers.
We suggest you read our [article](https://medium.com/@ingonyama/product-announcement-zk-containers-0e2a1f2d0a2b) regarding ZKContainer to understand the benefits of using them.
![ZKContainer inside a ZK data center](../static/img/architecture-zkcontainer.png)

View File

@@ -1,23 +0,0 @@
# Contributor's Guide
We welcome all contributions with open arms. At Ingonyama we take a village approach, believing it takes many hands and minds to build an ecosystem.
## Contributing to ICICLE
- Make suggestions or report bugs via [GitHub issues](https://github.com/ingonyama-zk/icicle/issues)
- Contribute to the ICICLE by opening a [pull request](https://github.com/ingonyama-zk/icicle/pulls).
- Contribute to our [documentation](https://github.com/ingonyama-zk/icicle/tree/main/docs) and [examples](https://github.com/ingonyama-zk/icicle/tree/main/examples).
- Ask questions on Discord
### Opening a pull request
When opening a [pull request](https://github.com/ingonyama-zk/icicle/pulls) please keep the following in mind.
- `Clear Purpose` - The pull request should solve a single issue and be clean of any unrelated changes.
- `Clear description` - If the pull request is for a new feature describe what you built, why you added it and how it's best that we test it. For bug fixes, please describe the issue and the solution.
- `Consistent style` - Rust and Golang code should be linted by the official linters (golang fmt and rust fmt) and maintain a proper style. For CUDA and C++ code we use [`clang-format`](https://github.com/ingonyama-zk/icicle/blob/main/.clang-format), [here](https://github.com/ingonyama-zk/icicle/blob/605c25f9d22135c54ac49683b710fe2ce06e2300/.github/workflows/main-format.yml#L46) you can see how we run it.
- `Minimal Tests` - Please add tests which cover basic usage of your changes.
## Questions?
Find us on [Discord](https://discord.gg/6vYrE7waPj).

View File

@@ -1,23 +0,0 @@
# Ingonyama Grant programs
Ingonyama understands the importance of supporting and fostering a vibrant community of researchers and builders to advance ZK. To encourage progress, we are not only developing in the open but also sharing resources with researchers and builders through various programs.
## ICICLE ZK-GPU Ecosystem Grant
Ingonyama invites researchers and practitioners to collaborate in advancing ZK acceleration. We are allocating $100,000 for grants to support this initiative.
### Bounties & Grants
Eligibility for grants includes:
1. **Students**: Utilize ICICLE in your research.
2. **Performance Improvement**: Enhance the performance of accelerated primitives in ICICLE.
3. **Protocol Porting**: Migrate existing ZK protocols to ICICLE.
4. **New Primitives**: Contribute new primitives to ICICLE.
5. **Benchmarking**: Compare ZK benchmarks against ICICLE.
## Contact
For questions or submissions: [grants@ingonyama.com](mailto:grants@ingonyama.com)
**Read the full article [here](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges)**

View File

@@ -1,138 +0,0 @@
# Run ICICLE on Google Colab
Google Colab lets you use a GPU free of charge: an Nvidia T4 GPU with 16 GB of memory, capable of running the latest CUDA (tested on CUDA 12.2).
As Colab is able to interact with shell commands, a user can also install a framework and load git repositories into Colab space.
## Prepare Colab environment
The first thing to do in a notebook is to set the runtime type to a T4 GPU.
- In the upper corner, click on the dropdown menu and select "change runtime type"
![Change runtime](../../static/img/colab_change_runtime.png)
- In the window select "T4 GPU" and press Save
![T4 GPU](../../static/img/t4_gpu.png)
Installing Rust is rather simple; just execute the following command:
```sh
!apt install rustc cargo
```
To test the installation of Rust:
```sh
!rustc --version
!cargo --version
```
A successful installation will result in a rustc and cargo version print; a faulty installation will look like this:
```sh
/bin/bash: line 1: rustc: command not found
/bin/bash: line 1: cargo: command not found
```
Now we will check the environment:
```sh
!nvcc --version
!gcc --version
!cmake --version
!nvidia-smi
```
A correct environment should print the results with no bash errors for the `nvidia-smi` command and report a **Tesla T4** GPU:
```sh
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
cmake version 3.27.9
CMake suite maintained and supported by Kitware (kitware.com/cmake).
Wed Jan 17 13:10:18 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 39C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
```
## Cloning ICICLE and running test
Now we are ready to clone the ICICLE repository:
```sh
!git clone https://github.com/ingonyama-zk/icicle.git
```
We can now browse the repository and run tests to check the runtime environment:
```sh
!ls -la
%cd icicle
```
Let's run a test!
Navigate to `icicle/wrappers/rust/icicle-curves/icicle-bn254` and run `cargo test`:
```sh
%cd wrappers/rust/icicle-curves/icicle-bn254/
!cargo test --release
```
:::note
Compiling the first time may take a while
:::
Test run should end like this:
```sh
running 15 tests
test curve::tests::test_ark_point_convert ... ok
test curve::tests::test_ark_scalar_convert ... ok
test curve::tests::test_affine_projective_convert ... ok
test curve::tests::test_point_equality ... ok
test curve::tests::test_field_convert_montgomery ... ok
test curve::tests::test_scalar_equality ... ok
test curve::tests::test_points_convert_montgomery ... ok
test msm::tests::test_msm ... ok
test msm::tests::test_msm_skewed_distributions ... ok
test ntt::tests::test_ntt ... ok
test ntt::tests::test_ntt_arbitrary_coset ... ok
test msm::tests::test_msm_batch has been running for over 60 seconds
test msm::tests::test_msm_batch ... ok
test ntt::tests::test_ntt_coset_from_subgroup ... ok
test ntt::tests::test_ntt_device_async ... ok
test ntt::tests::test_ntt_batch ... ok
test result: ok. 15 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 99.39s
```
Voilà, ICICLE in Colab!

View File

@@ -1,3 +0,0 @@
# Golang bindings
Golang is WIP in v1, coming soon. Please check out a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for Golang bindings.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 35 KiB

View File

@@ -1,97 +0,0 @@
# ICICLE integrated provers
ICICLE has been used by companies and projects such as [Celer Network](https://github.com/celer-network), [Consensys Gnark](https://github.com/Consensys/gnark), [EZKL](https://blog.ezkl.xyz/post/acceleration/) and others to accelerate their ZK proving pipeline.
Many of these integrations have been a collaboration between Ingonyama and the integrating company. We have learned a lot about designing GPU based ZK provers.
If you're interested in understanding these integrations better or learning how you can use ICICLE to accelerate your existing ZK proving pipeline this is the place for you.
## A primer to building your own integrations
Let's illustrate an ICICLE integration, so you can understand the core API and design overview of ICICLE.
![ICICLE architecture](../../static/img/architecture-high-level.png)
Engineers usually use a cryptographic library to implement their ZK protocols. These libraries implement efficient primitives which are used as building blocks for the protocol; ICICLE is such a library. The difference is that ICICLE is designed from the start to run on GPUs; the Rust and Golang APIs abstract away all low level CUDA details. Our goal was to allow developers with no GPU experience to quickly get started with ICICLE.
A developer may use ICICLE with two main approaches in mind.
1. Drop-in replacement approach.
2. End-to-End GPU replacement approach.
The first approach for GPU-accelerating your Prover with ICICLE is quick to implement, but it has limitations, such as reduced memory optimization and limited protocol tuning for GPUs. It's a solid starting point, but those committed to fully leveraging GPU acceleration should consider a more comprehensive approach.
An end-to-end GPU replacement means performing the entire ZK proof on the GPU. This approach will reduce latency to a minimum, but it requires you to change the way you implement the protocol to be more GPU friendly. This approach will take full advantage of GPU acceleration. Redesigning your prover this way may take more engineering effort, but we promise you that it's worth it!
## Using ICICLE integrated provers
Here we cover how a developer can run existing circuits on ICICLE integrated provers.
### Gnark
[Gnark](https://github.com/Consensys/gnark) officially supports GPU proving with ICICLE. Currently only Groth16 on curve `BN254` is supported. This means that if you are currently using Gnark to write your circuits you can enjoy GPU acceleration without making many changes.
:::info
Currently ICICLE has been merged to Gnark [master branch](https://github.com/Consensys/gnark), however the [latest release](https://github.com/Consensys/gnark/releases/tag/v0.9.1) is from October 2023.
:::
Make sure your golang circuit project has `gnark` as a dependency and that you are using the master branch for now.
```
go get github.com/consensys/gnark@master
```
You should see two indirect dependencies added.
```
...
github.com/ingonyama-zk/icicle v0.1.0 // indirect
github.com/ingonyama-zk/iciclegnark v0.1.1 // indirect
...
```
:::info
As you may notice, we are using ICICLE v0.1 here, since Golang bindings are only supported in ICICLE v0.1 for the time being.
:::
To switch over to ICICLE proving, make sure to change the backend you are using, below is an example of how this should be done.
```
// toggle on
proofIci, err := groth16.Prove(ccs, pk, secretWitness, backend.WithIcicleAcceleration())
// toggle off
proof, err := groth16.Prove(ccs, pk, secretWitness)
```
Now that you have enabled the `WithIcicleAcceleration` backend, simply change the way you run your circuits to:
```
go run -tags=icicle main.go
```
Your logs should look something like this if everything went as expected.
```
13:12:05 INF compiling circuit
13:12:05 INF parsed circuit inputs nbPublic=1 nbSecret=1
13:12:05 INF building constraint builder nbConstraints=3
13:12:05 DBG precomputing proving key in GPU acceleration=icicle backend=groth16 curve=bn254 nbConstraints=3
13:12:05 DBG constraint system solver done nbConstraints=3 took=0.070259
13:12:05 DBG prover done acceleration=icicle backend=groth16 curve=bn254 nbConstraints=3 took=80.356684
13:12:05 DBG verifier done backend=groth16 curve=bn254 took=1.843888
```
`acceleration=icicle` indicates that the prover is running in acceleration mode with ICICLE.
You can reference the [Gnark docs](https://github.com/Consensys/gnark?tab=readme-ov-file#gpu-support) for further information.
### Halo2
A [Halo2](https://github.com/zkonduit/halo2) fork has been integrated with ICICLE for GPU acceleration. This means that you can run your existing Halo2 circuits with GPU acceleration just by activating a feature flag.
To enable GPU acceleration, just enable the `icicle_gpu` [feature flag](https://github.com/zkonduit/halo2/blob/3d7b5e61b3052680ccb279e05bdcc21dd8a8fedf/halo2_proofs/Cargo.toml#L102).
This feature flag will seamlessly toggle on GPU acceleration for you.

View File

@@ -1,260 +0,0 @@
# Getting started with ICICLE
This guide is oriented towards developers who want to start writing code with the ICICLE libraries. If you just want to run your existing ZK circuits on a GPU, please refer to [this guide](./integrations.md#using-icicle-integrations).
## ICICLE repository overview
![ICICLE API overview](../../static/img/apilevels.png)
The diagram above displays the general architecture of ICICLE and the API layers that exist. The CUDA API, which we also call ICICLE Core, is the lowest level and is comprised of CUDA kernels which implement all primitives such as MSM as well as C++ wrappers which expose these methods for different curves.
ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or you can implement a wrapper for it in any language.
Based on this dependency architecture, the ICICLE repository has three main sections, each of which is independent from the other.
- ICICLE core
- ICICLE Rust bindings
- ICICLE Golang bindings
### ICICLE Core
[ICICLE core](https://github.com/ingonyama-zk/icicle/tree/main/icicle) contains all the low level CUDA code implementing primitives such as [points](https://github.com/ingonyama-zk/icicle/tree/main/icicle/primitives) and [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/msm). There also exists higher level C++ wrappers to expose the low level CUDA primitives ([example](https://github.com/ingonyama-zk/icicle/blob/c1a32a9879a7612916e05aa3098f76144de4109e/icicle/appUtils/msm/msm.cu#L1)).
ICICLE Core would typically be compiled into a static library and used in a third party language such as Rust or Golang.
### ICICLE Rust and Golang bindings
- [ICICLE Rust bindings](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust)
- [ICICLE Golang bindings](https://github.com/ingonyama-zk/icicle/tree/main/goicicle)
These bindings allow you to easily use ICICLE in a Rust or Golang project. Setting up the Golang bindings requires a few extra steps compared to the Rust bindings, which utilize the `cargo build` tool.
## Running ICICLE
This guide assumes that you have a Linux or Windows machine with an Nvidia GPU installed. If you don't have access to an Nvidia GPU you can access one for free on [Google Colab](https://colab.google/).
### Prerequisites
- NVCC (version 12.0 or newer)
- cmake 3.18 and above
- GCC - version 9 or newer is recommended.
- Any Nvidia GPU
- Linux or Windows operating system.
#### Optional Prerequisites
- Docker, latest version.
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html)
If you don't wish to install these prerequisites you can follow this tutorial using a [ZK-Container](https://github.com/ingonyama-zk/icicle/blob/main/Dockerfile) (docker container). To learn more about using ZK-Containers [read this](../ZKContainers.md).
### Setting up ICICLE and running tests
The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to set up ICICLE and run an ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment; however, you may install the prerequisites on your machine and follow the same commands in your terminal.
#### Setting up our environment
Let's begin by cloning the ICICLE repository:
```sh
git clone https://github.com/ingonyama-zk/icicle
```
We will proceed to build the docker image [found here](https://github.com/ingonyama-zk/icicle/blob/main/Dockerfile):
```sh
docker build -t icicle-demo .
docker run -it --runtime=nvidia --gpus all --name icicle_container icicle-demo
```
- `-it` runs the container in interactive mode with a terminal.
- `--gpus all` allocates all available GPUs to the container. You can also specify which GPUs to use if you don't want to allocate all.
- `--runtime=nvidia` uses the NVIDIA runtime, necessary for GPU support.
To read more about these settings reference this [article](https://developer.nvidia.com/nvidia-container-runtime).
If you accidentally close your terminal and want to reconnect just call:
```sh
docker exec -it icicle_container bash
```
Let's make sure that we have the correct CUDA version before proceeding:
```sh
nvcc --version
```
You should see something like this
```sh
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
```
Make sure the release version is at least 12.0.
#### ICICLE Core
ICICLE Core is found under [`<project_root>/icicle`](https://github.com/ingonyama-zk/icicle/tree/main/icicle). To build and run the tests first:
```sh
cd icicle
```
We are going to compile ICICLE for a specific curve
```sh
mkdir -p build
cmake -S . -B build -DCURVE=bn254 -DBUILD_TESTS=ON
cmake --build build
```
`-DBUILD_TESTS=ON` compiles the tests, without this flag `ctest` won't work.
`-DCURVE=bn254` tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves).
The output in `build` folder should include the static libraries for the compiled curve.
:::info
Make sure to only use `-DBUILD_TESTS=ON` for running tests as the archive output will only be available when `-DBUILD_TESTS=ON` is not supplied.
:::
To run the test
```sh
cd build
ctest
```
#### ICICLE Rust
The Rust bindings work by first compiling the CUDA static libraries, as seen [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs). The compilation of CUDA and the Rust library is all handled by the Rust build toolchain.
Similar to ICICLE Core, here we also have to compile per curve.
Let's compile curve `bn254`:
```sh
cd wrappers/rust/icicle-curves/icicle-bn254
```
Now let's build our library:
```sh
cargo build --release
```
This may take a couple of minutes since we are compiling both the CUDA and Rust code.
To run the tests
```sh
cargo test
```
We also include some benchmarks
```sh
cargo bench
```
#### ICICLE Golang
Golang is WIP in v1, coming soon. Please check out a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for Golang bindings.
### Running ICICLE examples
ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle-examples); these examples cover some simple use cases using C++, Rust and Golang.
In each example directory, ZK-container files are located in a subdirectory `.devcontainer`.
```sh
msm/
├── .devcontainer
├── devcontainer.json
└── Dockerfile
```
Let's run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle-examples/blob/main/c%2B%2B/msm/example.cu).
Clone the repository
```sh
git clone https://github.com/ingonyama-zk/icicle-examples.git
cd icicle-examples
```
Enter the example directory:
```sh
cd c++/msm
```
Now let's build our Docker image and run the example inside it. Make sure you have installed the [optional prerequisites](#optional-prerequisites).
```sh
docker build -t icicle-example-msm -f .devcontainer/Dockerfile .
```
Let's start and enter the container:
```sh
docker run -it --rm --gpus all -v .:/icicle-example icicle-example-msm
```
To run the example:
```sh
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
./build/example
```
You can now experiment with our other examples; perhaps try to run a Rust or Golang example next.
## Writing new bindings for ICICLE
Since ICICLE Core is written in CUDA / C++, it's really simple to generate static libraries. These static libraries can be installed on any system and called by higher level languages such as Golang.
Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules, so your static library may need to compile only the specific features that you want to use.
Let's review the Golang bindings, since they are a pretty verbose example of using static libraries (compared to Rust, which hides it pretty well). Golang has a mechanism named cgo which can be used to link static libraries. Here's a basic example of how you can use cgo to link these libraries:
```go
package main

/*
#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_761
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"

import "unsafe"

func main() {
    // Now you can call the C functions from the ICICLE libraries.
    // Note that C function calls are prefixed with 'C.' in Go code.
    // `p` and `affine` are assumed to be pointers to already-allocated
    // projective and affine points.
    out := (*C.BN254_projective_t)(unsafe.Pointer(p))
    in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
    C.projective_from_affine_bn254(out, in)
}
```
The comment block directly above `import "C"` tells cgo which libraries to link as well as which header files to include. You can then call methods which are part of the static library and defined in the header file; `C.projective_from_affine_bn254` is an example.
If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
### ICICLE Adapters
One of the core ideas behind ICICLE is that developers can gradually accelerate their provers. Many protocols are written using other cryptographic libraries and completely replacing them may be complex and time consuming.
Therefore we offer adapters for various popular libraries; these adapters allow us to convert points and scalars between the different formats defined by those libraries. Here is a list:
Golang adapters:
- [Gnark crypto adapter](https://github.com/ingonyama-zk/iciclegnark)

View File

@@ -1,64 +0,0 @@
# Multi GPU with ICICLE
:::info
If you are looking for the Multi GPU API documentation refer here for [Rust](./rust-bindings/multi-gpu.md).
:::
One common challenge with Zero-Knowledge computation is managing the large input sizes. It's not uncommon to encounter circuits surpassing 2^25 constraints, pushing the capabilities of even advanced GPUs to their limits. To effectively scale and process such large circuits, leveraging multiple GPUs in tandem becomes a necessity.
Multi-GPU programming involves developing software to operate across multiple GPU devices. Let's first explore different approaches to multi-GPU programming, then we will cover how ICICLE allows you to easily develop your ZK computations to run across many GPUs.
## Approaches to Multi GPU programming
There are many [different strategies](https://github.com/NVIDIA/multi-gpu-programming-models) available for implementing multi-GPU computation; however, they can be split into two categories.
### GPU Server approach
This approach usually involves a single or multiple CPUs opening threads to read from / write to multiple GPUs. You can think of it as a scaled-up host-device model.
![alt text](image.png)
This approach won't let us tackle larger computation sizes, but it will allow us to run multiple computations which we wouldn't be able to load onto a single GPU.
For example, let's say that you had to compute two MSMs of size 2^26 on a GPU with 16GB of VRAM; you would normally have to perform them one after the other. However, if you double the number of GPUs in your system, you can now run them in parallel.
### Inter GPU approach
This is a more sophisticated approach to multi-GPU computation. Using technologies such as [GPUDirect, NCCL, NVSHMEM](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-cwes1084/) and NVLink, it's possible to combine multiple GPUs and split a computation among different devices.
This approach requires redesigning the algorithm at the software level to be compatible with splitting amongst devices. In some cases, to lower latency to a minimum, special inter GPU connections would be installed on a server to allow direct communication between multiple GPUs.
## Writing ICICLE Code for Multi GPUs
The approach we have taken for the moment is a GPU Server approach; we assume you have a machine with multiple GPUs and you wish to run some computation on each GPU.
To dive deeper and learn about the API check out the docs for our different ICICLE API
- [Rust Multi GPU APIs](./rust-bindings/multi-gpu.md)
- C++ Multi GPU APIs
## Best practices
- Never hardcode device IDs; if you want your software to take advantage of all GPUs on a machine, use methods such as `get_device_count` to support an arbitrary number of GPUs.
- Launch one CPU thread per GPU. To avoid [nasty errors](https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/) and hard-to-read code, we suggest that for every GPU you create a dedicated thread, as in the sketch below. Within a CPU thread you should be able to launch as many tasks as you wish for a GPU, as long as they all run on the same GPU id. This will make your code way more manageable, easier to read and performant.
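Here is a minimal sketch of that thread-per-GPU pattern. The device-management helpers are hypothetical stand-ins (the real calls live in the language bindings, following the `get_device_count` pattern mentioned above):
```rust
use std::thread;

// Hypothetical stand-ins for the bindings' device-management calls.
fn get_device_count() -> usize {
    2 // pretend this machine has two GPUs
}
fn set_device(_id: usize) {
    // bind the current thread's CUDA context to GPU `_id`
}

fn main() {
    // One dedicated CPU thread per GPU; device IDs are never hardcoded.
    let handles: Vec<_> = (0..get_device_count())
        .map(|id| {
            thread::spawn(move || {
                set_device(id);
                // ... launch this GPU's share of the computation here ...
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}
```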
## ZKContainer support for multi GPUs
Multi GPU support should work with ZK-Containers by simply defining which devices the docker container should interact with:
```sh
docker run -it --gpus '"device=0,2"' zk-container-image
```
If you wish to expose all GPUs
```sh
docker run --gpus all zk-container-image
```

View File

@@ -1,64 +0,0 @@
# What is ICICLE?
[![Static Badge](https://img.shields.io/badge/Latest-v1.4.0-8a2be2)](https://github.com/ingonyama-zk/icicle/releases)
![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue)
[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs. ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.
ICICLE allows developers with minimal GPU experience to effortlessly accelerate their ZK application; from our experiments, even the most naive implementation may yield 10X improvement in proving times.
ICICLE has been used by many leading ZK companies such as [Celer Network](https://github.com/celer-network), [Gnark](https://github.com/Consensys/gnark) and others to accelerate their ZK proving pipeline.
## Don't have access to a GPU?
We understand that not all developers have access to a GPU and we don't want this to limit anyone from developing with ICICLE.
Here are some ways we can help you gain access to GPUs:
### Grants
At Ingonyama we are interested in accelerating the progress of ZK and cryptography. If you are an engineer, developer or an academic researcher we invite you to checkout [our grant program](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges). We will give you access to GPUs and even pay you to do your dream research!
### Google Colab
This is a great way to get started with ICICLE instantly. Google Colab offers free GPU access to an NVIDIA T4 instance with 16 GB of memory, which should be enough for experimenting and even prototyping with ICICLE.
For an extensive guide on how to setup Google Colab with ICICLE refer to [this article](./colab-instructions.md).
If none of these options are appropriate for you, reach out to us on [telegram](https://t.me/RealElan); we will do our best to help you.
### Vast.ai
[Vast.ai](https://vast.ai/) is a global GPU marketplace where you can rent many different types of GPUs by the hour for [competitive pricing](https://vast.ai/pricing). They provide on-demand and interruptible rentals depending on your need or use case; you can learn more about their rental types [here](https://vast.ai/faq#rental-types).
:::note
If none of these options suit your needs, contact us on [telegram](https://t.me/RealElan) for assistance. We're committed to ensuring that a lack of a GPU doesn't become a bottleneck for you. If you need help with setup or any other issues, we're here to do our best to help you.
:::
## What can you do with ICICLE?
[ICICLE](https://github.com/ingonyama-zk/icicle) can be used in the same way you would use any other cryptography library. While developing and integrating ICICLE into many proof systems, we found some use case categories:
### Circuit developers
If you are a circuit developer and are experiencing bottlenecks while running your circuits, an ICICLE integrated prover may be the solution.
ICICLE has been integrated into a number of popular ZK provers including [Gnark prover](https://github.com/Consensys/gnark) and [Halo2](https://github.com/zkonduit/halo2). This means that you can enjoy GPU acceleration for your existing circuits immediately without writing a single line of code by simply switching on the GPU prover flag!
### Integrating into existing ZK provers
From our collaborations we have learned that it's possible to accelerate a specific part of your prover to solve for a specific bottleneck.
ICICLE can be used to accelerate specific parts of your prover without completely rewriting your ZK prover.
### Developing your own ZK provers
If your goal is to build a ZK prover from the ground up, ICICLE is an ideal tool for creating a highly optimized and scalable ZK prover. A key benefit of using GPUs with ICICLE is the ability to scale your ZK prover efficiently across multiple machines within a data center.
### Developing proof of concepts
ICICLE is also ideal for developing small prototypes. ICICLE has Golang and Rust bindings, so you can easily develop a library implementing a specific primitive using ICICLE. An example would be developing a KZG commitment library using ICICLE, as sketched below.
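For instance, a KZG-style commitment is just a single MSM of the polynomial's coefficients against a structured reference string. A minimal sketch using ICICLE's MSM API; `poly_coeffs`, `powers_of_tau`, `CurveCfg` and `G1Projective` are assumed to be prepared and imported for a concrete curve such as `bn254`:
```rust
// Sketch: commit(p) = MSM(coeffs, SRS) = sum_i coeffs[i] * tau^i * G
let coeffs = HostOrDeviceSlice::Host(poly_coeffs);   // polynomial coefficients (scalars)
let srs = HostOrDeviceSlice::Host(powers_of_tau);    // [G, tau*G, tau^2*G, ...]
let mut commitment: HostOrDeviceSlice<'_, G1Projective> =
    HostOrDeviceSlice::cuda_malloc(1).unwrap();      // a single expected result
let cfg = msm::get_default_msm_config::<CurveCfg>();
msm::msm(&coeffs, &srs, &cfg, &mut commitment).unwrap();
```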

Binary file not shown.

Before

Width:  |  Height:  |  Size: 220 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 322 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

View File

@@ -1,162 +0,0 @@
# MSM - Multi scalar multiplication
MSM stands for multi-scalar multiplication; it's defined as:
$$
MSM(a, G) = \sum_{j=0}^{n-1} a_j G_j
$$
where:
- $G_j \in G$ - points from an elliptic curve group,
- $a_0, \ldots, a_{n-1}$ - scalars,
- $MSM(a, G) \in G$ - a single EC (elliptic curve) point.
In words, MSM is the sum of scalar and EC point multiplications. We can see from this definition that the core operations are modular multiplication and elliptic curve point addition. It's obvious that the multiplications can be computed in parallel and the products then summed, making MSM inherently parallelizable.
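For example, for $n = 3$ the sum expands to
$$
MSM(a, G) = a_0 G_0 + a_1 G_1 + a_2 G_2
$$
where the three scalar-point products can be computed in parallel and only the final summation is sequential.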
Accelerating MSM is crucial to a ZK protocol's performance due to the [large percentage of run time](https://hackmd.io/@0xMonia/SkQ6-oRz3#Hardware-acceleration-in-action) it takes when generating proofs.
You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).
# Using MSM
## Supported curves
MSM supports the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Supported algorithms
Our MSM implementation supports two algorithms `Bucket accumulation` and `Large triangle accumulation`.
### Bucket accumulation
The Bucket Accumulation algorithm is a method of dividing the overall MSM task into smaller, more manageable sub-tasks. It involves partitioning scalars and their corresponding points into different "buckets" based on the scalar values.
Bucket Accumulation can be more parallel-friendly because it involves dividing the computation into smaller, independent tasks, distributing scalar-point pairs into buckets and summing points within each bucket. This division makes it well suited for parallel processing on GPUs.
#### When should I use Bucket accumulation?
In scenarios involving large MSM computations with many scalar-point pairs, the ability to parallelize operations makes Bucket Accumulation more efficient. The larger the MSM task, the more significant the potential gains from parallelization.
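To make the idea concrete, here is a toy single-window bucket accumulation where the "group" is `u64` under addition, standing in for elliptic curve points; real MSM kernels split scalars into multiple windows and use EC point addition, but the bucket logic is the same:
```rust
fn bucket_msm(scalars: &[u64], points: &[u64], c: u32) -> u64 {
    let num_buckets = 1usize << c; // scalars are assumed to be < 2^c
    let mut buckets = vec![0u64; num_buckets];
    // Distribute: add each point into the bucket indexed by its scalar.
    for (&s, &p) in scalars.iter().zip(points) {
        buckets[s as usize] += p;
    }
    // Reduce: compute sum_b b * buckets[b] with the running-sum trick,
    // which needs only additions (point additions in the EC setting).
    let (mut running, mut total) = (0u64, 0u64);
    for b in (1..num_buckets).rev() {
        running += buckets[b];
        total += running;
    }
    total
}

fn main() {
    // 3*10 + 1*20 + 3*30 = 140
    assert_eq!(bucket_msm(&[3, 1, 3], &[10, 20, 30], 2), 140);
}
```
The distribution step is embarrassingly parallel, which is exactly what makes this algorithm GPU friendly.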
### Large triangle accumulation
Large Triangle Accumulation is a method for optimizing MSM which focuses on reducing the number of point doublings in the computation. This algorithm is based on the observation that the number of point doublings can be minimized by structuring the computation in a specific manner.
#### When should I use Large triangle accumulation?
The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.
### How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
```rust
...
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
// is_big_triangle will determine which algorithm to use
cfg_bls12377.is_big_triangle = true;
msm::msm(&scalars, &points, &cfg_bls12377, &mut msm_results).unwrap();
...
```
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## MSM Modes
ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
Batch MSM allows you to run many MSMs with a single API call; Single MSM will launch a single MSM computation.
### Which mode should I use?
This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.
Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
### How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
```rust
...
let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
...
```
In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
...
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Support for G2 group
MSM also supports G2 group.
Using MSM in G2 requires a G2 config, and of course your points should also be G2 points.
```rust
...
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
...
```
Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.

View File

@@ -1,243 +0,0 @@
# NTT - Number Theoretic Transform
The Number Theoretic Transform (NTT) is a variant of the Fourier Transform used over finite fields, particularly those of integers modulo a prime number. NTT operates in a discrete domain and is used primarily in applications requiring modular arithmetic, such as cryptography and polynomial multiplication.
NTT is defined similarly to the Discrete Fourier Transform (DFT), but instead of using complex roots of unity, it uses roots of unity within a finite field. The definition hinges on the properties of the finite field, specifically the existence of a primitive root of unity of order $N$ (where $N$ is typically a power of 2), and the modulo operation is performed with respect to a specific prime number that supports these roots.
Formally, given a sequence of integers $a_0, a_1, ..., a_{N-1}$, the NTT of this sequence is another sequence of integers $A_0, A_1, ..., A_{N-1}$, computed as follows:
$$
A_k = \sum_{n=0}^{N-1} a_n \cdot \omega^{nk} \mod p
$$
where:
- $N$ is the size of the input sequence and is a power of 2,
- $p$ is a prime number such that $p = kN + 1$ for some integer $k$, ensuring that $p$ supports the existence of $N$th roots of unity,
- $\omega$ is a primitive $N$th root of unity modulo $p$, meaning $\omega^N \equiv 1 \mod p$ and no smaller positive power of $\omega$ is congruent to 1 modulo $p$,
- $k$ ranges from 0 to $N-1$, and it indexes the output sequence.
The NTT is particularly useful because it enables efficient polynomial multiplication under modulo arithmetic, crucial for algorithms in cryptographic protocols, and other areas requiring fast modular arithmetic operations.
There also exists INTT, which is the inverse operation of NTT. INTT takes as input the output sequence of an NTT and reconstructs the original sequence.
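Concretely, the inverse transform recovers the original sequence as:
$$
a_n = N^{-1} \sum_{k=0}^{N-1} A_k \cdot \omega^{-nk} \mod p
$$
where $N^{-1}$ is the multiplicative inverse of $N$ modulo $p$.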
# Using NTT
### Supported curves
NTT supports the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
### Examples
- [Rust API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/rust/ntt/src/main.rs#L1)
- [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)
## NTT API overview
```rust
pub fn ntt<F>(
input: &HostOrDeviceSlice<F>,
dir: NTTDir,
cfg: &NTTConfig<F>,
output: &mut HostOrDeviceSlice<F>,
) -> IcicleResult<()>
```
`ntt::ntt` expects:
`input` - buffer to read the inputs of the NTT from. <br/>
`dir` - whether to compute forward or inverse NTT. <br/>
`cfg` - config used to specify extra arguments of the NTT. <br/>
`output` - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
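A minimal call sketch with host-side buffers; `NTTDir::kForward` and `ScalarField` are assumed names here, standing in for the forward direction and a concrete curve's scalar field:
```rust
let size = 1 << 10;
let coeffs = vec![ScalarField::zero(); size]; // your input values go here
let input: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::Host(coeffs);
let mut output: HostOrDeviceSlice<'_, ScalarField> =
    HostOrDeviceSlice::Host(vec![ScalarField::zero(); size]);
let cfg = ntt::get_default_ntt_config::<ScalarField>();
// Host buffers are copied to the device during the call.
ntt::ntt(&input, NTTDir::kForward, &cfg, &mut output).unwrap();
```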
### NTT Config
```rust
pub struct NTTConfig<'a, S> {
pub ctx: DeviceContext<'a>,
pub coset_gen: S,
pub batch_size: i32,
pub ordering: Ordering,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
pub is_async: bool,
pub ntt_algorithm: NttAlgorithm,
}
```
The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false`, inputs will be copied from host to device.
- **`are_outputs_on_device: bool`**: Indicates whether the output data should remain in device memory. If `false`, outputs will be copied from device back to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure which algorithm you want to use.
#### Usage
Example initialization with default settings:
```rust
let default_config = NTTConfig::default();
```
Customizing the configuration:
```rust
let custom_config = NTTConfig {
ctx: custom_device_context,
coset_gen: my_coset_generator,
batch_size: 10,
ordering: Ordering::kRN,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
ntt_algorithm: NttAlgorithm::MixedRadix,
};
```
### Ordering
The `Ordering` enum defines how inputs and outputs are arranged for the NTT operation, offering flexibility in handling data according to different algorithmic needs or compatibility requirements. It primarily affects the sequencing of data points for the transform, which can influence both performance and the compatibility with certain algorithmic approaches. The available ordering options are:
- **`kNN` (Natural-Natural):** Both inputs and outputs are in their natural order. This is the simplest form of ordering, where data is processed in the sequence it is given, without any rearrangement.
- **`kNR` (Natural-Reversed):** Inputs are in natural order, while outputs are in bit-reversed order. This ordering is typically used in algorithms that benefit from having the output in a bit-reversed pattern.
- **`kRN` (Reversed-Natural):** Inputs are in bit-reversed order, and outputs are in natural order. This is often used with the Cooley-Tukey FFT algorithm.
- **`kRR` (Reversed-Reversed):** Both inputs and outputs are in bit-reversed order.
- **`kNM` (Natural-Mixed):** Inputs are provided in their natural order, while outputs are arranged in a digit-reversed (mixed) order. This ordering is good for mixed radix NTT operations, where the mixed or digit-reversed ordering of outputs is a generalization of the bit-reversal pattern seen in simpler, radix-2 cases.
- **`kMN` (Mixed-Natural):** Inputs are in a digit-reversed (mixed) order, while outputs are restored to their natural order. This ordering would primarily be used for mixed radix NTT operations.
Choosing an algorithm is heavily dependent on your use case. For example Cooley-Tukey will often use `kRN` and Gentleman-Sande often uses `kNR`.
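For example, a Cooley-Tukey-style flow that receives bit-reversed data and wants natural-order results could configure (a sketch, reusing the config API shown above):
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.ordering = Ordering::kRN; // bit-reversed inputs, natural-order outputs
```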
### Modes
NTT also supports two different modes `Batch NTT` and `Single NTT`
Batch NTT allows you to run many NTTs with a single API call; Single NTT will launch a single NTT computation.
You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.batch_size = 10; // NTTs using this config will run in batch mode.
```
`batch_size=1` would keep our NTT in single NTT mode.
Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
**Single NTT Mode**
- Choose this mode when your application requires processing individual NTT operations in isolation.
**Batch NTT Mode**
- Batch NTT mode can significantly reduce read/write as well as computation overhead by executing multiple NTT operations in parallel.
- Batch mode may also offer better utilization of computational resources (memory and compute).
## Supported algorithms
Our NTT implementation supports two algorithms `radix-2` and `mixed-radix`.
### Radix 2
At its core, the Radix-2 NTT algorithm divides the problem into smaller sub-problems, leveraging the properties of "divide and conquer" to reduce the overall computational complexity. The algorithm operates on sequences whose lengths are powers of two.
1. **Input Preparation:**
The input is a sequence of integers $a_0, a_1, \ldots, a_{N-1}, \text{ where } N$ is a power of two.
2. **Recursive Decomposition:**
The algorithm recursively divides the input sequence into smaller sequences. At each step, it separates the sequence into even-indexed and odd-indexed elements, forming two subsequences that are then processed independently.
3. **Butterfly Operations:**
The core computational element of the Radix-2 NTT is the "butterfly" operation, which combines pairs of elements from the sequences obtained in the decomposition step.
Each butterfly operation involves multiplication by a "twiddle factor," which is a root of unity in the finite field, and addition or subtraction of the results, all performed modulo the prime modulus.
$$
X_k = (A_k + B_k \cdot W^k) \mod p
$$
$X_k$ - The output of the butterfly operation for the $k$-th element
$A_k$ - an element from the even-indexed subset
$B_k$ - an element from the odd-indexed subset
$p$ - prime modulus
$k$ - The index of the current operation within the butterfly or the transform stage
The twiddle factors are precomputed to save runtime and improve performance.
4. **Bit-Reversal Permutation:**
A final step involves rearranging the output sequence into the correct order. Due to the halving process in the decomposition steps, the elements of the transformed sequence are initially in a bit-reversed order. A bit-reversal permutation is applied to obtain the final sequence in natural order.
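To make these steps concrete, here is a toy recursive radix-2 NTT over $\mathbb{Z}_{17}$ (a sketch for illustration only; recursion performs the even/odd reordering implicitly, whereas iterative in-place implementations need the explicit bit-reversal permutation from step 4):
```rust
const P: u64 = 17;

fn ntt_radix2(a: &[u64], omega: u64) -> Vec<u64> {
    let n = a.len(); // must be a power of two
    if n == 1 {
        return a.to_vec();
    }
    // Recursive decomposition into even- and odd-indexed subsequences.
    let even: Vec<u64> = a.iter().step_by(2).copied().collect();
    let odd: Vec<u64> = a.iter().skip(1).step_by(2).copied().collect();
    let omega_sq = omega * omega % P;
    let (e, o) = (ntt_radix2(&even, omega_sq), ntt_radix2(&odd, omega_sq));
    // Butterfly operations: twiddle multiply, then add and subtract mod P.
    let mut out = vec![0u64; n];
    let mut w = 1u64;
    for k in 0..n / 2 {
        let t = w * o[k] % P;
        out[k] = (e[k] + t) % P;
        out[k + n / 2] = (e[k] + P - t) % P;
        w = w * omega % P;
    }
    out
}

fn main() {
    // omega = 4 is a primitive 4th root of unity mod 17 (4^2 = 16 ≡ -1).
    println!("{:?}", ntt_radix2(&[1, 2, 3, 4], 4)); // prints [10, 7, 15, 6]
}
```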
### Mixed Radix
The Mixed Radix NTT algorithm extends the concepts of the Radix-2 algorithm by allowing the decomposition of the input sequence based on various factors of its length. Specifically, ICICLE's implementation splits the input into blocks of sizes 16, 32, or 64, whereas radix-2 always splits in half until it ends with NTTs of size 2. This approach offers enhanced flexibility and efficiency, especially for input sizes that are composite numbers, by leveraging the "divide and conquer" strategy across multiple radixes.
The NTT blocks in Mixed Radix are implemented more efficiently, based on the Winograd NTT, and memory and register usage is also better optimized compared to Radix-2.
Mixed Radix can also reduce the number of stages required to compute large inputs.
1. **Input Preparation:**
The input to the Mixed Radix NTT is a sequence of integers $a_0, a_1, \ldots, a_{N-1}$, where $N$ is not strictly required to be a power of two. Instead, $N$ can be any composite number, ideally factorized into primes or powers of primes.
2. **Factorization and Decomposition:**
Unlike the Radix-2 algorithm, which strictly divides the computational problem into halves, the Mixed Radix NTT algorithm implements a flexible decomposition approach which isn't limited to prime factorization.
For example, an NTT of size 256 can be decomposed into two stages of $16 \times \text{NTT}_{16}$, leveraging a composite factorization strategy rather than decomposing into eight stages of $\text{NTT}_{2}$. This exemplifies the use of composite factors (in this case, $256 = 16 \times 16$) to apply smaller NTT transforms, optimizing computational efficiency by adapting the decomposition strategy to the specific structure of $N$.
3. **Butterfly Operations with Multiple Radixes:**
The Mixed Radix algorithm utilizes butterfly operations for various radix sizes. Each sub-transform involves specific butterfly operations characterized by multiplication with twiddle factors appropriate for the radix in question.
The generalized butterfly operation for a radix-$r$ element can be expressed as:
$$
X_{k,r} = \sum_{j=0}^{r-1} (A_{j,k} \cdot W^{jk}) \mod p
$$
where $X_{k,r}$ is the output of the radix-$r$ butterfly operation for the $k$-th set of inputs, $A_{j,k}$ represents the $j$-th input element for the $k$-th operation, $W$ is the twiddle factor, and $p$ is the prime modulus.
4. **Recombination and Reordering:**
After applying the appropriate butterfly operations across all decomposition levels, the Mixed Radix algorithm recombines the results into a single output sequence. Due to the varied sizes of the sub-transforms, a more complex reordering process may be required compared to Radix-2. This involves digit-reversal permutations to ensure that the final output sequence is correctly ordered.
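For intuition, a sketch of digit reversal (an illustration only, not ICICLE's internal layout): with $N = 256 = 16 \times 16$, an index is split into two base-16 digits that swap places:
```rust
// An index i = hi * 16 + lo maps to lo * 16 + hi.
fn digit_reverse_base16(i: usize) -> usize {
    let (hi, lo) = (i >> 4, i & 0xF);
    (lo << 4) | hi
}

fn main() {
    assert_eq!(digit_reverse_base16(0x2B), 0xB2); // digits 2,B swap to B,2
}
```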
### Which algorithm should I choose?
Radix-2 is faster for small NTTs. A small NTT would be around `logN = 16` with batch size 1. It is also better suited to inputs whose size is a power of 2 (e.g., 256, 512, 1024). Radix-2 won't necessarily perform better for smaller `logn` with larger batches.
Mixed radix, on the other hand, performs better for larger NTTs with larger input sizes, which are not necessarily powers of 2.
Performance really depends on `logn` size, batch size, ordering, inverse, coset, coeff-field, and which GPU you are using.
For this reason we implemented our [heuristic auto-selection](https://github.com/ingonyama-zk/icicle/blob/774250926c00ffe84548bc7dd97aea5227afed7e/icicle/appUtils/ntt/ntt.cu#L474) which should choose the most efficient algorithm in most cases.
We still recommend you benchmark for your specific use case if you think a different configuration would yield better results.
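If you want to pin the algorithm while benchmarking, the config exposes it directly. A sketch, assuming the Rust `NTTConfig` mirrors the C++ `ntt_algorithm` field and `NttAlgorithm` enum used elsewhere in this document:
```rust
use icicle_core::ntt::{self, NttAlgorithm};
use icicle_bn254::curve::ScalarField;

let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.ntt_algorithm = NttAlgorithm::MixedRadix; // or NttAlgorithm::Radix2
```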
View File
@@ -1,10 +0,0 @@
# ICICLE Primitives
This section of the documentation is dedicated to the ICICLE primitives. We will cover the usage and internal details of our primitives, such as hashing algorithms, MSM, and NTT.
## Supported primitives
- [MSM](./msm)
- [Poseidon Hash](./poseidon.md)
View File
@@ -1,226 +0,0 @@
# Poseidon
[Poseidon](https://eprint.iacr.org/2019/458.pdf) is a popular hash in the ZK ecosystem, primarily because it's optimized to work over large prime fields, a common setting for ZK proofs, thereby minimizing the number of multiplicative operations required.
Poseidon has also been specifically designed to be efficient when implemented within ZK circuits; it uses far fewer constraints than other hash functions like Keccak or SHA-256 in the context of ZK circuits.
Poseidon has been used in many popular ZK protocols such as Filecoin and [Plonk](https://drive.google.com/file/d/1bZZvKMQHaZGA4L9eZhupQLyGINkkFG_b/view?usp=drive_open).
Our implementation of Poseidon follows the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/).
Let's understand how Poseidon works.
### Initialization
Poseidon starts with the initialization of its internal state, which is composed of the input elements and some pregenerated constants. An initial round constant is added to each element of the internal state. Adding the round constants ensures the state is properly mixed from the outset.
This is done to prevent collisions and certain cryptographic attacks by ensuring that the internal state is sufficiently mixed and unpredictable.
![Alt text](image.png)
### Applying full and partial rounds
To generate a secure hash output, the algorithm goes through a series of "full rounds" and "partial rounds" as well as transformations between these sets of rounds.
First full rounds => apply SBox and Round constants => partial rounds => Last full rounds => Apply SBox
#### Full rounds
![Alt text](image-1.png)
**Uniform Application of S-Box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The S-box involves raising each element of the state to a certain power denoted by `α`, a member of the finite field defined by the prime `p`; `α` can differ depending on the implementation and user configuration.
**Linear Transformation:** After applying the S-box, a linear transformation is performed on the state. This involves multiplying the state by an MDS (Maximum Distance Separable) matrix, which further diffuses the transformations applied by the S-box across the entire state.
**Addition of Round Constants:** Each element of the state is then modified by adding a unique round constant. These constants are different for each round and are precomputed as part of the hash function's initialization. The addition of round constants ensures that even minor changes to the input produce significant differences in the output.
#### Partial Rounds
**Selective Application of S-Box:** Partial rounds apply the S-box transformation to only one element of the internal state per round, rather than to all elements. This selective application significantly reduces the computational complexity of the hash function without compromising its security. The choice of which element to apply the S-box to can follow a specific pattern or be fixed, depending on the design of the hash function.
**Linear Transformation and Round Constants:** A linear transformation is performed and round constants are added. The linear transformation in partial rounds can be designed to be less computationally intensive (this is done by using a sparse matrix) than in full rounds, further optimizing the function's efficiency.
The user of Poseidon can often choose how many partial or full rounds they wish to apply; more full rounds will increase security but degrade performance. The choice and balance are highly dependent on the use case.
![Alt text](image-2.png)
## Using Poseidon
ICICLE Poseidon is implemented for GPU and parallelization is performed for each element of the state rather than for each state.
What that means is we calculate multiple hash-sums over multiple pre-images in parallel, rather than going block by block over the input vector.
So for Poseidon of arity 2 and an input of size 1024 * 2, we would expect 1024 elements of output: each block is of size 2, which results in 1024 Poseidon hashes being performed.
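A quick sketch of that bookkeeping (the numbers mirror the example above):
```rust
let arity = 2;
let number_of_hashes = 1024;
let input_len = number_of_hashes * arity; // 2048 input elements, consumed in blocks of `arity`
let output_len = number_of_hashes;        // 1024 digests
```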
### Supported API
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon), [`C++`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/poseidon)
### Supported curves
Poseidon supports the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
### Constants
Poseidon is extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated and optimized constants for each of the [supported curves](#supported-curves). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
If you wish to generate your own constants you can use our python script which can be found [here](https://github.com/ingonyama-zk/icicle/blob/b6dded89cdef18348a5d4e2748b71ce4211c63ad/icicle/appUtils/poseidon/constants/generate_parameters.py#L1).
Prerequisites:
- Install python 3
- `pip install poseidon-hash`
- `pip install galois==0.3.7`
- `pip install numpy`
You will then need to modify the following values before running the script.
```python
# Modify these
arity = 11 # we support arity 2, 4, 8 and 11.
p = 0x73EDA753299D7D483339D80809A1D80553BDA402FFFE5BFEFFFFFFFF00000001 # bls12-381
# p = 0x12ab655e9a2ca55660b44d1e5c37b00159aa76fed00000010a11800000000001 # bls12-377
# p = 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001 # bn254
# p = 0x1ae3a4617c510eac63b05c06ca1493b1a22d9f300f5138f1ef3622fba094800170b5d44300000008508c00000000001 # bw6-761
prime_bit_len = 255
field_bytes = 32
...
# primitive_element = None
primitive_element = 7 # bls12-381
# primitive_element = 22 # bls12-377
# primitive_element = 5 # bn254
# primitive_element = 15 # bw6-761
```
We only support `alpha = 5`, so if you want to use another alpha for the S-box, please reach out on Discord or open a GitHub issue.
### Rust API
This is the most basic way to use the Poseidon API.
```rust
let test_size = 1 << 10;
let arity = 2u32;
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon_hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
&config,
)
.unwrap();
```
`PoseidonConfig::default()` can be modified; by default, for example, the inputs and outputs are set to be on the `Host`.
```rust
impl<'a> Default for PoseidonConfig<'a> {
fn default() -> Self {
let ctx = get_default_device_context();
Self {
ctx,
are_inputs_on_device: false,
are_outputs_on_device: false,
input_is_a_state: false,
aligned: false,
loop_state: false,
is_async: false,
}
}
}
```
In the example above, `load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();` is used, which will load the correct constants based on arity and curve. It's also possible to [generate](#constants) your own constants and load them.
```rust
let ctx = get_default_device_context();
let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
let constants_file = PathBuf::from(cargo_manifest_dir)
.join("tests")
.join(format!("{}_constants.bin", field_prefix));
let mut constants_buf = vec![];
File::open(constants_file)
.unwrap()
.read_to_end(&mut constants_buf)
.unwrap();
let mut custom_constants = vec![];
for chunk in constants_buf.chunks(field_bytes) {
custom_constants.push(F::from_bytes_le(chunk));
}
let custom_constants = create_optimized_poseidon_constants::<F>(
arity as u32,
&ctx,
full_rounds_half,
partial_rounds,
&mut custom_constants,
)
.unwrap();
```
For more examples using different configurations, refer here.
## The Tree Builder
The tree builder allows you to build Merkle trees using Poseidon.
You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.
```rust
let height = 20;
let arity = 2;
let leaves = vec![F::one(); 1 << (height - 1)];
let mut digests = vec![F::zero(); merkle_tree_digests_len(height, arity)];
let mut leaves_slice = HostOrDeviceSlice::on_host(leaves);
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 1;
build_poseidon_merkle_tree::<F>(&mut leaves_slice, &mut digests, height, arity, &constants, &config).unwrap();
println!("Root: {:?}", digests[0..1][0]);
```
Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig::default()`
- `keep_rows`: The number of rows which will be written to output; 0 will write all rows.
- `are_inputs_on_device`: Have the inputs been loaded to device memory?
- `is_async`: Should the TreeBuilder run asynchronously? `False` will block the current CPU thread. `True` will require you to call `cudaStreamSynchronize` or `cudaDeviceSynchronize` to retrieve the result.
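Putting those fields together, a minimal sketch using only the options listed above:
```rust
let mut config = TreeBuilderConfig::default();
config.keep_rows = 1;                // write only the top row of digests
config.are_inputs_on_device = false; // leaves start on the host
config.is_async = false;             // block until the tree is built
```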
### Benchmarks
We ran the Poseidon tree builder on:
**CPU**: 12th Gen Intel(R) Core(TM) i9-12900K
**GPU**: RTX 3090 Ti
**Tree height**: 30 (2^29 elements)
The benchmarks include copying data from and to the device.
| Rows to keep parameter | Run time, Icicle | Supranational PC2 |
| ---------------------- | ---------------- | ----------------- |
| 10                     | 9.4 seconds      | 13.6 seconds      |
| 20                     | 9.5 seconds      | 13.6 seconds      |
| 29                     | 13.7 seconds     | 13.6 seconds      |
View File
@@ -1,57 +0,0 @@
# Rust bindings
Rust bindings allow you to use ICICLE as a Rust library.
`icicle-core` defines all interfaces, macros and common methods.
`icicle-cuda-runtime` defines `DeviceContext`, which can be used to manage a specific GPU as well as wrap common CUDA methods.
`icicle-curves` implements all interfaces and macros from `icicle-core` for each curve. For example, `icicle-bn254` implements the bn254 curve. Each curve has its own build script which will build the CUDA libraries for that curve as part of the Rust toolchain build.
## Using ICICLE Rust bindings in your project
Simply add the following to your `Cargo.toml`.
```toml
# GPU Icicle integration
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git" }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git" }
```
`icicle-bn254` is the curve you wish to use, while `icicle-core` and `icicle-cuda-runtime` contain ICICLE utilities and CUDA wrappers.
If you wish to point to a specific ICICLE branch add `branch = "<name_of_branch>"` or `tag = "<name_of_tag>"` to the ICICLE dependency. For a specific commit add `rev = "<commit_id>"`.
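For example, pinning to a tag (using the same `<name_of_tag>` placeholder as above):
```toml
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "<name_of_tag>" }
```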
When you build your project ICICLE will be built as part of the build command.
## How do the Rust bindings work?
The Rust bindings are just Rust wrappers for the ICICLE Core static libraries. We integrate the compilation of the static libraries into Rust's toolchain to make usage seamless and easy. This is achieved by [extending Rust's build command](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs).
```rust
use cmake::Config;
use std::env::var;
fn main() {
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=../../../../icicle");
let cargo_dir = var("CARGO_MANIFEST_DIR").unwrap();
let profile = var("PROFILE").unwrap();
let out_dir = Config::new("../../../../icicle")
.define("BUILD_TESTS", "OFF") //TODO: feature
.define("CURVE", "bn254")
.define("CMAKE_BUILD_TYPE", "Release")
.build_target("icicle")
.build();
println!("cargo:rustc-link-search={}/build", out_dir.display());
println!("cargo:rustc-link-lib=ingo_bn254");
println!("cargo:rustc-link-lib=stdc++");
// println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-lib=cudart");
}
```
View File
@@ -1,201 +0,0 @@
# Multi GPU APIs
To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of documentation.
Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example).
## Device management API
To streamline device management, the `icicle-cuda-runtime` package offers methods for dealing with devices.
#### [`set_device`](https://github.com/ingonyama-zk/icicle/blob/e6035698b5e54632f2c44e600391352ccc11cad4/wrappers/rust/icicle-cuda-runtime/src/device.rs#L6)
Sets the current CUDA device by its ID; calling `set_device` binds the calling thread to the given CUDA device.
**Parameters:**
- `device_id: usize`: The ID of the device to set as the current device. Device IDs start from 0.
**Returns:**
- `CudaResult<()>`: An empty result indicating success if the device is set successfully. In case of failure, returns a `CudaError`.
**Errors:**
- Returns a `CudaError` if the specified device ID is invalid or if a CUDA-related error occurs during the operation.
**Example:**
```rust
let device_id = 0; // Device ID to set
match set_device(device_id) {
Ok(()) => println!("Device set successfully."),
Err(e) => eprintln!("Failed to set device: {:?}", e),
}
```
#### [`get_device_count`](https://github.com/ingonyama-zk/icicle/blob/e6035698b5e54632f2c44e600391352ccc11cad4/wrappers/rust/icicle-cuda-runtime/src/device.rs#L10)
Retrieves the number of CUDA devices available on the machine.
**Returns:**
- `CudaResult<usize>`: The number of available CUDA devices. On success, contains the count of CUDA devices. On failure, returns a `CudaError`.
**Errors:**
- Returns a `CudaError` if a CUDA-related error occurs during the retrieval of the device count.
**Example:**
```rust
match get_device_count() {
Ok(count) => println!("Number of devices available: {}", count),
Err(e) => eprintln!("Failed to get device count: {:?}", e),
}
```
#### [`get_device`](https://github.com/ingonyama-zk/icicle/blob/e6035698b5e54632f2c44e600391352ccc11cad4/wrappers/rust/icicle-cuda-runtime/src/device.rs#L15)
Retrieves the ID of the current CUDA device.
**Returns:**
- `CudaResult<usize>`: The ID of the current CUDA device. On success, contains the device ID. On failure, returns a `CudaError`.
**Errors:**
- Returns a `CudaError` if a CUDA-related error occurs during the retrieval of the current device ID.
**Example:**
```rust
match get_device() {
Ok(device_id) => println!("Current device ID: {}", device_id),
Err(e) => eprintln!("Failed to get current device: {:?}", e),
}
```
## Device context API
The `DeviceContext` is embedded into `NTTConfig`, `MSMConfig` and `PoseidonConfig`, meaning you can simply pass a `device_id` to your existing config and the same computation will be triggered on a different device.
#### [`DeviceContext`](https://github.com/ingonyama-zk/icicle/blob/e6035698b5e54632f2c44e600391352ccc11cad4/wrappers/rust/icicle-cuda-runtime/src/device_context.rs#L11)
Represents the configuration of a CUDA device, encapsulating the device's stream, ID, and memory pool. The default device is always `0`.
```rust
pub struct DeviceContext<'a> {
pub stream: &'a CudaStream,
pub device_id: usize,
pub mempool: CudaMemPool,
}
```
##### Fields
- **`stream: &'a CudaStream`**
A reference to a `CudaStream`. This stream is used for executing CUDA operations. By default, it points to the null stream, CUDA's default execution stream.
- **`device_id: usize`**
The index of the GPU currently in use. The default value is `0`, indicating the first GPU in the system.
Note that if `CUDA_VISIBLE_DEVICES` was configured, for example as `CUDA_VISIBLE_DEVICES=2,3,7` on a system with 8 GPUs, then `device_id=0` will correspond to the GPU with physical ID 2. The mapping may therefore not always be a direct reflection of the GPUs installed on a system.
- **`mempool: CudaMemPool`**
Represents the memory pool used for CUDA memory allocations. The default is set to a null pointer, which signifies the use of the default CUDA memory pool.
##### Implementation Notes
- The `DeviceContext` structure is cloneable and can be debugged, facilitating easier logging and duplication of contexts when needed.
#### [`DeviceContext::default_for_device(device_id: usize) -> DeviceContext<'static>`](https://github.com/ingonyama-zk/icicle/blob/e6035698b5e54632f2c44e600391352ccc11cad4/wrappers/rust/icicle-cuda-runtime/src/device_context.rs#L30)
Provides a default `DeviceContext` with system-wide defaults, ideal for straightforward setups.
#### Parameters
- **`device_id: usize`**: The ID of the device for which to create the context.
#### Returns
A `DeviceContext` instance configured with the provided `device_id` and system-wide defaults:
- The default stream (`null_mut()`).
- The default memory pool (`null_mut()`).
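For example, creating a context for the second GPU in the system:
```rust
let ctx = DeviceContext::default_for_device(1); // a context bound to the GPU with device_id 1
```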
#### [`check_device(device_id: i32)`](https://github.com/vhnatyk/icicle/blob/eef6876b037a6b0797464e7cdcf9c1ecfcf41808/wrappers/rust/icicle-cuda-runtime/src/device_context.rs#L42)
Validates that the specified `device_id` matches the ID of the currently active device, ensuring operations are targeted correctly.
#### Parameters
- **`device_id: i32`**: The device ID to verify against the currently active device.
#### Behavior
- **Panics** if the `device_id` does not match the active device's ID, preventing cross-device operation errors.
#### Example
```rust
let device_id: i32 = 0; // Example device ID
check_device(device_id);
// Ensures that the current context is correctly set for the specified device ID.
```
## A Multi GPU example
In this example we will show how you can
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute an MSM on each GPU
```rust
...
let device_count = get_device_count().unwrap();
(0..device_count)
.into_par_iter()
.for_each(move |device_id| {
set_device(device_id).unwrap();
// you can allocate points and scalars_d here;
// `stream` and `msm_results` are assumed to have been created earlier in the elided code.
let mut cfg = MSMConfig::default_for_device(device_id);
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_scalars_montgomery_form = true;
msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
// collect and process results
})
...
```
We use `get_device_count` to fetch the number of connected devices; device IDs will be `0...device_count-1`.
[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator; you should expect it to launch a thread for every iteration.
We then call `set_device(device_id).unwrap();`, which sets the context of that thread to the selected `device_id`.
Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID, `let mut cfg = MSMConfig::default_for_device(device_id);`; behind the scenes this creates a `DeviceContext` configured for that specific GPU.
We finally call our `msm` method.
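Since the example sets `is_async = true`, each thread should synchronize its stream before reading back results. A sketch, assuming the `CudaStream::synchronize` wrapper from `icicle-cuda-runtime`:
```rust
stream.synchronize().unwrap(); // wait for the asynchronous MSM to finish
// msm_results can now be read safely
```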
View File
@@ -1,143 +0,0 @@
# Vector Operations API
Our vector operations API, which is part of the `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
## Supported curves
Vector operations are supported on the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Vector Operations Configuration
The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
### `VecOpsConfig`
Defines configuration parameters for vector operations.
```rust
pub struct VecOpsConfig<'a> {
pub ctx: DeviceContext<'a>,
is_a_on_device: bool,
is_b_on_device: bool,
is_result_on_device: bool,
is_result_montgomery_form: bool,
pub is_async: bool,
}
```
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context for the operation, including the device ID and memory pool.
- **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
- **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
- **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
- **`is_result_montgomery_form`**: Determines if the result should be in Montgomery form.
- **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.
### Default Configuration
`VecOpsConfig` can be initialized with default settings tailored for a specific device:
```rust
let cfg = VecOpsConfig::default();
```
These are the default settings.
```rust
impl<'a> Default for VecOpsConfig<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
}
}
impl<'a> VecOpsConfig<'a> {
pub fn default_for_device(device_id: usize) -> Self {
VecOpsConfig {
ctx: DeviceContext::default_for_device(device_id),
is_a_on_device: false,
is_b_on_device: false,
is_result_on_device: false,
is_result_montgomery_form: false,
is_async: false,
}
}
}
```
## Vector Operations
Vector operations are implemented through the `VecOps` trait, which is implemented for all [supported curves](#supported-curves), providing methods for addition, subtraction, and multiplication of vectors.
### `VecOps` Trait
```rust
pub trait VecOps<F> {
fn add(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn sub(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn mul(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
}
```
#### Methods
- **`add`**: Computes the element-wise sum of two vectors.
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.
### Argument Validation
Before invoking any of the above vector operations, we always call `check_vec_ops_args` to make sure that the inputs `a` and `b` can be operated on together and that the `result` slice can hold the result:
```rust
fn check_vec_ops_args<F>(a: &HostOrDeviceSlice<F>, b: &HostOrDeviceSlice<F>, result: &mut HostOrDeviceSlice<F>) {
if a.len() != b.len() || a.len() != result.len() {
panic!(
"left, right and output lengths {}; {}; {} do not match",
a.len(),
b.len(),
result.len()
);
}
}
```
### Examples
The snippets below are minimal sketches. They assume a field type `F` implementing `FieldImpl` (for example, the `ScalarField` of one of the supported curves) and the `add_scalars` / `sub_scalars` / `mul_scalars` helpers that dispatch to the `VecOps` trait; names may differ slightly between versions.
#### Addition of Scalars
```rust
let test_size = 1 << 10;
let a = HostOrDeviceSlice::on_host(vec![F::one(); test_size]);
let b = HostOrDeviceSlice::on_host(vec![F::one(); test_size]);
let mut result = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
let cfg = VecOpsConfig::default();
add_scalars(&a, &b, &mut result, &cfg).unwrap(); // result[i] = a[i] + b[i]
```
#### Subtraction of Scalars
```rust
// Same setup as above: result[i] = a[i] - b[i]
sub_scalars(&a, &b, &mut result, &cfg).unwrap();
```
#### Multiplication of Scalars
```rust
// Same setup as above: result[i] = a[i] * b[i]
mul_scalars(&a, &b, &mut result, &cfg).unwrap();
```
View File
@@ -1,86 +0,0 @@
# Supporting Additional Curves
We understand the need for ZK developers to use different curves, some common, some more exotic. For this reason, we designed ICICLE to allow developers to add any curve they desire.
## ICICLE Core
ICICLE Core is very generic by design, so all algorithms and primitives work based on configuration files [selected at compile](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh) time. This is why we compile ICICLE Core per curve.
To add support for a new curve, you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29): we need to add a new curve ID.
```cpp
...
#define BN254 1
#define BLS12_381 2
#define BLS12_377 3
#define BW6_761 4
#define GRUMPKIN 5
#define <curve_name> 6
...
```
Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/icicle/curves/curve_config.cuh#L16-L29) accordingly.
Finally, we must modify the [`CMakeLists.txt` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.
```
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;<curve_name>)
```
## Bindings
In order to support a new curve in the binding libraries, you first must support it in ICICLE Core.
### Rust
Create a new folder named `icicle-<curve_name>` under the [rust wrappers folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves). Your new directory should look like this.
```
└── rust
├── icicle-curves
├── icicle-<curve_name>
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │   ├── curve.rs
│   │   ├── lib.rs
│   │   ├── msm/
│   │   │   └── mod.rs
│   │   └── ntt/
│   │   └── mod.rs
```
Let's look at [`ntt/mod.rs`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs) for example.
```rust
...
extern "C" {
#[link_name = "bn254NTTCuda"]
fn ntt_cuda<'a>(
input: *const ScalarField,
size: usize,
is_inverse: bool,
config: &NTTConfig<'a, ScalarField>,
output: *mut ScalarField,
) -> CudaError;
#[link_name = "bn254DefaultNTTConfig"]
fn default_ntt_config() -> NTTConfig<'static, ScalarField>;
#[link_name = "bn254InitializeDomain"]
fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
}
...
```
Here you would need to replace `bn254NTTCuda` with `<curve_name>NTTCuda`. Most of these changes are pretty straightforward. One thing you should pay attention to is limb sizes, as these change for different curves. For example, `BN254` [has a limb size of 8](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs#L15), but for your curve this may be different.
### Golang
Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.
View File
@@ -1,47 +0,0 @@
---
slug: /
displayed_sidebar: GettingStartedSidebar
title: ''
---
# Welcome to Ingonyama's Developer Documentation
Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications. Our focus is on democratizing access to compute-intensive cryptography and making it accessible for developers to build on top of.
Currently our flagship products are:
- **ICICLE**:
[ICICLE](https://github.com/ingonyama-zk/icicle) is a fully featured GPU-accelerated cryptography library for building ZK provers. ICICLE allows you to accelerate your existing ZK protocols in a matter of hours or implement your protocol from scratch on GPU.
---
## Our current take on hardware acceleration
We believe GPUs are as important for ZK as for AI.
- GPUs are a perfect match for ZK compute - around 97% of ZK protocol runtime is parallel by nature.
- GPUs are simple for developers to use and scale compared to other hardware platforms.
- GPUs are extremely competitive in terms of power / performance and price (3x cheaper compared to FPGAs).
- GPUs are popular and readily available.
For a more in-depth understanding on this topic we suggest you read [our article on the subject](https://www.ingonyama.com/blog/revisiting-paradigm-hardware-acceleration-for-zero-knowledge-proofs).
Despite our current focus on GPUs we are still hard at work developing a ZPU (ZK Processing Unit), with the goal of offering a programmable hardware platform for ZK. To read more about ZPUs we suggest you read this [article](https://medium.com/@ingonyama/zpu-the-zero-knowledge-processing-unit-f886a48e00e0).
## ICICLE
[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs.
ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.
ICICLE is designed to be easy to use: developers don't have to touch a single line of CUDA code. Our Rust and Golang bindings allow your team to transition from CPU to GPU with minimal changes.
Learn more about ICICLE and GPUs [here][ICICLE-OVERVIEW].
## Get in Touch
If you have any questions, ideas, or are thinking of building something in this space join the discussion on [Discord]. You can explore our code on [github](https://github.com/ingonyama-zk) or read some of [our research papers](https://github.com/ingonyama-zk/papers).
Follow us on [Twitter](https://x.com/Ingo_zk) and [YouTube](https://www.youtube.com/@ingo_ZK) and sign up for our [mailing list](https://wkf.ms/3LKCbdj) to get our latest announcements.
[ICICLE-OVERVIEW]: ./icicle/overview.md
[Discord]: https://discord.gg/6vYrE7waPj
View File
@@ -1,171 +0,0 @@
// @ts-check
const lightCodeTheme = require('prism-react-renderer/themes/github');
const darkCodeTheme = require('prism-react-renderer/themes/dracula');
const math = require('remark-math');
const katex = require('rehype-katex');
/** @type {import('@docusaurus/types').Config} */
const config = {
title: 'Ingonyama Developer Documentation',
tagline: 'Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications.',
url: 'https://dev.ingonyama.com/',
baseUrl: '/',
onBrokenLinks: 'throw',
onBrokenMarkdownLinks: 'warn',
favicon: 'img/logo.png',
organizationName: 'ingonyama-zk',
projectName: 'developer-docs',
trailingSlash: false,
deploymentBranch: "main",
presets: [
[
'classic',
/** @type {import('@docusaurus/preset-classic').Options} */
({
docs: {
showLastUpdateAuthor: true,
showLastUpdateTime: true,
routeBasePath: '/',
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
sidebarPath: require.resolve('./sidebars.js'),
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
blog: {
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
showReadingTime: true,
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
pages: {},
theme: {
customCss: require.resolve('./src/css/custom.css'),
},
}),
],
],
stylesheets: [
{
href: 'https://cdn.jsdelivr.net/npm/katex@0.13.24/dist/katex.min.css',
type: 'text/css',
integrity:
'sha384-odtC+0UGzzFL/6PNoE8rX/SPcQDXBJ+uRepguP4QkPCm2LBxH3FA3y+fKSiJ+AmM',
crossorigin: 'anonymous',
},
],
scripts: [
{
src: 'https://plausible.io/js/script.js',
'data-domain':'ingonyama.com',
defer: true,
},
],
themeConfig:
/** @type {import('@docusaurus/preset-classic').ThemeConfig} */
({
metadata: [
{name: 'twitter:card', content: 'summary_large_image'},
{name: 'twitter:site', content: '@Ingo_zk'},
{name: 'twitter:title', content: 'Ingonyama Developer Documentation'},
{name: 'twitter:description', content: 'Ingonyama is a next-generation semiconductor company focusing on Zero-Knowledge Proof hardware acceleration...'},
{name: 'twitter:image', content: 'https://dev.ingonyama.com/img/logo.png'},
// title
{name: 'og:title', content: 'Ingonyama Developer Documentation'},
{name: 'og:description', content: 'Ingonyama is a next-generation semiconductor company focusing on Zero-Knowledge Proof hardware acceleration...'},
{name: 'og:image', content: 'https://dev.ingonyama.com/img/logo.png'},
],
hideableSidebar: true,
colorMode: {
defaultMode: 'dark',
respectPrefersColorScheme: false,
},
algolia: {
// The application ID provided by Algolia
appId: 'PZY4KJBBBK',
// Public API key: it is safe to commit it
apiKey: '2cc940a6e0ef5c117f4f44e7f4e6e20b',
indexName: 'ingonyama',
// Optional: see doc section below
contextualSearch: true,
// Optional: Specify domains where the navigation should occur through window.location instead on history.push. Useful when our Algolia config crawls multiple documentation sites and we want to navigate with window.location.href to them.
externalUrlRegex: 'external\\.com|domain\\.com',
// Optional: Replace parts of the item URLs from Algolia. Useful when using the same search index for multiple deployments using a different baseUrl. You can use regexp or string in the `from` param. For example: localhost:3000 vs myCompany.com/docs
replaceSearchResultPathname: {
from: '/docs/', // or as RegExp: /\/docs\//
to: '/',
},
// Optional: Algolia search parameters
searchParameters: {},
// Optional: path for search page that enabled by default (`false` to disable it)
searchPagePath: 'search',
},
navbar: {
title: 'Ingonyama Developer Documentation',
logo: {
alt: 'Ingonyama Logo',
src: 'img/logo.png',
},
items: [
{
position: 'left',
label: 'Docs',
to: '/',
},
{
href: 'https://github.com/ingonyama-zk',
position: 'right',
label: 'GitHub',
},
{
href: 'https://www.ingonyama.com/ingopedia/glossary',
position: 'right',
label: 'Ingopedia',
},
{
type: 'dropdown',
position: 'right',
label: 'Community',
items: [
{
label: 'Discord',
href: 'https://discord.gg/6vYrE7waPj',
},
{
label: 'Twitter',
href: 'https://x.com/Ingo_zk',
},
{
label: 'YouTube',
href: 'https://www.youtube.com/@ingo_ZK'
},
{
label: 'Mailing List',
href: 'https://wkf.ms/3LKCbdj',
}
]
},
],
},
footer: {
copyright: `Copyright © ${new Date().getFullYear()} Ingonyama, Inc. Built with Docusaurus.`,
},
prism: {
theme: lightCodeTheme,
darkTheme: darkCodeTheme,
},
image: 'img/logo.png',
}),
};
module.exports = config;
docs/package-lock.json (generated): file diff suppressed because it is too large (13681 lines).

View File
@@ -1,48 +0,0 @@
{
"name": "docusaurus",
"version": "0.0.0",
"private": true,
"description": "Ingonyama - developer docs",
"scripts": {
"docusaurus": "docusaurus",
"start": "docusaurus start",
"build": "docusaurus build",
"swizzle": "docusaurus swizzle",
"deploy": "docusaurus deploy",
"clear": "docusaurus clear",
"serve": "docusaurus serve",
"write-translations": "docusaurus write-translations",
"write-heading-ids": "docusaurus write-heading-ids",
"dev": "docusaurus start",
"format": "prettier --write '**/*.md'"
},
"dependencies": {
"@docusaurus/core": "2.0.0-beta.18",
"@docusaurus/preset-classic": "2.0.0-beta.18",
"@mdx-js/react": "^1.6.22",
"clsx": "^1.1.1",
"hast-util-is-element": "1.1.0",
"mdx-mermaid": "^1.2.2",
"mermaid": "^9.1.2",
"prism-react-renderer": "^1.3.1",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"rehype-katex": "5",
"remark-math": "3"
},
"browserslist": {
"production": [
">0.5%",
"not dead",
"not op_mini all"
],
"development": [
"last 1 chrome version",
"last 1 firefox version",
"last 1 safari version"
]
},
"devDependencies": {
"prettier": "^3.2.4"
}
}
View File
@@ -1,10 +0,0 @@
{
"infiniteLoopProtection": true,
"hardReloadOnChange": true,
"view": "browser",
"template": "docusaurus",
"node": "14",
"container": {
"node": "14"
}
}
View File
@@ -1,139 +0,0 @@
module.exports = {
GettingStartedSidebar: [
{
type: "doc",
label: "Introduction",
id: "introduction",
},
{
type: "category",
label: "ICICLE",
link: {
type: `doc`,
id: 'icicle/overview',
},
collapsed: false,
items: [
{
type: "doc",
label: "Getting started",
id: "icicle/introduction"
},
{
type: "doc",
label: "ICICLE Provers",
id: "icicle/integrations"
},
{
type: "doc",
label: "Golang bindings",
id: "icicle/golang-bindings",
},
{
type: "category",
label: "Rust bindings",
link: {
type: `doc`,
id: "icicle/rust-bindings",
},
collapsed: true,
items: [
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/rust-bindings/multi-gpu",
},
{
type: "doc",
label: "Vector operations",
id: "icicle/rust-bindings/vec-ops",
},
]
},
{
type: "category",
label: "Primitives",
link: {
type: `doc`,
id: 'icicle/primitives/overview',
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM",
id: "icicle/primitives/msm",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
}
],
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/multi-gpu",
},
{
type: "doc",
label: "Supporting additional curves",
id: "icicle/supporting-additional-curves",
},
{
type: "doc",
label: "Google Colab Instructions",
id: "icicle/colab-instructions",
},
]
},
{
type: "doc",
label: "ZK Containers",
id: "ZKContainers",
},
{
type: "doc",
label: "Ingonyama Grant program",
id: "grants",
},
{
type: "doc",
label: "Contributor guide",
id: "contributor-guide",
},
{
type: "category",
label: "Additional Resources",
collapsed: false,
items: [
{
type: "link",
label: "YouTube",
href: "https://www.youtube.com/@ingo_ZK"
},
{
type: "link",
label: "Ingonyama Blog",
href: "https://www.ingonyama.com/blog"
},
{
type: "link",
label: "Ingopedia",
href: "https://www.ingonyama.com/ingopedia"
},
{
href: 'https://github.com/ingonyama-zk',
type: "link",
label: 'GitHub',
}
]
}
],
};
View File
@@ -1,59 +0,0 @@
/**
* Any CSS included here will be global. The classic template
* bundles Infima by default. Infima is a CSS framework designed to
* work well for content-centric websites.
*/
/* You can override the default Infima variables here. */
:root {
--ifm-color-primary: #FFCB00;
--ifm-color-primary-dark: #FFCB00;
--ifm-color-primary-darker: #FFCB00;
--ifm-color-primary-darkest: #FFCB00;
--ifm-color-primary-light: #FFCB00;
--ifm-color-primary-lighter: #FFCB00;
--ifm-color-primary-lightest: #FFCB00;
--ifm-code-font-size: 95%;
}
/* For readability concerns, you should choose a lighter palette in dark mode. */
[data-theme='dark'] {
--ifm-color-primary: #FFCB00;
--ifm-color-primary-dark: #FFCB00;
--ifm-color-primary-darker:#FFCB00;
--ifm-color-primary-darkest: #FFCB00;
--ifm-color-primary-light:#FFCB00;
--ifm-color-primary-lighter: #FFCB00;
--ifm-color-primary-lightest: #FFCB00;
}
.docusaurus-highlight-code-line {
background-color: rgba(0, 0, 0, 0.1);
display: block;
margin: 0 calc(-1 * var(--ifm-pre-padding));
padding: 0 var(--ifm-pre-padding);
}
[data-theme='dark'] .docusaurus-highlight-code-line {
background-color: rgba(0, 0, 0, 0.3);
}
/* Mermaid elements must be changed to be visible in dark mode */
[data-theme='dark'] .mermaid .messageLine0, .messageLine1 {
filter: invert(51%) sepia(84%) saturate(405%) hue-rotate(21deg) brightness(94%) contrast(91%) !important;
}
/* NOTE Must be a separate specification from the above or it won't toggle off */
[data-theme='dark'] .mermaid .flowchart-link {
filter: invert(51%) sepia(84%) saturate(405%) hue-rotate(21deg) brightness(94%) contrast(91%) !important;
}
[data-theme='dark'] .mermaid .cluster-label {
filter: invert(51%) sepia(84%) saturate(405%) hue-rotate(21deg) brightness(94%) contrast(91%) !important;
}
[data-theme='dark'] .mermaid .messageText {
stroke:none !important; fill:white !important;
}
/* Our additions */
.anchor {
scroll-margin-top: 50pt;
}
View File
Binary file not shown. (Before: 170 KiB)
Binary file not shown. (Before: 103 KiB)
Binary file not shown. (Before: 76 KiB)
Binary file not shown. (Before: 204 KiB)
Binary file not shown. (Before: 26 KiB)
Binary file not shown. (Before: 116 KiB)
Binary file not shown. (Before: 34 KiB)

View File
@@ -1,25 +0,0 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
View File
@@ -1,52 +0,0 @@
# Icicle example: using multiple GPU to hash large dataset
## Best-Practices
This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running it first.
## Key-Takeaway
Use a `device_context::DeviceContext` variable to select the GPU to use.
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
## Concise Usage Explanation
1. Include C++ threads
```c++
#include <thread>
```
2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
```c++
void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
```
3. Initialize device contexts for different GPUs
```c++
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
```
4. Finally, spawn the threads and wait for their completion
```c++
std::thread thread0(threadPoseidon, ctx0, ...);
std::thread thread1(threadPoseidon, ctx1, ...);
thread0.join();
thread1.join();
```
## What's in the example
This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: computing $2^{30}$ Poseidon hashes for each column of an $11 \times 2^{30}$ matrix.
1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
2. Hash two partitions in parallel on two GPUs
3. Hash two partitions in series on one GPU
4. Compare execution times
View File
@@ -1,148 +0,0 @@
#include <iostream>
#include <thread>
#include <chrono>
#include <nvml.h>
// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"
using namespace poseidon;
using namespace curve_config;
void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
}
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
checkCudaError(err);
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return 1; // main returns int; bail out on CUDA error
}
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
nvmlShutdown();
return 0;
}
View File
@@ -10,15 +10,15 @@
using namespace curve_config;
typedef scalar_t T;
// select scalar or point field
//typedef scalar_t T;
typedef point_field_t T;
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
const bool is_on_device = true;
const bool is_montgomery = false;
cudaError_t err = vec_ops::Mul<T,T>(vec_a, vec_b, n_elments, is_on_device, is_montgomery, ctx, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
View File
@@ -16,11 +16,10 @@ We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to s
// Include NTT template
#include "appUtils/ntt/ntt.cu"
using namespace curve_config;
using namespace ntt;
// Configure NTT
NTTConfig<S> config=DefaultNTTConfig<S>();
ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
// Call NTT
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
```
## Running the example
@@ -29,10 +28,5 @@ NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the size of the example
2. Initialize input
3. Run Radix2 NTT
4. Run MixedRadix NTT
5. Validate the data output
View File
@@ -5,32 +5,28 @@
#define CURVE_ID 1
// include NTT template
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
using namespace curve_config;
using namespace ntt;
// Operate on scalars
typedef scalar_t S;
typedef scalar_t E;
void print_elements(const unsigned n, E* elements)
{
void print_elements(const unsigned n, E * elements ) {
for (unsigned i = 0; i < n; i++) {
std::cout << i << ": " << elements[i] << std::endl;
std::cout << i << ": " << elements[i] << std::endl;
}
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
// Lowest Harmonics
for (unsigned i = 0; i < ntt_size; i = i + 1) {
for (unsigned i = 0; i < ntt_size; i=i+1) {
elements[i] = E::one();
}
// print_elements(ntt_size, elements );
// Highest Harmonics
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = E::one();
elements[i + 1] = E::neg(scalar_t::one());
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
elements[i] = E::one();
elements[i+1] = E::neg(scalar_t::one());
}
// print_elements(ntt_size, &elements[1*ntt_size] );
}
@@ -38,7 +34,7 @@ void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* eleme
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
int nof_errors = 0;
E amplitude = E::from((uint32_t)ntt_size);
E amplitude = E::from((uint32_t) ntt_size);
// std::cout << "Amplitude: " << amplitude << std::endl;
// Lowest Harmonics
if (elements[0] != amplitude) {
@@ -48,8 +44,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
} else {
std::cout << "Validated lowest harmonics" << std::endl;
}
// Highest Harmonics
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
// Highest Harmonics
if (elements[1*ntt_size+ntt_size/2] != amplitude) {
++nof_errors;
std::cout << "Error in highest harmonics! " << std::endl;
// print_elements(ntt_size, &elements[1*ntt_size] );
@@ -59,11 +55,6 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
return nof_errors;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
@@ -75,39 +66,33 @@ int main(int argc, char* argv[])
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;
std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
E* input;
input = (E*)malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input);
input = (E*) malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input );
E* output;
output = (E*)malloc(sizeof(E) * batch_size);
output = (E*) malloc(sizeof(E) * batch_size);
std::cout << "Running NTT with on-host data" << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
// Create a device context
auto ctx = device_context::get_default_device_context();
const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
InitDomain(basic_root, ctx);
// the next line is valid only for CURVE_ID 1 (will add support for other curves soon)
S rou = S{ {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x6d1e303, 0x27508aba, 0xa0ed063, 0x26125da1} };
ntt::InitDomain(rou, ctx);
// Create an NTTConfig instance
NTTConfig<S> config = DefaultNTTConfig<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
ntt::NTTConfig<S> config=ntt::DefaultNTTConfig<S>();
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
std::cout << "Cleaning-up memory" << std::endl;
config.ctx.stream = stream;
auto begin0 = std::chrono::high_resolution_clock::now();
cudaError_t err = ntt::NTT<S, E>(input, ntt_size, ntt::NTTDir::kForward, config, output);
auto end0 = std::chrono::high_resolution_clock::now();
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
validate_output(ntt_size, nof_ntts, output );
cudaStreamDestroy(stream);
free(input);
free(output);
return 0;

View File

@@ -1,25 +0,0 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]

View File

@@ -1,22 +0,0 @@
{
"name": "Icicle Examples: polynomial multiplication",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-vscode.cpptools"
]
}
}
}

View File

@@ -1,26 +0,0 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it, so no architecture is passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,11 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -1,124 +0,0 @@
#define CURVE_ID BLS12_381
#include <chrono>
#include <iostream>
#include <vector>
#include "curves/curve_config.cuh"
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
#include "utils/vec_ops.cu"
#include "utils/error_handler.cuh"
#include <memory>
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() * test_scalar::omega(4) : test_scalar::zero();
}
}
// calculating polynomial multiplication A*B via NTT, pointwise multiplication and INTT
// (1) allocate A,B on CPU. Randomize first half, zero second half
// (2) allocate NttAGpu, NttBGpu on GPU
// (3) calc NTT for A and for B from cpu to GPU
// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
// (5) INTT MulGpu inplace
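// Note: each input occupies only the first half of an NTT_SIZE buffer (the
// rest is zeroed), so deg(A) + deg(B) < NTT_SIZE and the cyclic convolution
// computed by NTT -> pointwise mul -> INTT equals the true product A*B.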
int main(int argc, char** argv)
{
cudaEvent_t start, stop;
float measured_time;
int NTT_LOG_SIZE = 23;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
// init domain
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
const bool is_radix2_alg = (argc > 1) ? atoi(argv[1]) : false;
ntt_config.ntt_algorithm = is_radix2_alg ? ntt::NttAlgorithm::Radix2 : ntt::NttAlgorithm::MixedRadix;
const char* ntt_alg_str = is_radix2_alg ? "Radix-2" : "Mixed-Radix";
std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";
CHK_IF_RETURN(cudaEventCreate(&start));
CHK_IF_RETURN(cudaEventCreate(&stop));
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx, true /*=fast_twiddles_mode*/);
// (1) cpu allocation
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros
test_data *GpuA, *GpuB, *MulGpu;
auto benchmark = [&](bool print, int iterations = 1) {
// start recording
CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));
for (int iter = 0; iter < iterations; ++iter) {
// (2) gpu input allocation
CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
// (3) NTT for A,B from cpu to gpu
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kNM;
CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig<test_data> config {
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false, // is_montgomery
false // is_async
};
CHK_IF_RETURN(
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kMN;
CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
}
CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));
if (print) { std::cout << measured_time / iterations << " MS" << std::endl; }
return CHK_LAST();
};
benchmark(false); // warmup
benchmark(true, 20);
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
return 0;
}

View File

@@ -1,3 +0,0 @@
#!/bin/bash
./build/example 1 # radix2
./build/example 0 # mixed-radix

View File

@@ -1,9 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -1,2 +0,0 @@
#!/bin/bash
./build/example

View File

@@ -1,17 +1,17 @@
[package]
name = "msm"
version = "1.2.0"
version = "1.0.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = [ "g2" ] }
icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
ark-bn254 = { version = "0.4.0", optional = true}
ark-bls12-377 = { version = "0.4.0", optional = true}
ark-ec = { version = "0.4.0", optional = true}
clap = { version = "<=4.4.12", features = ["derive"] }
clap = { version = "4.4.12", features = ["derive"] }
[features]
arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]

View File

@@ -1,20 +1,43 @@
use icicle_bn254::curve::{CurveCfg, G1Projective, G2CurveCfg, G2Projective, ScalarCfg};
use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
use icicle_bn254::curve::{
CurveCfg,
ScalarCfg,
G1Projective,
G2CurveCfg,
G2Projective
};
use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg,
ScalarCfg as BLS12377ScalarCfg,
G1Projective as BLS12377G1Projective
};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
use icicle_cuda_runtime::{
stream::CudaStream,
memory::HostOrDeviceSlice
};
use icicle_core::{
msm,
curve::Curve,
traits::GenerateRandom
};
#[cfg(feature = "arkworks")]
use icicle_core::traits::ArkConvertible;
#[cfg(feature = "arkworks")]
use ark_bls12_377::{Fr as Bls12377Fr, G1Affine as Bls12377G1Affine, G1Projective as Bls12377ArkG1Projective};
use ark_bn254::{
G1Projective as Bn254ArkG1Projective,
G1Affine as Bn254G1Affine,
Fr as Bn254Fr
};
#[cfg(feature = "arkworks")]
use ark_bn254::{Fr as Bn254Fr, G1Affine as Bn254G1Affine, G1Projective as Bn254ArkG1Projective};
use ark_bls12_377::{
G1Projective as Bls12377ArkG1Projective,
G1Affine as Bls12377G1Affine,
Fr as Bls12377Fr
};
#[cfg(feature = "arkworks")]
use ark_ec::scalar_mul::variable_base::VariableBaseMSM;
@@ -30,7 +53,7 @@ struct Args {
lower_bound_log_size: u8,
/// Upper bound of MSM sizes to run for
#[arg(short, long, default_value_t = 22)]
#[arg(short, long, default_value_t = 23)]
upper_bound_log_size: u8,
}
@@ -44,26 +67,23 @@ fn main() {
let upper_points = CurveCfg::generate_random_affine_points(upper_size);
let g2_upper_points = G2CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars = ScalarCfg::generate_random(upper_size);
println!("Generating random inputs on host for bls12377...");
let upper_points_bls12377 = BLS12377CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars_bls12377 = BLS12377ScalarCfg::generate_random(upper_size);
for i in lower_bound..=upper_bound {
let log_size = i;
let size = 1 << log_size;
println!(
"---------------------- MSM size 2^{}={} ------------------------",
log_size, size
);
println!("---------------------- MSM size 2^{}={} ------------------------", log_size, size);
// Setting Bn254 points and scalars
let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
// Setting bls12377 points and scalars
// let points_bls12377 = &upper_points_bls12377[..size];
let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); // &upper_points_bls12377[..size];
let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
println!("Configuring bn254 MSM...");
@@ -71,24 +91,18 @@ fn main() {
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream = CudaStream::create().unwrap();
let g2_stream = CudaStream::create().unwrap();
let mut cfg = msm::MSMConfig::default();
let mut g2_cfg = msm::MSMConfig::default();
cfg.ctx
.stream = &stream;
g2_cfg
.ctx
.stream = &g2_stream;
let mut cfg = msm::get_default_msm_config::<CurveCfg>();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
cfg.ctx.stream = &stream;
g2_cfg.ctx.stream = &g2_stream;
cfg.is_async = true;
g2_cfg.is_async = true;
println!("Configuring bls12377 MSM...");
let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> =
HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = msm::MSMConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
cfg_bls12377.ctx.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
println!("Executing bn254 MSM on device...");
@@ -96,37 +110,22 @@ fn main() {
let start = Instant::now();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("ICICLE BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
println!("Executing bls12377 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(
&scalars_bls12377,
&points_bls12377,
&cfg_bls12377,
&mut msm_results_bls12377,
)
.unwrap();
msm::msm(&scalars_bls12377, &points_bls12377, &cfg_bls12377, &mut msm_results_bls12377 ).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
println!("Moving results to host..");
let mut msm_host_result = vec![G1Projective::zero(); 1];
let mut g2_msm_host_result = vec![G2Projective::zero(); 1];
let mut msm_host_result_bls12377 = vec![BLS12377G1Projective::zero(); 1];
stream
.synchronize()
.unwrap();
@@ -141,7 +140,7 @@ fn main() {
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
println!("G2 bn254 result: {:#?}", g2_msm_host_result);
stream_bls12377
.synchronize()
.unwrap();
@@ -149,70 +148,37 @@ fn main() {
.copy_to_host(&mut msm_host_result_bls12377[..])
.unwrap();
println!("bls12377 result: {:#?}", msm_host_result_bls12377);
#[cfg(feature = "arkworks")]
{
println!("Checking against arkworks...");
let ark_points: Vec<Bn254G1Affine> = points
.as_slice()
.iter()
.map(|&point| point.to_ark())
.collect();
let ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let ark_points: Vec<Bn254G1Affine> = points.as_slice().iter().map(|&point| point.to_ark()).collect();
let ark_scalars: Vec<Bn254Fr> = scalars.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
.as_slice()
.iter()
.map(|point| point.to_ark())
.collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377.as_slice().iter().map(|point| point.to_ark()).collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
#[cfg(feature = "profile")]
let start = Instant::now();
let bn254_ark_msm_res = Bn254ArkG1Projective::msm(&ark_points, &ark_scalars).unwrap();
println!("Arkworks Bn254 result: {:#?}", bn254_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Ark BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
#[cfg(feature = "profile")]
let start = Instant::now();
let bls12377_ark_msm_res =
Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
let bls12377_ark_msm_res = Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
println!("Arkworks Bls12377 result: {:#?}", bls12377_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Ark BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
let bn254_icicle_msm_res_as_ark = msm_host_result[0].to_ark();
let bls12377_icicle_msm_res_as_ark = msm_host_result_bls12377[0].to_ark();
println!(
"Bn254 MSM is correct: {}",
bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark)
);
println!(
"Bls12377 MSM is correct: {}",
bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark)
);
println!("Bn254 MSM is correct: {}", bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark));
println!("Bls12377 MSM is correct: {}", bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark));
}
println!("Cleaning up bn254...");
stream
.destroy()

View File

@@ -1,20 +1,20 @@
[package]
name = "ntt"
version = "1.2.0"
version = "1.0.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core", features = ["arkworks"] }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["arkworks"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377", features = ["arkworks"] }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.1.0", features = ["arkworks"] }
ark-ff = { version = "0.4.0" }
ark-poly = "0.4.0"
ark-std = "0.4.0"
ark-bn254 = { version = "0.4.0" }
ark-bls12-377 = { version = "0.4.0" }
clap = { version = "<=4.4.12", features = ["derive"] }
clap = { version = "4.4.12", features = ["derive"] }
[features]
profile = []

View File

@@ -1,18 +1,28 @@
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_bn254::curve::{
ScalarCfg,
ScalarField,
};
use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};
use icicle_bls12_377::curve::{
ScalarCfg as BLS12377ScalarCfg,
ScalarField as BLS12377ScalarField
};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_cuda_runtime::{
stream::CudaStream,
memory::HostOrDeviceSlice,
device_context::get_default_device_context
};
use icicle_core::{
ntt::{self, NTT},
traits::{FieldImpl, GenerateRandom},
traits::{GenerateRandom, FieldImpl}
};
use icicle_core::traits::ArkConvertible;
use ark_bls12_377::Fr as Bls12377Fr;
use ark_bn254::Fr as Bn254Fr;
use ark_bls12_377::Fr as Bls12377Fr;
use ark_ff::FftField;
use ark_poly::{EvaluationDomain, Radix2EvaluationDomain};
use ark_std::cmp::{Ord, Ordering};
@@ -35,52 +45,37 @@ fn main() {
println!("Running Icicle Examples: Rust NTT");
let log_size = args.size;
let size = 1 << log_size;
println!(
"---------------------- NTT size 2^{}={} ------------------------",
log_size, size
);
println!("---------------------- NTT size 2^{}={} ------------------------", log_size, size);
// Setting Bn254 points and scalars
println!("Generating random inputs on host for bn254...");
let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
// Setting bls12377 points and scalars
println!("Generating random inputs on host for bls12377...");
let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> =
HostOrDeviceSlice::cuda_malloc(size).unwrap();
let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
println!("Setting up bn254 Domain...");
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
)
.unwrap();
let ctx = DeviceContext::default();
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
let ctx = get_default_device_context();
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bn254 NTT...");
let stream = CudaStream::create().unwrap();
let mut cfg = ntt::NTTConfig::default();
cfg.ctx
.stream = &stream;
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.ctx.stream = &stream;
cfg.is_async = true;
println!("Setting up bls12377 Domain...");
let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
)
.unwrap();
let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
// reusing ctx from above
BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bls12377 NTT...");
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = ntt::NTTConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
let mut cfg_bls12377 = ntt::get_default_ntt_config::<BLS12377ScalarField>();
cfg_bls12377.ctx.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
println!("Executing bn254 NTT on device...");
@@ -88,30 +83,14 @@ fn main() {
let start = Instant::now();
ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
start
.elapsed()
.as_micros()
);
println!("ICICLE BN254 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
println!("Executing bls12377 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
&scalars_bls12377,
ntt::NTTDir::kForward,
&cfg_bls12377,
&mut ntt_results_bls12377,
)
.unwrap();
ntt::ntt(&scalars_bls12377, ntt::NTTDir::kForward, &cfg_bls12377, &mut ntt_results_bls12377).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs",
start
.elapsed()
.as_micros()
);
println!("ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
println!("Moving results to host..");
stream
@@ -121,7 +100,7 @@ fn main() {
ntt_results
.copy_to_host(&mut host_bn254_results[..])
.unwrap();
stream_bls12377
.synchronize()
.unwrap();
@@ -129,43 +108,25 @@ fn main() {
ntt_results_bls12377
.copy_to_host(&mut host_bls12377_results[..])
.unwrap();
println!("Checking against arkworks...");
let mut ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let mut ark_scalars: Vec<Bn254Fr> = scalars.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.as_slice().iter().map(|scalar| scalar.to_ark()).collect();
let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
#[cfg(feature = "profile")]
let start = Instant::now();
bn254_domain.fft_in_place(&mut ark_scalars);
#[cfg(feature = "profile")]
println!(
"Ark BN254 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Ark BN254 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
#[cfg(feature = "profile")]
let start = Instant::now();
bls12_377_domain.fft_in_place(&mut ark_scalars_bls12377);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Ark BLS12377 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
host_bn254_results
.iter()
@@ -174,7 +135,7 @@ fn main() {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bn254 NTT is correct");
host_bls12377_results
.iter()
.zip(ark_scalars_bls12377.iter())
@@ -183,7 +144,7 @@ fn main() {
});
println!("Bls12377 NTT is correct");
println!("Cleaning up bn254...");
stream
.destroy()

View File

@@ -1,5 +1,5 @@
{
"name": "Icicle Examples: rust poseidon",
"name": "Icicle Examples: rust poseidon hash",
"build": {
"dockerfile": "Dockerfile"
},

View File

@@ -1,14 +1,14 @@
[package]
name = "poseidon"
version = "1.4.0"
name = "posedion"
version = "1.0.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bls12-381 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-381" }
clap = { version = "<=4.4.12", features = ["derive"] }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.2.0" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.2.0", features = ["arkworks"] }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.2.0", features = ["arkworks"] }
icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.2.0", features = ["arkworks"] }
icicle-bls12-381 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v1.2.0", features = ["arkworks"] }
[features]
profile = []

View File

@@ -0,0 +1,50 @@
# ICICLE example: Poseidon hash in Rust
## Key-Takeaway
`ICICLE` provides Rust bindings to a CUDA-accelerated C++ implementation of [Poseidon hash](https://github.com/ingonyama-zk/ingopedia/blob/9f602aae051100ee4c60791db5c6fa23d01e1f79/src/hashzk.md?plain=1#L30).
## Best Practices
To save time setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust
poseidon::poseidon_hash_many<F>(
input: &mut HostOrDeviceSlice<F>, // a pointer to a vector of input data
output: &mut HostOrDeviceSlice<F>, // a pointer to a vector of output data,
number_of_states: u32, // number of input blocks of size `arity`
arity: u32, // the arity of the hash function
constants: &PoseidonConstants<F>, // Poseidon constants
config: &PoseidonConfig, // config used to specify extra arguments of the Poseidon
) -> IcicleResult<()>
```
In this example we use the `BLS12381` scalar field.
## What's in this example
1. Load optimized Poseidon hash constants.
2. Generate custom Poseidon hash constants.
3. Configure Poseidon hash to use inputs and outputs on device.
4. Execute Poseidon hash on-device (see the sketch below).
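Putting these steps together, a minimal host-side sketch might look as follows. This is an illustration under assumptions, not the example's exact code: it assumes the v1.x API named above (`load_optimized_poseidon_constants`, `poseidon_hash_many`, `PoseidonConfig`), keeps inputs and outputs on the host for brevity, and uses arbitrary input values and sizes. Depending on your ICICLE version the device context is obtained via `DeviceContext::default()` or `get_default_device_context()`.
```rust
use icicle_bls12_381::curve::ScalarField as F;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;

fn main() {
    let arity = 2u32;
    let number_of_states = 1u32 << 10; // 1024 hashes; arbitrary size for illustration
    // Load the precomputed round constants for this arity
    let ctx = DeviceContext::default();
    let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
    let config = PoseidonConfig::default();
    // Each state holds `arity` input elements; one output element per state
    let mut inputs = HostOrDeviceSlice::Host(vec![F::one(); (number_of_states * arity) as usize]);
    let mut outputs = HostOrDeviceSlice::Host(vec![F::zero(); number_of_states as usize]);
    poseidon_hash_many::<F>(&mut inputs, &mut outputs, number_of_states, arity, &constants, &config).unwrap();
    println!("first digest: {:?}", &outputs.as_slice()[0]);
}
```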
Running the example:
```sh
cargo run --release
```
You can add the `--features profile` flag to measure the hash execution time.
> [!NOTE]
> The default size is 2^20. You can change this by passing the `--size <size>` option. To change the size to 2^23, run the example like this:
```sh
cargo run --release -- -s 23
```
## Benchmarks
TODO

View File

@@ -1,10 +1,8 @@
use icicle_bls12_381::curve::ScalarField as F;
use icicle_bls12_381::poseidon;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::device_context::get_default_device_context;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
#[cfg(feature = "profile")]
use std::time::Instant;
@@ -19,14 +17,13 @@ struct Args {
}
fn main() {
let args = Args::parse();
let size = args.size;
let test_size = 1 << size;
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
let ctx = DeviceContext::default();
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
@@ -50,4 +47,4 @@ fn main() {
.unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
}

18
go.mod
View File

@@ -3,19 +3,15 @@ module github.com/ingonyama-zk/icicle
go 1.20
require (
github.com/consensys/gnark-crypto v0.12.1
github.com/stretchr/testify v1.8.2
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.1.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
require (
github.com/bits-and-blooms/bitset v1.7.0 // indirect
github.com/consensys/bavard v0.1.13 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mmcloughlin/addchain v0.4.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
golang.org/x/sys v0.9.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
github.com/consensys/bavard v0.1.13
github.com/stretchr/testify v1.8.3
rsc.io/tmplfunc v0.0.3 // indirect
)

36
go.sum
View File

@@ -1,37 +1,19 @@
github.com/bits-and-blooms/bitset v1.7.0 h1:YjAGVd3XmtK9ktAbX8Zg2g2PwLIMjGREZJHlV4j7NEo=
github.com/bits-and-blooms/bitset v1.7.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ=
github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI=
github.com/consensys/gnark-crypto v0.12.1 h1:lHH39WuuFgVHONRl3J0LRBtuYdQTumFSDtJF7HpyG8M=
github.com/consensys/gnark-crypto v0.12.1/go.mod h1:v2Gy7L/4ZRosZ7Ivs+9SfUDr0f5UlG+EM5t7MPHiLuY=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
github.com/mmcloughlin/addchain v0.4.0 h1:SobOdjm2xLj1KkXN5/n0xTIWyZA2+s99UCY1iPfkHRY=
github.com/mmcloughlin/addchain v0.4.0/go.mod h1:A86O+tHqZLMNO4w6ZZ4FlVQEadcoqkyU72HC5wJ4RlU=
github.com/mmcloughlin/profile v0.1.1/go.mod h1:IhHD7q1ooxgwTgjxQYkACGA77oFTDdFVejUS1/tS/qU=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU=

34
goicicle/Makefile Normal file
View File

@@ -0,0 +1,34 @@
CUDA_ROOT_DIR = /usr/local/cuda
NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
CFLAGS = -Xcompiler -fPIC -std=c++17
LDFLAGS = -shared
FEATURES = -DG2_DEFINED
TARGET_BN254 = libbn254.so
TARGET_BW6761 = libbw6761.so
TARGET_BLS12_381 = libbls12_381.so
TARGET_BLS12_377 = libbls12_377.so
VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381:../icicle/curves/bw6_761
SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu
SRCS_BW6761 = lde.cu msm.cu projective.cu ve_mod_mult.cu
SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu
SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu
all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)
$(TARGET_BN254):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@
$(TARGET_BW6761):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bw6_761/, $(SRCS_BW6761)) -o $@
$(TARGET_BLS12_381):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@
$(TARGET_BLS12_377):
$(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@
clean:
rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) $(TARGET_BW6761)

82
goicicle/README.md Normal file
View File

@@ -0,0 +1,82 @@
# Golang Bindings
To build the shared libraries for all supported curves:
```
make all
```
If you wish to build for a specific curve, for example bn254:
```
make libbn254.so
```
The currently supported options are `libbn254.so`, `libbls12_381.so`, `libbls12_377.so` and `libbw6761.so`. The resulting `.so` files are the compiled shared libraries for each curve.
Finally, to allow your system to find the shared libraries:
```
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_shared_libs>
```
## Running golang tests
To run the tests for curve bn254:
```
go test ./goicicle/curves/bn254 -count=1
```
## Cleaning up
If you want to remove the compiled files:
```
make clean
```
This will remove all shared libraries generated by the Makefile.
# How do Golang bindings work?
The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`, `libbw6761.so`) can be imported in your Go project to leverage the GPU-accelerated functionality provided by ICICLE.
2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example of how you can use `cgo` to link them:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6761
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
}
```
Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
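Beyond raw `cgo` calls, the generated curve packages in this repository wrap the C API. As a sketch (assuming your project resolves the package at `github.com/ingonyama-zk/icicle/goicicle/curves/bls12377` — adjust the import path to wherever the package lives in your tree):
```go
package main

import (
	"fmt"

	bls12377 "github.com/ingonyama-zk/icicle/goicicle/curves/bls12377"
)

func main() {
	// Generate a random projective point via the shared library,
	// round-trip it through affine form, and compare.
	var p bls12377.G1ProjectivePoint
	p.Random()

	var a bls12377.G1PointAffine
	a.FromProjective(&p)

	q := a.ToProjective()
	fmt.Println("on curve:", p.IsOnCurve(), "round-trip equal:", p.Eq(q))
}
```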
# Common issues
### Cannot find shared library
In some cases you may encounter the following error, despite exporting the correct `LD_LIBRARY_PATH`.
```
/usr/local/go/pkg/tool/linux_amd64/link: running gcc failed: exit status 1
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
/usr/bin/ld: cannot find -lbn254: No such file or directory
collect2: error: ld returned 1 exit status
```
This is normally fixed by exporting the path to the shared library location in the following way: `export CGO_LDFLAGS="-L/<path_to_shared_lib>/"`

View File

@@ -0,0 +1,328 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"unsafe"
"encoding/binary"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
const SCALAR_SIZE = 8
const BASE_SIZE = 12
type G1ScalarField struct {
S [SCALAR_SIZE]uint32
}
type G1BaseField struct {
S [BASE_SIZE]uint32
}
/*
* BaseField Constructors
*/
func (f *G1BaseField) SetZero() *G1BaseField {
var S [BASE_SIZE]uint32
f.S = S
return f
}
func (f *G1BaseField) SetOne() *G1BaseField {
var S [BASE_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
out := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bls12_377(out, in)
return p
}
func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
copy(f.S[:], limbs[:])
return f
}
/*
* BaseField methods
*/
func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
return f.S
}
func (f *G1BaseField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* ScalarField methods
*/
func (p *G1ScalarField) Random() *G1ScalarField {
outC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(p))
C.random_scalar_bls12_377(outC)
return p
}
func (f *G1ScalarField) SetZero() *G1ScalarField {
var S [SCALAR_SIZE]uint32
f.S = S
return f
}
func (f *G1ScalarField) SetOne() *G1ScalarField {
var S [SCALAR_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
for i, v := range a.S {
if b.S[i] != v {
return false
}
}
return true
}
/*
* ScalarField methods
*/
func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
return f.S
}
func (f *G1ScalarField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* PointBLS12_377
*/
type G1ProjectivePoint struct {
X, Y, Z G1BaseField
}
func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
var yOne G1BaseField
yOne.SetOne()
var xZero G1BaseField
xZero.SetZero()
var zZero G1BaseField
zZero.SetZero()
f.X = xZero
f.Y = yOne
f.Z = zZero
return f
}
func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
// Cast *PointBLS12_377 to *C.BLS12_377_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_377_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_bls12_377(pC, pCompareC))
}
func (p *G1ProjectivePoint) IsOnCurve() bool {
point := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
res := C.projective_is_on_curve_bls12_377(point)
return bool(res)
}
func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(p))
C.random_projective_bls12_377(outC)
return p
}
func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
return &G1PointAffine{
X: p.X,
Y: p.Y,
}
}
func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
var _x G1BaseField
var _y G1BaseField
var _z G1BaseField
_x.FromLimbs(GetFixedLimbs(x))
_y.FromLimbs(GetFixedLimbs(y))
_z.FromLimbs(GetFixedLimbs(z))
p.X = _x
p.Y = _y
p.Z = _z
return p
}
/*
* PointAffineNoInfinityBLS12_377
*/
type G1PointAffine struct {
X, Y G1BaseField
}
func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
in := (*C.BLS12_377_projective_t)(unsafe.Pointer(projective))
out := (*C.BLS12_377_affine_t)(unsafe.Pointer(p))
C.projective_to_affine_bls12_377(out, in)
return p
}
func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
var Z G1BaseField
Z.SetOne()
return &G1ProjectivePoint{
X: p.X,
Y: p.Y,
Z: Z,
}
}
func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
var _x G1BaseField
var _y G1BaseField
_x.FromLimbs(GetFixedLimbs(X))
_y.FromLimbs(GetFixedLimbs(Y))
p.X = _x
p.Y = _y
return p
}
/*
* Multiplication
*/
func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
pointsC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&a[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_point_bls12_377(pointsC, scalarsC, nElementsC, deviceIdC)
}
func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_scalar_bls12_377(aC, bC, nElementsC, deviceIdC)
}
// Multiply a flattened matrix by a vector:
//
// `a` - flattened matrix;
// `b` - vector to multiply `a` by;
func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) {
c := make([]G1ScalarField, len(b))
for i := range c {
var p G1ScalarField
p.SetZero()
c[i] = p
}
aC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&b[0]))
cC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&c[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.matrix_vec_mod_mult_bls12_377(aC, bC, cC, nElementsC, deviceIdC)
}
/*
* Utils
*/
func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
if len(*slice) <= BASE_SIZE {
limbs := [BASE_SIZE]uint32{}
copy(limbs[:len(*slice)], *slice)
return limbs
}
panic("slice has too many elements")
}

View File

@@ -0,0 +1,198 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"encoding/binary"
"testing"
"github.com/stretchr/testify/assert"
)
func TestNewFieldBLS12_377One(t *testing.T) {
var oneField G1BaseField
oneField.SetOne()
rawOneField := [12]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} // BASE_SIZE = 12 limbs
assert.Equal(t, oneField.S, rawOneField)
}
func TestNewFieldBLS12_377Zero(t *testing.T) {
var zeroField G1BaseField
zeroField.SetZero()
rawZeroField := [12]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} // BASE_SIZE = 12 limbs
assert.Equal(t, zeroField.S, rawZeroField)
}
func TestFieldBLS12_377ToBytesLe(t *testing.T) {
var p G1ProjectivePoint
p.Random()
expected := make([]byte, len(p.X.S)*4) // each uint32 takes 4 bytes
for i, v := range p.X.S {
binary.LittleEndian.PutUint32(expected[i*4:], v)
}
assert.Equal(t, p.X.ToBytesLe(), expected)
assert.Equal(t, len(p.X.ToBytesLe()), 48) // 12 limbs * 4 bytes each
}
func TestNewPointBLS12_377Zero(t *testing.T) {
var pointZero G1ProjectivePoint
pointZero.SetZero()
var baseOne G1BaseField
baseOne.SetOne()
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, pointZero.X, zeroSanity)
assert.Equal(t, pointZero.Y, baseOne)
assert.Equal(t, pointZero.Z, zeroSanity)
}
func TestFromProjectiveToAffine(t *testing.T) {
var projective G1ProjectivePoint
var affine G1PointAffine
projective.Random()
affine.FromProjective(&projective)
var projective2 G1ProjectivePoint
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestBLS12_377Eq(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
var p2 G1ProjectivePoint
p2.Random()
assert.Equal(t, p1.Eq(&p1), true)
assert.Equal(t, p1.Eq(&p2), false)
}
func TestBLS12_377StripZ(t *testing.T) {
var p1 G1ProjectivePoint
p1.Random()
p2ZLess := p1.StripZ()
assert.IsType(t, G1PointAffine{}, *p2ZLess)
assert.Equal(t, p1.X, p2ZLess.X)
assert.Equal(t, p1.Y, p2ZLess.Y)
}
func TestPointBLS12_377fromLimbs(t *testing.T) {
var p G1ProjectivePoint
p.Random()
x := p.X.Limbs()
y := p.Y.Limbs()
z := p.Z.Limbs()
xSlice := x[:]
ySlice := y[:]
zSlice := z[:]
var pFromLimbs G1ProjectivePoint
pFromLimbs.FromLimbs(&xSlice, &ySlice, &zSlice)
assert.Equal(t, pFromLimbs, p)
}
func TestNewPointAffineNoInfinityBLS12_377Zero(t *testing.T) {
var zeroP G1PointAffine
var zeroSanity G1BaseField
zeroSanity.SetZero()
assert.Equal(t, zeroP.X, zeroSanity)
assert.Equal(t, zeroP.Y, zeroSanity)
}
func TestPointAffineNoInfinityBLS12_377FromLimbs(t *testing.T) {
// Initialize your test values
x := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
y := [12]uint32{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
xSlice := x[:]
ySlice := y[:]
// Execute your function
var result G1PointAffine
result.FromLimbs(&xSlice, &ySlice)
var xBase G1BaseField
var yBase G1BaseField
xBase.FromLimbs(x)
yBase.FromLimbs(y)
// Define your expected result
expected := G1PointAffine{
X: xBase,
Y: yBase,
}
// Test if result is as expected
assert.Equal(t, expected, result)
}
func TestGetFixedLimbs(t *testing.T) {
t.Run("case of valid input of length less than BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7}
expected := [12]uint32{1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of valid input of length BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
expected := [12]uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of empty input", func(t *testing.T) {
slice := []uint32{}
expected := [12]uint32{}
result := GetFixedLimbs(&slice)
assert.Equal(t, result, expected)
})
t.Run("case of input length greater than BASE_SIZE", func(t *testing.T) {
slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
defer func() {
if r := recover(); r == nil {
t.Errorf("the code did not panic")
}
}()
GetFixedLimbs(&slice)
})
}

View File

@@ -0,0 +1,102 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"encoding/binary"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
// G2 extension field
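// G2Element is one base-field element of the quadratic extension, stored as
// six 64-bit little-endian limbs (384 bits, enough for the 377-bit base field).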
type G2Element [6]uint64
type ExtentionField struct {
A0, A1 G2Element
}
type G2PointAffine struct {
X, Y ExtentionField
}
type G2Point struct {
X, Y, Z ExtentionField
}
func (p *G2Point) Random() *G2Point {
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
C.random_g2_projective_bls12_377(outC)
return p
}
func (p *G2Point) FromAffine(affine *G2PointAffine) *G2Point {
out := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(affine))
C.g2_projective_from_affine_bls12_377(out, in)
return p
}
func (p *G2Point) Eq(pCompare *G2Point) bool {
// Cast *PointBLS12_377 to *C.BLS12_377_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_g2_bls12_377(pC, pCompareC))
}
func (f *G2Element) ToBytesLe() []byte {
var bytes []byte
for _, val := range f {
buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit
binary.LittleEndian.PutUint64(buf, val)
bytes = append(bytes, buf...)
}
return bytes
}
func (p *G2PointAffine) FromProjective(projective *G2Point) *G2PointAffine {
out := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(p))
in := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(projective))
C.g2_projective_to_affine_bls12_377(out, in)
return p
}
func (p *G2Point) IsOnCurve() bool {
// Directly copy memory from the C struct to the Go struct
point := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(p))
res := C.g2_projective_is_on_curve_bls12_377(point)
return bool(res)
}

View File

@@ -0,0 +1,79 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestG2Eqg2(t *testing.T) {
var point G2Point
point.Random()
assert.True(t, point.Eq(&point))
}
func TestG2FromProjectiveToAffine(t *testing.T) {
var projective G2Point
projective.Random()
var affine G2PointAffine
affine.FromProjective(&projective)
var projective2 G2Point
projective2.FromAffine(&affine)
assert.True(t, projective.IsOnCurve())
assert.True(t, projective2.IsOnCurve())
assert.True(t, projective.Eq(&projective2))
}
func TestG2Eqg2NotEqual(t *testing.T) {
var point G2Point
point.Random()
var point2 G2Point
point2.Random()
assert.False(t, point.Eq(&point2))
}
func TestG2ToBytes(t *testing.T) {
element := G2Element{0x6546098ea84b6298, 0x4a384533d1f68aca, 0xaa0666972d771336, 0x1569e4a34321993}
bytes := element.ToBytesLe()
assert.Equal(t, bytes, []byte{0x98, 0x62, 0x4b, 0xa8, 0x8e, 0x9, 0x46, 0x65, 0xca, 0x8a, 0xf6, 0xd1, 0x33, 0x45, 0x38, 0x4a, 0x36, 0x13, 0x77, 0x2d, 0x97, 0x66, 0x6, 0xaa, 0x93, 0x19, 0x32, 0x34, 0x4a, 0x9e, 0x56, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) // 6 limbs * 8 bytes; the two zero limbs serialize as trailing zero bytes
}
func TestG2ShouldConvertToProjective(t *testing.T) {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var pointProjective G2Point
pointProjective.Random()
var pointAffine G2PointAffine
pointAffine.FromProjective(&pointProjective)
var proj G2Point
proj.FromAffine(&pointAffine)
assert.True(t, proj.IsOnCurve())
assert.True(t, pointProjective.Eq(&proj))
}

View File

@@ -0,0 +1,98 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdbool.h>
// msm.h
#ifndef _BLS12_377_MSM_H
#define _BLS12_377_MSM_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_377 projective and affine structs
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
typedef cudaStream_t CudaStream_t;
int msm_cuda_bls12_377(
BLS12_377_projective_t* out, BLS12_377_affine_t* points, BLS12_377_scalar_t* scalars, size_t count, size_t device_id);
int msm_batch_cuda_bls12_377(
BLS12_377_projective_t* out,
BLS12_377_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id);
int msm_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* out,
BLS12_377_g2_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t count,
size_t device_id);
int msm_batch_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* out,
BLS12_377_g2_affine_t* points,
BLS12_377_scalar_t* scalars,
size_t batch_size,
size_t msm_size,
size_t device_id);
int commit_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_g2_affine_t* d_points,
size_t count,
unsigned large_bucket_factor,
size_t device_id);
int commit_batch_g2_cuda_bls12_377(
BLS12_377_g2_projective_t* d_out,
BLS12_377_scalar_t* d_scalars,
BLS12_377_g2_affine_t* d_points,
size_t count,
size_t batch_size,
size_t device_id,
cudaStream_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_MSM_H */

View File

@@ -0,0 +1,195 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ntt.h
#ifndef _BLS12_377_NTT_H
#define _BLS12_377_NTT_H
#ifdef __cplusplus
extern "C" {
#endif
// Incomplete declaration of BLS12_377 projective and affine structs
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
int ntt_cuda_bls12_377(BLS12_377_scalar_t* arr, uint32_t n, bool inverse, size_t device_id);
int ntt_batch_cuda_bls12_377(
BLS12_377_scalar_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
int ecntt_cuda_bls12_377(BLS12_377_projective_t* arr, uint32_t n, bool inverse, size_t device_id);
int ecntt_batch_cuda_bls12_377(
BLS12_377_projective_t* arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id);
BLS12_377_scalar_t*
build_domain_cuda_bls12_377(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream);
int interpolate_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned device_id,
size_t stream);
int interpolate_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_points_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
size_t device_id,
size_t stream);
int interpolate_points_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int interpolate_scalars_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int interpolate_scalars_batch_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_evaluations,
BLS12_377_scalar_t* d_domain,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned device_id,
size_t stream);
int evaluate_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_points_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
size_t device_id,
size_t stream);
int evaluate_points_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
size_t device_id,
size_t stream);
int evaluate_scalars_on_coset_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_377_scalar_t* coset_powers,
unsigned device_id,
size_t stream);
int evaluate_scalars_on_coset_batch_cuda_bls12_377(
BLS12_377_scalar_t* d_out,
BLS12_377_scalar_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int evaluate_points_on_coset_batch_cuda_bls12_377(
BLS12_377_projective_t* d_out,
BLS12_377_projective_t* d_coefficients,
BLS12_377_scalar_t* d_domain,
unsigned domain_size,
unsigned n,
unsigned batch_size,
BLS12_377_scalar_t* coset_powers,
size_t device_id,
size_t stream);
int reverse_order_scalars_cuda_bls12_377(BLS12_377_scalar_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_scalars_batch_cuda_bls12_377(
BLS12_377_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int reverse_order_points_cuda_bls12_377(BLS12_377_projective_t* arr, int n, size_t device_id, size_t stream);
int reverse_order_points_batch_cuda_bls12_377(
BLS12_377_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream);
int add_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
int sub_scalars_cuda_bls12_377(
BLS12_377_scalar_t* d_out, BLS12_377_scalar_t* d_in1, BLS12_377_scalar_t* d_in2, unsigned n, size_t stream);
int to_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
int from_montgomery_scalars_cuda_bls12_377(BLS12_377_scalar_t* d_inout, unsigned n, size_t stream);
// points g1
int to_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_cuda_bls12_377(BLS12_377_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_cuda_bls12_377(BLS12_377_affine_t* d_inout, unsigned n, size_t stream);
// points g2
int to_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
int from_montgomery_proj_points_g2_cuda_bls12_377(BLS12_377_g2_projective_t* d_inout, unsigned n, size_t stream);
int to_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
int from_montgomery_aff_points_g2_cuda_bls12_377(BLS12_377_g2_affine_t* d_inout, unsigned n, size_t stream);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_NTT_H */
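
To make the interpolate/evaluate entry points concrete, here is a hedged Go sketch built on the wrappers that appear later in this diff (GenerateTwiddles, Interpolate, Evaluate from ntt.go, plus the goicicle memory helpers used in the tests). It assumes inverse twiddles drive interpolation and forward twiddles drive evaluation; whether an explicit order reversal (ReverseScalars) is also required depends on the kernels' data layout:

	n := 1 << 10
	evals := make([]G1ScalarField, n)
	for i := range evals {
		evals[i].Random() // Random is assumed to mirror the bls12381 types later in this diff
	}
	bytes := n * 32 // 8 limbs of 4 bytes per scalar
	evalsD, _ := goicicle.CudaMalloc(bytes)
	goicicle.CudaMemCpyHtoD[G1ScalarField](evalsD, evals, bytes)
	invTw, err := GenerateTwiddles(n, 10, true) // inverse twiddles for interpolation
	if err != nil {
		panic(err)
	}
	fwdTw, err := GenerateTwiddles(n, 10, false) // forward twiddles for evaluation
	if err != nil {
		panic(err)
	}
	coeffsD := Interpolate(evalsD, invTw, nil, n, false) // evaluations -> coefficients
	outD, _ := goicicle.CudaMalloc(bytes)
	Evaluate(outD, coeffsD, fwdTw, nil, n, n, false) // coefficients -> evaluations
	goicicle.CudaMemCpyDtoH[G1ScalarField](evals, outD, bytes)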

View File

@@ -0,0 +1,50 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// projective.h
#ifndef _BLS12_377_PROJECTIVE_H
#define _BLS12_377_PROJECTIVE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_g2_projective_t BLS12_377_g2_projective_t;
typedef struct BLS12_377_affine_t BLS12_377_affine_t;
typedef struct BLS12_377_g2_affine_t BLS12_377_g2_affine_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
bool projective_is_on_curve_bls12_377(BLS12_377_projective_t* point1);
int random_scalar_bls12_377(BLS12_377_scalar_t* out);
int random_projective_bls12_377(BLS12_377_projective_t* out);
BLS12_377_projective_t* projective_zero_bls12_377();
int projective_to_affine_bls12_377(BLS12_377_affine_t* out, BLS12_377_projective_t* point1);
int projective_from_affine_bls12_377(BLS12_377_projective_t* out, BLS12_377_affine_t* point1);
int random_g2_projective_bls12_377(BLS12_377_g2_projective_t* out);
int g2_projective_to_affine_bls12_377(BLS12_377_g2_affine_t* out, BLS12_377_g2_projective_t* point1);
int g2_projective_from_affine_bls12_377(BLS12_377_g2_projective_t* out, BLS12_377_g2_affine_t* point1);
bool g2_projective_is_on_curve_bls12_377(BLS12_377_g2_projective_t* point1);
bool eq_bls12_377(BLS12_377_projective_t* point1, BLS12_377_projective_t* point2);
bool eq_g2_bls12_377(BLS12_377_g2_projective_t* point1, BLS12_377_g2_projective_t* point2);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_PROJECTIVE_H */

View File

@@ -0,0 +1,49 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
#include <cuda.h>
#include <stdbool.h>
// ve_mod_mult.h
#ifndef _BLS12_377_VEC_MULT_H
#define _BLS12_377_VEC_MULT_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct BLS12_377_projective_t BLS12_377_projective_t;
typedef struct BLS12_377_scalar_t BLS12_377_scalar_t;
int32_t vec_mod_mult_point_bls12_377(
BLS12_377_projective_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_scalar_bls12_377(
BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t vec_mod_mult_device_scalar_bls12_377(
BLS12_377_scalar_t* inout, BLS12_377_scalar_t* scalar_vec, size_t n_elements, size_t device_id);
int32_t matrix_vec_mod_mult_bls12_377(
BLS12_377_scalar_t* matrix_flattened,
BLS12_377_scalar_t* input,
BLS12_377_scalar_t* output,
size_t n_elements,
size_t device_id);
#ifdef __cplusplus
}
#endif
#endif /* _BLS12_377_VEC_MULT_H */

View File

@@ -0,0 +1,209 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"errors"
"fmt"
"unsafe"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "msm.h"
import "C"
func Msm(out *G1ProjectivePoint, points []G1PointAffine, scalars []G1ScalarField, device_id int) (*G1ProjectivePoint, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(out))
ret := C.msm_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmG2(out *G2Point, points []G2PointAffine, scalars []G1ScalarField, device_id int) (*G2Point, error) {
if len(points) != len(scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&points[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&scalars[0]))
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(out))
ret := C.msm_g2_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id))
if ret != 0 {
return nil, fmt.Errorf("msm_g2_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmBatch(points *[]G1PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G1ProjectivePoint, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G1ProjectivePoint, batchSize)
for i := 0; i < len(out); i++ {
var p G1ProjectivePoint
p.SetZero()
out[i] = p
}
outC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_377_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func MsmG2Batch(points *[]G2PointAffine, scalars *[]G1ScalarField, batchSize, deviceId int) ([]G2Point, error) {
// Check for nil pointers
if points == nil || scalars == nil {
return nil, errors.New("points or scalars is nil")
}
if len(*points) != len(*scalars) {
return nil, errors.New("error on: len(points) != len(scalars)")
}
// Check for empty slices
if len(*points) == 0 || len(*scalars) == 0 {
return nil, errors.New("points or scalars is empty")
}
// Check for zero batchSize
if batchSize <= 0 {
return nil, errors.New("error on: batchSize must be greater than zero")
}
out := make([]G2Point, batchSize)
outC := (*C.BLS12_377_g2_projective_t)(unsafe.Pointer(&out[0]))
pointsC := (*C.BLS12_377_g2_affine_t)(unsafe.Pointer(&(*points)[0]))
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
msmSizeC := C.size_t(len(*points) / batchSize)
deviceIdC := C.size_t(deviceId)
batchSizeC := C.size_t(batchSize)
ret := C.msm_batch_g2_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC)
if ret != 0 {
return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret)
}
return out, nil
}
func Commit(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_377_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count, bucketFactor int) int {
d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
countC := (C.size_t)(count)
largeBucketFactorC := C.uint(bucketFactor)
ret := C.commit_g2_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, largeBucketFactorC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_377_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
ret := C.commit_batch_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0)
if ret != 0 {
return -1
}
return 0
}
func CommitG2Batch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int {
d_outC := (*C.BLS12_377_g2_projective_t)(d_out)
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
countC := (C.size_t)(count)
batch_sizeC := (C.size_t)(batch_size)
// nil selects the default CUDA stream
ret := C.commit_batch_g2_cuda_bls12_377(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0, nil)
if ret != 0 {
return -1
}
return 0
}
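
A minimal in-package sketch of driving Msm above. The G1 types and their Random, FromProjective, and IsOnCurve methods are assumed to mirror the bls12381 projective.go shown later in this diff:

	size := 1 << 10
	points := make([]G1PointAffine, size)
	scalars := make([]G1ScalarField, size)
	for i := 0; i < size; i++ {
		var p G1ProjectivePoint
		p.Random()
		points[i].FromProjective(&p)
		scalars[i].Random()
	}
	out := new(G1ProjectivePoint)
	if _, err := Msm(out, points, scalars, 0); err != nil {
		panic(err)
	}
	// out now holds the sum of scalars[i] * points[i]
	fmt.Println("MSM result on curve:", out.IsOnCurve())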

View File

@@ -0,0 +1,360 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"math"
"testing"
"time"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
"github.com/stretchr/testify/assert"
)
func GeneratePoints(count int) []G1PointAffine {
// Declare a slice of affine points
var points []G1PointAffine
// generate 10 random base points, then repeatedly double the slice to reach count
for i := 0; i < 10; i++ {
var pointProjective G1ProjectivePoint
pointProjective.Random()
var pointAffine G1PointAffine
pointAffine.FromProjective(&pointProjective)
points = append(points, pointAffine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func GeneratePointsProj(count int) []G1ProjectivePoint {
// Declare a slice of projective points
var points []G1ProjectivePoint
// Use a loop to populate the slice
for i := 0; i < count; i++ {
var p G1ProjectivePoint
p.Random()
points = append(points, p)
}
return points
}
func GenerateScalars(count int, skewed bool) []G1ScalarField {
// Declare a slice of scalar field elements
var scalars []G1ScalarField
var rand G1ScalarField
var zero G1ScalarField
var one G1ScalarField
var randLarge G1ScalarField
zero.SetZero()
one.SetOne()
randLarge.Random()
if skewed && count > 1_200_000 {
for i := 0; i < count-1_200_000; i++ {
rand.Random()
scalars = append(scalars, rand)
}
for i := 0; i < 600_000; i++ {
scalars = append(scalars, randLarge)
}
for i := 0; i < 400_000; i++ {
scalars = append(scalars, zero)
}
for i := 0; i < 200_000; i++ {
scalars = append(scalars, one)
}
} else {
for i := 0; i < count; i++ {
rand.Random()
scalars = append(scalars, rand)
}
}
return scalars[:count]
}
func TestMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G1ProjectivePoint)
startTime := time.Now()
_, e := Msm(out, points, scalars, 0) // non mont
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func TestCommitMSM(t *testing.T) {
for _, v := range []int{8} {
count := 1<<v - 1
points := GeneratePoints(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
// Derive buffer sizes from the Go struct layouts: the BLS12-377 base field
// has 12 32-bit limbs, so affine points are 96 bytes and projective points 144.
var sizeCheckProjective G1ProjectivePoint
projBytes := int(unsafe.Sizeof(sizeCheckProjective))
out_d, _ := goicicle.CudaMalloc(projBytes)
var sizeCheckAffine G1PointAffine
pointsBytes := count * int(unsafe.Sizeof(sizeCheckAffine))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := Commit(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G1ProjectivePoint, 1)
goicicle.CudaMemCpyDtoH[G1ProjectivePoint](outHost, out_d, projBytes)
assert.Equal(t, e, 0, "error should be 0")
assert.True(t, outHost[0].IsOnCurve())
}
}
func BenchmarkCommit(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
// Buffer sizes derived from the struct layouts, as in TestCommitMSM above
var sizeCheckProjective G1ProjectivePoint
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckProjective)))
var sizeCheckAffine G1PointAffine
pointsBytes := msmSize * int(unsafe.Sizeof(sizeCheckAffine))
points_d, _ := goicicle.CudaMalloc(pointsBytes)
goicicle.CudaMemCpyHtoD[G1PointAffine](points_d, points, pointsBytes)
scalarBytes := msmSize * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
e := Commit(out_d, scalars_d, points_d, msmSize, 10)
if e != 0 {
panic("Error occurred")
}
}
})
}
}
func TestBatchMSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GeneratePoints(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmBatch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}
func BenchmarkMSM(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GeneratePoints(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G1ProjectivePoint)
_, e := Msm(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
// G2
func GenerateG2Points(count int) []G2PointAffine {
// Declare a slice of G2 affine points
var points []G2PointAffine
// generate 10 random base points, then repeatedly double the slice to reach count
for i := 0; i < 10; i++ {
fmt.Print() // this prevents the test from hanging. TODO: figure out why
var p G2Point
p.Random()
var affine G2PointAffine
affine.FromProjective(&p)
points = append(points, affine)
}
log2_10 := math.Log2(10)
log2Count := math.Log2(float64(count))
log2Size := int(math.Ceil(log2Count - log2_10))
for i := 0; i < log2Size; i++ {
points = append(points, points...)
}
return points[:count]
}
func TestMsmG2BLS12_377(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
assert.Equal(t, e, nil, "error should be nil")
assert.True(t, out.IsOnCurve())
}
}
func BenchmarkMsmG2BLS12_377(b *testing.B) {
LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26}
for _, logMsmSize := range LOG_MSM_SIZES {
msmSize := 1 << logMsmSize
points := GenerateG2Points(msmSize)
scalars := GenerateScalars(msmSize, false)
b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) {
for n := 0; n < b.N; n++ {
out := new(G2Point)
_, e := MsmG2(out, points, scalars, 0)
if e != nil {
panic("Error occurred")
}
}
})
}
}
func TestCommitG2MSM(t *testing.T) {
for _, v := range []int{8} {
count := 1 << v
points := GenerateG2Points(count)
fmt.Print("Finished generating points\n")
scalars := GenerateScalars(count, false)
fmt.Print("Finished generating scalars\n")
var sizeCheckG2PointAffine G2PointAffine
inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine))
var sizeCheckG2Point G2Point
out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point)))
points_d, _ := goicicle.CudaMalloc(inputPointsBytes)
goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes)
scalarBytes := count * 32
scalars_d, _ := goicicle.CudaMalloc(scalarBytes)
goicicle.CudaMemCpyHtoD[G1ScalarField](scalars_d, scalars, scalarBytes)
startTime := time.Now()
e := CommitG2(out_d, scalars_d, points_d, count, 10)
fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds())
outHost := make([]G2Point, 1)
goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point)))
assert.Equal(t, e, 0, "error should be 0")
assert.Equal(t, len(outHost), 1)
result := outHost[0]
assert.True(t, result.IsOnCurve())
}
}
func TestBatchG2MSM(t *testing.T) {
for _, batchPow2 := range []int{2, 4} {
for _, pow2 := range []int{4, 6} {
msmSize := 1 << pow2
batchSize := 1 << batchPow2
count := msmSize * batchSize
points := GenerateG2Points(count)
scalars := GenerateScalars(count, false)
pointsResults, e := MsmG2Batch(&points, &scalars, batchSize, 0)
if e != nil {
t.Errorf("MsmBatchBLS12_377 returned an error: %v", e)
}
if len(pointsResults) != batchSize {
t.Errorf("Expected length %d, but got %d", batchSize, len(pointsResults))
}
for _, s := range pointsResults {
assert.True(t, s.IsOnCurve())
}
}
}
}

View File

@@ -0,0 +1,222 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"errors"
"fmt"
"unsafe"
"github.com/ingonyama-zk/icicle/goicicle"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "ntt.h"
import "C"
const (
NONE = 0
DIF = 1
DIT = 2
)
func Ntt(scalars *[]G1ScalarField, isInverse bool, deviceId int) uint64 {
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
ret := C.ntt_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(deviceId))
return uint64(ret)
}
func NttBatch(scalars *[]G1ScalarField, isInverse bool, batchSize, deviceId int) uint64 {
scalarsC := (*C.BLS12_377_scalar_t)(unsafe.Pointer(&(*scalars)[0]))
isInverseC := C.bool(isInverse)
batchSizeC := C.uint32_t(batchSize)
deviceIdC := C.size_t(deviceId)
ret := C.ntt_batch_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNtt(values *[]G1ProjectivePoint, isInverse bool, deviceId int) uint64 {
valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
ret := C.ecntt_cuda_bls12_377(valuesC, n, isInverseC, deviceIdC)
return uint64(ret)
}
func EcNttBatch(values *[]G1ProjectivePoint, isInverse bool, batchSize, deviceId int) uint64 {
valuesC := (*C.BLS12_377_projective_t)(unsafe.Pointer(&(*values)[0]))
deviceIdC := C.size_t(deviceId)
isInverseC := C.bool(isInverse)
n := C.uint32_t(len(*values))
batchSizeC := C.uint32_t(batchSize)
ret := C.ecntt_batch_cuda_bls12_377(valuesC, n, batchSizeC, isInverseC, deviceIdC)
return uint64(ret)
}
func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) {
domain_size := C.uint32_t(d_size)
logn := C.uint32_t(log_d_size)
is_inverse := C.bool(inverse)
dp := C.build_domain_cuda_bls12_377(domain_size, logn, is_inverse, 0, 0)
if dp == nil {
err = errors.New("nullptr returned from generating twiddles")
return unsafe.Pointer(nil), err
}
return unsafe.Pointer(dp), nil
}
// Reverses d_scalars in-place
func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.int(len)
if success := C.reverse_order_scalars_cuda_bls12_377(scalarsC, lenC, 0, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer {
size_d := size * 32
dp, err := goicicle.CudaMalloc(size_d)
if err != nil {
return nil
}
d_out := (*C.BLS12_377_scalar_t)(dp)
scalarsC := (*C.BLS12_377_scalar_t)(scalars)
twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
cosetPowersC := (*C.BLS12_377_scalar_t)(cosetPowers)
sizeC := C.uint(size)
var ret C.int
if isCoset {
ret = C.interpolate_scalars_on_coset_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0)
} else {
ret = C.interpolate_scalars_cuda_bls12_377(d_out, scalarsC, twiddlesC, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
return nil
}
return unsafe.Pointer(d_out)
}
func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int {
scalars_outC := (*C.BLS12_377_scalar_t)(scalars_out)
scalarsC := (*C.BLS12_377_scalar_t)(scalars)
twiddlesC := (*C.BLS12_377_scalar_t)(twiddles)
coset_powersC := (*C.BLS12_377_scalar_t)(coset_powers)
sizeC := C.uint(scalars_size)
twiddlesC_size := C.uint(twiddles_size)
var ret C.int
if isCoset {
ret = C.evaluate_scalars_on_coset_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0)
} else {
ret = C.evaluate_scalars_cuda_bls12_377(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0)
}
if ret != 0 {
fmt.Print("error interpolating")
return -1
}
return 0
}
func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.add_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error adding scalar vectors")
return -1
}
return 0
}
func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int {
in1_dC := (*C.BLS12_377_scalar_t)(in1_d)
in2_dC := (*C.BLS12_377_scalar_t)(in2_d)
sizeC := C.uint(size)
ret := C.sub_scalars_cuda_bls12_377(in1_dC, in1_dC, in2_dC, sizeC, 0)
if ret != 0 {
fmt.Print("error subtracting scalar vectors")
return -1
}
return 0
}
func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.to_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) {
scalarsC := (*C.BLS12_377_scalar_t)(d_scalars)
lenC := C.uint(len)
if success := C.from_montgomery_scalars_cuda_bls12_377(scalarsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_377_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
func G2AffinePointFromMontgomery(d_points unsafe.Pointer, len int) (int, error) {
pointsC := (*C.BLS12_377_g2_affine_t)(d_points)
lenC := C.uint(len)
if success := C.from_montgomery_aff_points_g2_cuda_bls12_377(pointsC, lenC, 0); success != 0 {
return -1, errors.New("reversing failed")
}
return 0, nil
}
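
A quick sanity sketch of the in-place Ntt wrapper above: a forward transform followed by an inverse transform should restore the input. Random and Eq on G1ScalarField are assumed to mirror the bls12381 types later in this diff:

	scalars := make([]G1ScalarField, 1<<10)
	for i := range scalars {
		scalars[i].Random()
	}
	original := make([]G1ScalarField, len(scalars))
	copy(original, scalars)
	Ntt(&scalars, false, 0) // forward NTT, in place
	Ntt(&scalars, true, 0)  // inverse NTT should undo it
	for i := range scalars {
		if !scalars[i].Eq(&original[i]) {
			panic("NTT round-trip mismatch")
		}
	}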

View File

@@ -0,0 +1,148 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
import (
"fmt"
"github.com/stretchr/testify/assert"
"reflect"
"testing"
)
func TestNttBLS12_377Batch(t *testing.T) {
count := 1 << 20
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
NttBatch(&nttResult, false, count, 0)
assert.NotEqual(t, nttResult, scalars)
}
func TestNttBLS12_377CompareToGnarkDIF(t *testing.T) {
count := 1 << 2
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
// TODO: compare nttResult against a gnark DIF reference implementation
}
func TestINttBLS12_377CompareToGnarkDIT(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, true, 0)
assert.NotEqual(t, nttResult, scalars)
// TODO: compare nttResult against a gnark DIT reference implementation
}
func TestNttBLS12_377(t *testing.T) {
count := 1 << 3
scalars := GenerateScalars(count, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
assert.Equal(t, nttResult, scalars)
Ntt(&nttResult, false, 0)
assert.NotEqual(t, nttResult, scalars)
inttResult := make([]G1ScalarField, len(nttResult))
copy(inttResult, nttResult)
assert.Equal(t, inttResult, nttResult)
Ntt(&inttResult, true, 0)
assert.Equal(t, inttResult, scalars)
}
func TestNttBatchBLS12_377(t *testing.T) {
count := 1 << 5
batches := 4
scalars := GenerateScalars(count*batches, false)
var scalarVecOfVec [][]G1ScalarField = make([][]G1ScalarField, 0)
for i := 0; i < batches; i++ {
start := i * count
end := (i + 1) * count
batch := make([]G1ScalarField, len(scalars[start:end]))
copy(batch, scalars[start:end])
scalarVecOfVec = append(scalarVecOfVec, batch)
}
nttBatchResult := make([]G1ScalarField, len(scalars))
copy(nttBatchResult, scalars)
NttBatch(&nttBatchResult, false, count, 0)
var nttResultVecOfVec [][]G1ScalarField
for i := 0; i < batches; i++ {
// Clone the slice
clone := make([]G1ScalarField, len(scalarVecOfVec[i]))
copy(clone, scalarVecOfVec[i])
// Add it to the result vector of vectors
nttResultVecOfVec = append(nttResultVecOfVec, clone)
// Call the ntt_bls12_377 function
Ntt(&nttResultVecOfVec[i], false, 0)
}
assert.NotEqual(t, nttBatchResult, scalars)
// Check that the NTT of each vector of scalars matches the corresponding slice of the batched NTT result
for i := 0; i < batches; i++ {
if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) {
t.Errorf("ntt of vec of scalars not equal to intt of specific batch")
}
}
}
func BenchmarkNTT(b *testing.B) {
LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26}
for _, logNTTSize := range LOG_NTT_SIZES {
nttSize := 1 << logNTTSize
b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) {
scalars := GenerateScalars(nttSize, false)
nttResult := make([]G1ScalarField, len(scalars)) // Make a new slice with the same length
copy(nttResult, scalars)
for n := 0; n < b.N; n++ {
Ntt(&nttResult, false, 0)
}
})
}
}

View File

@@ -0,0 +1,38 @@
package bls12377
import "encoding/binary"
// Convert [8]uint32 limbs to [4]uint64 limbs. Limbs are paired little-endian
// (even index = low word), making this the inverse of ConvertUint64ArrToUint32Arr4.
func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 {
var arr64 [4]uint64
for i := 0; i < len(arr32); i += 2 {
arr64[i/2] = uint64(arr32[i]) | (uint64(arr32[i+1]) << 32)
}
return arr64
}
func ConvertUint64ArrToUint32Arr4(arr64 [4]uint64) [8]uint32 {
var arr32 [8]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
func ConvertUint64ArrToUint32Arr6(arr64 [6]uint64) [12]uint32 {
var arr32 [12]uint32
for i, v := range arr64 {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, v)
arr32[i*2] = binary.LittleEndian.Uint32(b[0:4])
arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8])
}
return arr32
}
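
With the little-endian limb pairing above, the two 4-limb converters are inverses of each other; a tiny round-trip check:

	limbs32 := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}
	limbs64 := ConvertUint32ArrToUint64Arr(limbs32)
	back := ConvertUint64ArrToUint32Arr4(limbs64)
	if back != limbs32 { // fixed-size arrays compare element-wise in Go
		panic("limb conversion round-trip mismatch")
	}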

View File

@@ -0,0 +1,42 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12377
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_377
// #include "ve_mod_mult.h"
import "C"
import (
"fmt"
"unsafe"
)
func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int {
scalarVec1C := (*C.BLS12_377_scalar_t)(scalarVec1)
scalarVec2C := (*C.BLS12_377_scalar_t)(scalarVec2)
sizeC := C.size_t(size)
ret := C.vec_mod_mult_device_scalar_bls12_377(scalarVec1C, scalarVec2C, sizeC, 0)
if ret != 0 {
fmt.Print("error multiplying scalar vectors")
return -1
}
return 0
}
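
A sketch of calling the element-wise multiplier on device buffers, using the goicicle memory helpers the tests in this diff already rely on (CudaMalloc, CudaMemCpyHtoD, CudaMemCpyDtoH); the 32-byte size matches the 8-limb scalar field, and Random on G1ScalarField is assumed to mirror the bls12381 types:

	size := 1 << 8
	a := make([]G1ScalarField, size)
	b := make([]G1ScalarField, size)
	for i := 0; i < size; i++ {
		a[i].Random()
		b[i].Random()
	}
	bytes := size * 32
	aD, _ := goicicle.CudaMalloc(bytes)
	bD, _ := goicicle.CudaMalloc(bytes)
	goicicle.CudaMemCpyHtoD[G1ScalarField](aD, a, bytes)
	goicicle.CudaMemCpyHtoD[G1ScalarField](bD, b, bytes)
	if VecScalarMulMod(aD, bD, size) != 0 { // writes the product back into aD
		panic("device multiplication failed")
	}
	goicicle.CudaMemCpyDtoH[G1ScalarField](a, aD, bytes)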

View File

@@ -0,0 +1,328 @@
// Copyright 2023 Ingonyama
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by Ingonyama DO NOT EDIT
package bls12381
import (
"unsafe"
"encoding/binary"
)
// #cgo CFLAGS: -I./include/
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbls12_381
// #include "projective.h"
// #include "ve_mod_mult.h"
import "C"
const SCALAR_SIZE = 8
const BASE_SIZE = 12
type G1ScalarField struct {
S [SCALAR_SIZE]uint32
}
type G1BaseField struct {
S [BASE_SIZE]uint32
}
/*
* BaseField Constructors
*/
func (f *G1BaseField) SetZero() *G1BaseField {
var S [BASE_SIZE]uint32
f.S = S
return f
}
func (f *G1BaseField) SetOne() *G1BaseField {
var S [BASE_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (p *G1ProjectivePoint) FromAffine(affine *G1PointAffine) *G1ProjectivePoint {
out := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
in := (*C.BLS12_381_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bls12_381(out, in)
return p
}
func (f *G1BaseField) FromLimbs(limbs [BASE_SIZE]uint32) *G1BaseField {
copy(f.S[:], limbs[:])
return f
}
/*
* BaseField methods
*/
func (f *G1BaseField) Limbs() [BASE_SIZE]uint32 {
return f.S
}
func (f *G1BaseField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* ScalarField Constructors
*/
func (p *G1ScalarField) Random() *G1ScalarField {
outC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(p))
C.random_scalar_bls12_381(outC)
return p
}
func (f *G1ScalarField) SetZero() *G1ScalarField {
var S [SCALAR_SIZE]uint32
f.S = S
return f
}
func (f *G1ScalarField) SetOne() *G1ScalarField {
var S [SCALAR_SIZE]uint32
S[0] = 1
f.S = S
return f
}
func (a *G1ScalarField) Eq(b *G1ScalarField) bool {
for i, v := range a.S {
if b.S[i] != v {
return false
}
}
return true
}
/*
* ScalarField methods
*/
func (f *G1ScalarField) Limbs() [SCALAR_SIZE]uint32 {
return f.S
}
func (f *G1ScalarField) ToBytesLe() []byte {
bytes := make([]byte, len(f.S)*4)
for i, v := range f.S {
binary.LittleEndian.PutUint32(bytes[i*4:], v)
}
return bytes
}
/*
* G1ProjectivePoint
*/
type G1ProjectivePoint struct {
X, Y, Z G1BaseField
}
func (f *G1ProjectivePoint) SetZero() *G1ProjectivePoint {
var yOne G1BaseField
yOne.SetOne()
var xZero G1BaseField
xZero.SetZero()
var zZero G1BaseField
zZero.SetZero()
f.X = xZero
f.Y = yOne
f.Z = zZero
return f
}
func (p *G1ProjectivePoint) Eq(pCompare *G1ProjectivePoint) bool {
// Cast *G1ProjectivePoint to *C.BLS12_381_projective_t
// The unsafe.Pointer cast is necessary because Go doesn't allow direct casts
// between different pointer types.
// It's your responsibility to ensure that the types are compatible.
pC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
pCompareC := (*C.BLS12_381_projective_t)(unsafe.Pointer(pCompare))
// Call the C function
// The C function doesn't keep any references to the data,
// so it's fine if the Go garbage collector moves or deletes the data later.
return bool(C.eq_bls12_381(pC, pCompareC))
}
func (p *G1ProjectivePoint) IsOnCurve() bool {
point := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
res := C.projective_is_on_curve_bls12_381(point)
return bool(res)
}
func (p *G1ProjectivePoint) Random() *G1ProjectivePoint {
outC := (*C.BLS12_381_projective_t)(unsafe.Pointer(p))
C.random_projective_bls12_381(outC)
return p
}
func (p *G1ProjectivePoint) StripZ() *G1PointAffine {
return &G1PointAffine{
X: p.X,
Y: p.Y,
}
}
func (p *G1ProjectivePoint) FromLimbs(x, y, z *[]uint32) *G1ProjectivePoint {
var _x G1BaseField
var _y G1BaseField
var _z G1BaseField
_x.FromLimbs(GetFixedLimbs(x))
_y.FromLimbs(GetFixedLimbs(y))
_z.FromLimbs(GetFixedLimbs(z))
p.X = _x
p.Y = _y
p.Z = _z
return p
}
/*
* G1PointAffine
*/
type G1PointAffine struct {
X, Y G1BaseField
}
func (p *G1PointAffine) FromProjective(projective *G1ProjectivePoint) *G1PointAffine {
in := (*C.BLS12_381_projective_t)(unsafe.Pointer(projective))
out := (*C.BLS12_381_affine_t)(unsafe.Pointer(p))
C.projective_to_affine_bls12_381(out, in)
return p
}
func (p *G1PointAffine) ToProjective() *G1ProjectivePoint {
var Z G1BaseField
Z.SetOne()
return &G1ProjectivePoint{
X: p.X,
Y: p.Y,
Z: Z,
}
}
func (p *G1PointAffine) FromLimbs(X, Y *[]uint32) *G1PointAffine {
var _x G1BaseField
var _y G1BaseField
_x.FromLimbs(GetFixedLimbs(X))
_y.FromLimbs(GetFixedLimbs(Y))
p.X = _x
p.Y = _y
return p
}
/*
* Multiplication
*/
func MultiplyVec(a []G1ProjectivePoint, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
pointsC := (*C.BLS12_381_projective_t)(unsafe.Pointer(&a[0]))
scalarsC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_point_bls12_381(pointsC, scalarsC, nElementsC, deviceIdC)
}
func MultiplyScalar(a []G1ScalarField, b []G1ScalarField, deviceID int) {
if len(a) != len(b) {
panic("a and b have different lengths")
}
aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.vec_mod_mult_scalar_bls12_381(aC, bC, nElementsC, deviceIdC)
}
// Multiply a flattened matrix by a vector:
//
// `a` - flattened matrix;
// `b` - vector to multiply `a` by;
//
// Returns the resulting vector.
func MultiplyMatrix(a []G1ScalarField, b []G1ScalarField, deviceID int) []G1ScalarField {
c := make([]G1ScalarField, len(b))
for i := range c {
var p G1ScalarField
p.SetZero()
c[i] = p
}
aC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&a[0]))
bC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&b[0]))
cC := (*C.BLS12_381_scalar_t)(unsafe.Pointer(&c[0]))
deviceIdC := C.size_t(deviceID)
nElementsC := C.size_t(len(a))
C.matrix_vec_mod_mult_bls12_381(aC, bC, cC, nElementsC, deviceIdC)
return c
}
/*
* Utils
*/
func GetFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 {
if len(*slice) <= BASE_SIZE {
limbs := [BASE_SIZE]uint32{}
copy(limbs[:len(*slice)], *slice)
return limbs
}
panic("slice has too many elements")
}
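
Finally, a short sketch exercising the conversions above; it assumes eq_bls12_381 compares points up to projective scaling, so a point converted to affine and back (with Z normalized to one) still compares equal:

	var p G1ProjectivePoint
	p.Random()
	var a G1PointAffine
	a.FromProjective(&p)
	q := a.ToProjective()
	if !q.IsOnCurve() || !q.Eq(&p) {
		panic("projective/affine round-trip failed")
	}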

Some files were not shown because too many files have changed in this diff.