vec ops compiles

poseidon compiles
polynomial compiles
2026-01-13 01:17:57 -05:00 · 2024-05-12 14:01:17 +03:00 · 2024-05-12 13:43:47 +03:00 · 2024-05-12 13:28:11 +03:00 · 2024-05-12 12:55:07 +03:00 · 2024-05-09 11:31:22 +03:00
642 changed files with 39553 additions and 10130 deletions
--- a/.codespellignore
+++ b/.codespellignore
@@ -3,3 +3,4 @@ crate
 lmit
 mut
 uint
+dout
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -4,7 +4,7 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2

 jobs:
  spelling-checker:
--- a/.github/workflows/cpp_cuda.yml
+++ b/.github/workflows/cpp_cuda.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -29,7 +29,7 @@ jobs:
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi

-  test-linux:
+  test-linux-curve:
    name: Test on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: [check-changed-files, check-format]
@@ -39,14 +39,36 @@ jobs:
    steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
-    - name: Build
+    - name: Build curve
      working-directory: ./icicle
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: |
-        mkdir -p build
-        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -DG2_DEFINED=ON -S . -B build
-        cmake --build build
-    - name: Run C++ Tests
-      working-directory: ./icicle/build
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ curve Tests
+      working-directory: ./icicle/build/tests
      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: ctest
+
+  test-linux-field:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]
+    strategy:
+      matrix:
+        field: [babybear]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Build field
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ field Tests
+      working-directory: ./icicle/build/tests
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -11,11 +11,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
--- a/.github/workflows/golang.yml
+++ b/.github/workflows/golang.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -33,13 +33,23 @@ jobs:
      if: needs.check-changed-files.outputs.golang == 'true'
      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi

-  build-linux:
-    name: Build on Linux
+  build-curves-linux:
+    name: Build curves on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: [check-changed-files, check-format]
    strategy:
      matrix:
-        curve: [bn254, bls12_381, bls12_377, bw6_761]
+        curve: 
+          - name: bn254
+            build_args: -g2 -ecntt
+          - name: bls12_381
+            build_args: -g2 -ecntt
+          - name: bls12_377
+            build_args: -g2 -ecntt
+          - name: bw6_761
+            build_args: -g2 -ecntt
+          - name: grumpkin
+            build_args:
    steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
@@ -50,19 +60,79 @@ jobs:
    - name: Build
      working-directory: ./wrappers/golang
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: ./build.sh ${{ matrix.curve }} ON # builds a single curve with G2 enabled
+      run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
    - name: Upload ICICLE lib artifacts
      uses: actions/upload-artifact@v4
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      with:
-        name: icicle-builds-${{ matrix.curve }}-${{ github.workflow }}-${{ github.sha }}
-        path: icicle/build/libingo_${{ matrix.curve }}.a
+        name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
+        path: |
+          icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
+          icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
+        retention-days: 1
+ 
+  build-fields-linux:
+    name: Build fields on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]
+    strategy:
+      matrix:
+        field:
+          - name: babybear
+            build_args: -field-ext
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Setup go
+      uses: actions/setup-go@v5
+      with:
+        go-version: '1.20.0'
+    - name: Build
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
+    - name: Upload ICICLE lib artifacts
+      uses: actions/upload-artifact@v4
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      with:
+        name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
+        path: |
+          icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
+        retention-days: 1
+    
+  build-hashes-linux:
+    name: Build hashes on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]
+    strategy:
+      matrix:
+        hash:
+          - name: keccak
+            build_args:
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Setup go
+      uses: actions/setup-go@v5
+      with:
+        go-version: '1.20.0'
+    - name: Build
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }} # builds a single hash algorithm
+    - name: Upload ICICLE lib artifacts
+      uses: actions/upload-artifact@v4
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      with:
+        name: icicle-builds-${{ matrix.hash.name }}-${{ github.workflow }}-${{ github.sha }}
+        path: |
+          icicle/build/lib/libingo_hash.a
        retention-days: 1
  
  test-linux:
    name: Test on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, build-linux]
+    needs: [check-changed-files, build-curves-linux, build-fields-linux, build-hashes-linux]
    steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
@@ -74,7 +144,7 @@ jobs:
      uses: actions/download-artifact@v4
      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      with:
-        path: ./icicle/build/
+        path: ./icicle/build/lib
        merge-multiple: true
    - name: Run Tests
      working-directory: ./wrappers/golang
@@ -83,7 +153,7 @@ jobs:
      # -p controls the number of programs that can be run in parallel
      run: |
        export CPATH=$CPATH:/usr/local/cuda/include
-        go test --tags=g2 ./... -count=1 -failfast -p 2 -timeout 60m
+        go test ./... -count=1 -failfast -p 2 -timeout 60m
  
  # TODO: bw6 on windows requires more memory than the standard runner has
  # Add a large runner and then enable this job
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -4,11 +4,11 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@@ -60,7 +60,24 @@ jobs:
      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
      # Running tests from the root workspace will run all workspace members' tests by default
      # We need to limit the number of threads to avoid running out of memory on weaker machines
-      run: cargo test --release --verbose --features=g2 -- --test-threads=2
+      # ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
+      run: |
+        cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored
+        cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2
+
+    - name: Run baby bear tests
+      working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        cargo test --release --verbose -- --ignored
+        cargo test --release --verbose
+
+    - name: Run stark252 tests
+      working-directory: ./wrappers/rust/icicle-fields/icicle-stark252
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        cargo test --release --verbose -- --ignored
+        cargo test --release --verbose

  build-windows:
    name: Build on Windows
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,6 @@
 **/Cargo.lock
 **/icicle/build/
 **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
-**/build
+**/build*
 **/icicle/appUtils/large_ntt/work
 icicle/appUtils/large_ntt/work/test_ntt
--- a/README.md
+++ b/README.md
@@ -11,8 +11,6 @@
  </a>
  <a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
-  </a>
-  <img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
  <a href="https://github.com/ingonyama-zk/icicle/releases">
    <img src="https://img.shields.io/github/v/release/ingonyama-zk/icicle" alt="GitHub Release">
  </a>
@@ -117,8 +115,11 @@ This will ensure our custom hooks are run and will make it easier to follow our

 - [Robik](https://github.com/robik75), for his ongoing support and mentorship
 - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
+- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab
 - [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
+- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
+- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
+- [PatStiles](https://github.com/PatStiles), for adding Stark252 field

 ## Help & Support

--- a/docs/docs/icicle/core.md
+++ b/docs/docs/icicle/core.md
@@ -0,0 +1,196 @@
+# ICICLE Core
+
+ICICLE Core is a library written in C++/CUDA. All the ICICLE primitives are implemented within ICICLE Core.
+
+The Core is split into logical modules that can be compiled into static libraries using different [strategies](#compilation-strategies). You can then [link](#linking) these libraries with your C++ project or write your own [bindings](#writing-new-bindings-for-icicle) for other programming languages. If you want to use ICICLE with existing bindings please refer to the [Rust](/icicle/rust-bindings) or [Golang](/icicle/golang-bindings) bindings documentation.
+
+## Supported curves, fields and operations
+
+### Supported curves and operations
+
+| Operation\Curve | [bn254](https://neuromancer.sk/std/bn/bn254) | [bls12-377](https://neuromancer.sk/std/bls/BLS12-377) | [bls12-381](https://neuromancer.sk/std/bls/BLS12-381) | [bw6-761](https://eprint.iacr.org/2020/351) | grumpkin |
+| --- | :---: | :---: | :---: | :---: | :---: |
+| [MSM][MSM_DOCS] | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2  | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [NTT][NTT_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [VecOps][VECOPS_CODE] | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [Polynomials][POLY_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [Poseidon](primitives/poseidon) | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [Merkle Tree](primitives/poseidon#the-tree-builder) | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+### Supported fields and operations
+
+| Operation\Field | [babybear](https://eprint.iacr.org/2023/824.pdf) | [Stark252](https://docs.starknet.io/documentation/architecture_and_concepts/Cryptography/p-value/) |
+| --- | :---: | :---: |
+| [VecOps][VECOPS_CODE] | ✅ | ✅ |
+| [Polynomials][POLY_DOCS] | ✅ | ✅ |
+| [NTT][NTT_DOCS] | ✅ | ✅ |
+| Extension Field | ✅ | ❌ |
+
+### Supported hashes
+
+| Hash | Sizes |
+| --- | :---: |
+| Keccak | 256, 512 |
+
+## Compilation strategies
+
+Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields, you compile ICICLE once per curve or field that is needed. It's that simple. Currently, the following choices are supported:
+
+- [Field mode][COMPILE_FIELD_MODE] - used for STARK fields like BabyBear / Mersenne / Goldilocks. Includes field arithmetic, NTT, Poseidon, Extension fields and other primitives.
+- [Curve mode][COMPILE_CURVE_MODE] - used for SNARK curves like BN254 / BLS curves / Grumpkin / etc. Curve mode is built upon field mode, so it includes everything that field mode does. It also includes curve operations / MSM / ECNTT / G2 and other curve-related primitives.
+
+:::info
+
+If you only want to use a curve's scalar or base field, you still need to use curve mode. You can disable MSM with [options](#compilation-options)
+
+:::
+
+### Compiling for a field
+
+You can compile ICICLE for a field using this command:
+
+```sh
+cd icicle
+mkdir -p build
+cmake -DFIELD=<FIELD> -S . -B build
+cmake --build build -j
+```
+
+This command will output `libingo_field_<FIELD>.a` into `build/lib`.
+
+### Compiling for a curve
+
+:::note
+
+Field related primitives will be compiled for the scalar field of the curve
+
+:::
+
+You can compile ICICLE for a SNARK curve using this command:
+
+```sh
+cd icicle
+mkdir -p build
+cmake -DCURVE=<CURVE> -S . -B build
+cmake --build build -j
+```
+
+Where `<CURVE>` can be one of `bn254`/`bls12_377`/`bls12_381`/`bw6_761`/`grumpkin`.
+
+This command will output both `libingo_curve_<CURVE>.a` and `libingo_field_<CURVE>.a` into `build/lib`.
+
+### Compilation options
+
+There exist multiple options that allow you to customize your build or enable additional functionality.
+
+#### EXT_FIELD
+
+Used only in [field mode][COMPILE_FIELD_MODE] to add an Extension field. Adds all supported field operations for the extension field.
+
+Default: `OFF`
+
+Usage: `-DEXT_FIELD=ON`
+
+#### G2
+
+Used only in [curve mode][COMPILE_CURVE_MODE] to add G2 definitions. Also adds G2 MSM.
+
+Default: `OFF`
+
+Usage: `-DG2=ON`
+
+#### ECNTT
+
+Used only in [curve mode][COMPILE_CURVE_MODE] to add ECNTT function.
+
+Default: `OFF`
+
+Usage: `-DECNTT=ON`
+
+#### MSM
+
+Used only in [curve mode][COMPILE_CURVE_MODE] to add MSM function. As MSM takes a lot of time to build, you can disable it with this option to reduce compilation time.
+
+Default: `ON`
+
+Usage: `-DMSM=OFF`
+
+#### BUILD_HASH
+
+Can be used in any mode to build a hash library. Currently it only includes Keccak hash function, but more are coming.
+
+Default: `OFF`
+
+Usage: `-DBUILD_HASH=ON`
+
+#### BUILD_TESTS
+
+Can be used in any mode to include tests runner binary.
+
+Default: `OFF`
+
+Usage: `-DBUILD_TESTS=ON`
+
+#### BUILD_BENCHMARKS
+
+Can be used in any mode to include benchmarks runner binary.
+
+Default: `OFF`
+
+Usage: `-DBUILD_BENCHMARKS=ON`
+
+#### DEVMODE
+
+Can be used in any mode to include debug symbols in the build.
+
+Default: `OFF`
+
+Usage: `-DDEVMODE=ON`
+
+## Linking
+
+To link ICICLE with your project you first need to compile ICICLE with options of your choice. After that you can use CMake `target_link_libraries` to link with the generated static libraries and `target_include_directories` to include ICICLE headers (located in `icicle/include`).
+
+Refer to our [c++ examples](https://github.com/ingonyama-zk/icicle/tree/main/examples/c%2B%2B) for more info. Take a look at this [CMakeLists.txt](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/CMakeLists.txt#L22)
+
+## Writing new bindings for ICICLE
+
+Since ICICLE Core is written in CUDA / C++, it's really simple to generate static libraries. These static libraries can be installed on any system and called by higher level languages such as Golang.
+
+Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules so your static library may need to compile only specific features that you want to use.
+
+Let's review the [Golang bindings][GOLANG_BINDINGS] since it's a pretty verbose example (compared to Rust, which hides it pretty well) of using static libraries. Golang has a library named `CGO` which can be used to link static libraries. Here's a basic example on how you can use cgo to link these libraries:
+
+```go
+/*
+#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_671
+#include "icicle.h" // make sure you use the correct header file(s)
+*/
+import "C"
+
+func main() {
+  // Now you can call the C functions from the ICICLE libraries.
+  // Note that C function calls are prefixed with 'C.' in Go code.
+
+  out := (*C.BN254_projective_t)(unsafe.Pointer(p))
+  in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
+
+  C.projective_from_affine_bn254(out, in)
+}
+```
+
+The comments on the first line tell `CGO` which libraries to import as well as which header files to include. You can then call methods which are part of the static library and defined in the header file, `C.projective_from_affine_bn254` is an example.
+
+If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
+
+<!-- Begin Links -->
+[GOLANG_BINDINGS]: golang-bindings.md
+[COMPILE_CURVE_MODE]: #compiling-for-a-curve
+[COMPILE_FIELD_MODE]: #compiling-for-a-field
+[NTT_DOCS]: primitives/ntt
+[MSM_DOCS]: primitives/msm
+[POLY_DOCS]: polynomials/overview
+[VECOPS_CODE]: https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/vec_ops/vec_ops.cuh
+<!-- End Links -->
--- a/docs/docs/icicle/golang-bindings.md
+++ b/docs/docs/icicle/golang-bindings.md
@@ -1,7 +1,7 @@
 # Golang bindings

 Golang bindings allow you to use ICICLE as a golang library.
-The source code for all Golang libraries can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
+The source code for all Golang packages can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).

 The Golang bindings are comprised of multiple packages.

@@ -9,7 +9,7 @@ The Golang bindings are comprised of multiple packages.

 [`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.

-Each curve has its own package which you can find [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves). If your project uses BN254 you only need to install that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
+Each supported curve, field, and hash has its own package which you can find in the respective directories [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang). If your project uses BN254 you only need to import that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).

 ## Using ICICLE Golang bindings in your project

@@ -31,36 +31,47 @@ For a specific commit
 go get github.com/ingonyama-zk/icicle@<commit_id>
 ```

-To build the shared libraries you can run this script:
+To build the shared libraries you can run [this](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/build.sh) script:

-```
-./build <curve> [G2_enabled]
+```sh
+./build.sh [-curve=<curve>] [-field=<field>] [-hash=<hash>] [-cuda_version=<version>] [-g2] [-ecntt] [-devmode]

-curve - The name of the curve to build or "all" to build all curves
-G2_enabled - Optional - To build with G2 enabled 
+curve - The name of the curve to build or "all" to build all supported curves
+field - The name of the field to build or "all" to build all supported fields
+hash - The name of the hash to build or "all" to build all supported hashes
+-g2 - Optional - build with G2 enabled 
+-ecntt - Optional - build with ECNTT enabled
+-devmode - Optional - build in devmode
+-help - Optional - Displays usage information
 ```

-For example if you want to build all curves with G2 enabled you would run:
+:::note
+
+If more than one curve or more than one field or more than one hash is supplied, the last one supplied will be built
+
+:::
+
+To build ICICLE libraries for all supported curves with G2 and ECNTT enabled, run:

 ```bash
-./build.sh all ON
+./build.sh -curve=all -g2 -ecntt
 ```

-If you are interested in building a specific curve you would run:
+If you wish to build for a specific curve, for example bn254, without G2 or ECNTT enabled, run:

-```bash
-./build.sh bls12_381 ON
+``` bash
+./build.sh -curve=bn254
 ```

 Now you can import ICICLE into your project

-```golang
+```go
 import (
    "github.com/stretchr/testify/assert"
    "testing"

-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+    "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 )
 ...
 ```
@@ -70,11 +81,9 @@ import (
 To run all tests, for all curves:

 ```bash
-go test --tags=g2 ./... -count=1
+go test ./... -count=1
 ```

-If you dont want to include g2 tests then drop `--tags=g2`.
-
 If you wish to run test for a specific curve:

 ```bash
@@ -85,13 +94,13 @@ go test <path_to_curve> -count=1

 The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.

-1. These libraries (named `libingo_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
+1. These libraries (named `libingo_curve_<curve>.a` and `libingo_field_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.

 2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:

 ```go
 /*
-#cgo LDFLAGS: -L/path/to/shared/libs -lingo_bn254
+#cgo LDFLAGS: -L/path/to/shared/libs -lingo_curve_bn254 -L$/path/to/shared/libs -lingo_field_bn254 -lstdc++ -lm
 #include "icicle.h" // make sure you use the correct header file(s)
 */
 import "C"
@@ -103,3 +112,25 @@ func main() {
 ```

 Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
+
+## Supported curves, fields and operations
+
+### Supported curves and operations
+
+| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
+| --- | :---: | :---: | :---: | :---: | :---: |
+| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2  | ✅ | ✅ | ✅ | ✅ | ❌ |
+| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
+
+### Supported fields and operations
+
+| Operation\Field | babybear |
+| --- | :---: |
+| VecOps | ✅ |
+| Polynomials | ✅ |
+| NTT | ✅ |
+| Extension Field | ✅ |
--- a/docs/docs/icicle/golang-bindings/ecntt.md
+++ b/docs/docs/icicle/golang-bindings/ecntt.md
@@ -0,0 +1,92 @@
+# ECNTT
+
+## ECNTT Method
+
+The `ECNtt[T any]()` function performs the Elliptic Curve Number Theoretic Transform (EC-NTT) on the input `points` slice, using the provided `dir` (direction) and `cfg` (configuration), and stores the output in the `results` slice.
+
+```go
+func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
+```
+
+### Parameters
+
+- **`points`**: A slice of elliptic curve points (in projective coordinates) that will be transformed. The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
+- **`dir`**: The direction of the EC-NTT transform, either `core.KForward` or `core.KInverse`.
+- **`cfg`**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
+- **`results`**: A slice that will store the transformed elliptic curve points (in projective coordinates). The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
+
+### Return Value
+
+- **`IcicleError`**: A `core.IcicleError` value, which will be `core.IcicleErrorCode(0)` if the EC-NTT operation was successful, or an error if something went wrong.
+
+## NTT Configuration (NTTConfig)
+
+The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
+
+```go
+type NTTConfig[T any] struct {
+    Ctx cr.DeviceContext
+    CosetGen T
+    BatchSize int32
+    ColumnsBatch bool
+    Ordering Ordering
+    areInputsOnDevice  bool
+    areOutputsOnDevice bool
+    IsAsync bool
+    NttAlgorithm NttAlgorithm
+}
+```
+
+### Fields
+
+- **`Ctx`**: Device context containing details like device ID and stream ID.
+- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
+- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
+- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`), affecting how data is arranged.
+- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
+- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
+- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
+- **`NttAlgorithm`**: Explicitly select the NTT algorithm. ECNTT supports running on the `Radix2` algorithm.
+
+### Default Configuration
+
+Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
+
+```go
+func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
+```
+
+## ECNTT Example
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+)
+
+func Main() {
+    // Obtain the default NTT configuration with a predefined coset generator.
+    cfg := GetDefaultNttConfig()
+    
+    // Define the size of the input scalars.
+    size := 1 << 18
+
+    // Generate Points for the ECNTT operation.
+    points := GenerateProjectivePoints(size)
+    
+    // Set the direction of the NTT (forward or inverse).
+    dir := core.KForward
+
+    // Allocate memory for the results of the NTT operation.
+    results := make(core.HostSlice[Projective], size)
+
+    // Perform the NTT operation.
+    err := ECNtt(points, dir, &cfg, results)
+    if err != cr.CudaSuccess {
+        panic("ECNTT operation failed")
+    }
+}
+```
--- a/docs/docs/icicle/golang-bindings/msm-pre-computation.md
+++ b/docs/docs/icicle/golang-bindings/msm-pre-computation.md
@@ -0,0 +1,112 @@
+# MSM Pre computation
+
+To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
+
+## Core package
+
+### MSM PrecomputeBases
+
+`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
+
+#### Description
+
+This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(precompute_factor - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
+
+The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
+
+#### `PrecomputeBases`
+
+Precomputes bases for MSM by extending each base point with its multiples.
+
+```go
+func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+```
+
+##### Parameters
+
+- **`points`**: A slice of the original affine points to be extended with their multiples.
+- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
+- **`c`**: Currently unused; reserved for future compatibility.
+- **`ctx`**: CUDA device context specifying the execution environment.
+- **`outputBases`**: The device slice allocated for storing the extended bases.
+
+##### Example
+
+```go
+package main
+
+import (
+	"log"
+
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+)
+
+func main() {
+	cfg := bn254.GetDefaultMSMConfig()
+	points := bn254.GenerateAffinePoints(1024)
+	var precomputeFactor int32 = 8
+	var precomputeOut core.DeviceSlice
+	precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+
+	err := bn254.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+	if err != cr.CudaSuccess {
+		log.Fatalf("PrecomputeBases failed: %v", err)
+	}
+}
+```
+
+#### `G2PrecomputeBases`
+
+This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
+
+```go
+func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+```
+
+##### Parameters
+
+- **`points`**: A slice of G2 curve points to be extended.
+- **`precomputeFactor`**: The total number of points to precompute for each base.
+- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
+- **`ctx`**: Specifies the CUDA device context for execution.
+- **`outputBases`**: Allocated device slice for the extended bases.
+
+##### Example
+
+```go
+package main
+
+import (
+	"log"
+
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
+)
+
+func main() {
+	cfg := g2.G2GetDefaultMSMConfig()
+	points := g2.G2GenerateAffinePoints(1024)
+	var precomputeFactor int32 = 8
+	var precomputeOut core.DeviceSlice
+	precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+
+	err := g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+	if err != cr.CudaSuccess {
+		log.Fatalf("PrecomputeBases failed: %v", err)
+	}
+}
+```
+
+### Benchmarks
+
+Benchmarks were performed on an Nvidia RTX 3090Ti.
+
+| Pre-computation factor | bn254 size `2^20` MSM, ms.  | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
+| ------------- | ------------- | ------------- | ------------- | ------------- |
+| 1  | 14.1  | 82.8  | 25.5  | 136.7  |
+| 2  | 11.8  | 76.6  | 20.3  | 123.8  |
+| 4  | 10.9  | 73.8  | 18.1  | 117.8  |
+| 8  | 10.6  | 73.7  | 17.2  | 116.0  |
--- a/docs/docs/icicle/golang-bindings/msm.md
+++ b/docs/docs/icicle/golang-bindings/msm.md
@@ -1,62 +1,59 @@
 # MSM

-
-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
-
 ## MSM Example

 ```go
 package main

 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+  cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+  bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 )

-func Main() {
-    // Obtain the default MSM configuration.
-    cfg := GetDefaultMSMConfig()
-    
-    // Define the size of the problem, here 2^18.
-    size := 1 << 18
+func main() {
+  // Obtain the default MSM configuration.
+  cfg := bn254.GetDefaultMSMConfig()

-    // Generate scalars and points for the MSM operation.
-    scalars := GenerateScalars(size)
-    points := GenerateAffinePoints(size)
+  // Define the size of the problem, here 2^18.
+  size := 1 << 18

-    // Create a CUDA stream for asynchronous operations.
-    stream, _ := cr.CreateStream()
-    var p Projective
-    
-    // Allocate memory on the device for the result of the MSM operation.
-    var out core.DeviceSlice
-    _, e := out.MallocAsync(p.Size(), p.Size(), stream)
+  // Generate scalars and points for the MSM operation.
+  scalars := bn254.GenerateScalars(size)
+  points := bn254.GenerateAffinePoints(size)

-    if e != cr.CudaSuccess {
-        panic(e)
-    }
-    
-    // Set the CUDA stream in the MSM configuration.
-    cfg.Ctx.Stream = &stream
-    cfg.IsAsync = true
-    
-    // Perform the MSM operation.
-    e = Msm(scalars, points, &cfg, out)
-    
-    if e != cr.CudaSuccess {
-        panic(e)
-    }
-    
-    // Allocate host memory for the results and copy the results from the device.
-    outHost := make(core.HostSlice[Projective], 1)
-    cr.SynchronizeStream(&stream)
-    outHost.CopyFromDevice(&out)
-    
-    // Free the device memory allocated for the results.
-    out.Free()
+  // Create a CUDA stream for asynchronous operations.
+  stream, _ := cr.CreateStream()
+  var p bn254.Projective
+
+  // Allocate memory on the device for the result of the MSM operation.
+  var out core.DeviceSlice
+  _, e := out.MallocAsync(p.Size(), p.Size(), stream)
+
+  if e != cr.CudaSuccess {
+    panic(e)
+  }
+
+  // Set the CUDA stream in the MSM configuration.
+  cfg.Ctx.Stream = &stream
+  cfg.IsAsync = true
+
+  // Perform the MSM operation.
+  e = bn254.Msm(scalars, points, &cfg, out)
+
+  if e != cr.CudaSuccess {
+    panic(e)
+  }
+
+  // Allocate host memory for the results and copy the results from the device.
+  outHost := make(core.HostSlice[bn254.Projective], 1)
+  cr.SynchronizeStream(&stream)
+  outHost.CopyFromDevice(&out)
+
+  // Free the device memory allocated for the results.
+  out.Free()
 }
+
 ```

 ## MSM Method
@@ -67,14 +64,14 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor

 ### Parameters

- **scalars**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
- **points**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
- **cfg**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
- **results**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.
+- **`scalars`**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
+- **`points`**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
+- **`cfg`**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
+- **`results`**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.

 ### Return Value

- **CudaError**: Returns a CUDA error code indicating the success or failure of the MSM operation.
+- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the MSM operation.

 ## MSMConfig

@@ -100,19 +97,19 @@ type MSMConfig struct {

 ### Fields

- **Ctx**: Device context containing details like device id and stream.
- **PrecomputeFactor**: Controls the number of extra points to pre-compute.
- **C**: Window bitsize, a key parameter in the "bucket method" for MSM.
- **Bitsize**: Number of bits of the largest scalar.
- **LargeBucketFactor**: Sensitivity to frequently occurring buckets.
- **batchSize**: Number of results to compute in one batch.
- **areScalarsOnDevice**: Indicates if scalars are located on the device.
- **AreScalarsMontgomeryForm**: True if scalars are in Montgomery form.
- **arePointsOnDevice**: Indicates if points are located on the device.
- **ArePointsMontgomeryForm**: True if point coordinates are in Montgomery form.
- **areResultsOnDevice**: Indicates if results are stored on the device.
- **IsBigTriangle**: If `true` MSM will run in Large triangle accumulation if `false` Bucket accumulation will be chosen. Default value: false.
- **IsAsync**: If true, runs MSM asynchronously.
+- **`Ctx`**: Device context containing details like device id and stream.
+- **`PrecomputeFactor`**: Controls the number of extra points to pre-compute.
+- **`C`**: Window bitsize, a key parameter in the "bucket method" for MSM.
+- **`Bitsize`**: Number of bits of the largest scalar.
+- **`LargeBucketFactor`**: Sensitivity to frequently occurring buckets.
+- **`batchSize`**: Number of results to compute in one batch.
+- **`areScalarsOnDevice`**: Indicates if scalars are located on the device.
+- **`AreScalarsMontgomeryForm`**: True if scalars are in Montgomery form.
+- **`arePointsOnDevice`**: Indicates if points are located on the device.
+- **`ArePointsMontgomeryForm`**: True if point coordinates are in Montgomery form.
+- **`areResultsOnDevice`**: Indicates if results are stored on the device.
+- **`IsBigTriangle`**: If `true`, MSM will run in Large triangle accumulation; if `false`, Bucket accumulation will be chosen. Default value: false.
+- **`IsAsync`**: If true, runs MSM asynchronously.

 ### Default Configuration

@@ -122,7 +119,6 @@ Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be c
 func GetDefaultMSMConfig() MSMConfig
 ```

-
 ## How do I toggle between the supported algorithms?

 When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.Ctx.IsBigTriangle = false` will activate Bucket accumulation.
@@ -157,44 +153,41 @@ out.Malloc(batchSize*p.Size(), p.Size())

 ## Support for G2 group

-To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled.
+To activate G2 support, you must first make sure you are building the static libraries with the G2 feature enabled, as described in the [Golang building instructions](../golang-bindings.md#using-icicle-golang-bindings-in-your-project).

-```bash
-./build.sh bls12_381 ON
-```
-
-Now when importing `icicle`, you should have access to G2 features.
+Now you may import the `g2` package of the specified curve.

 ```go
 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
 )
 ```

-These features include `G2Projective` and `G2Affine` points as well as a `G2Msm` method.
+This package includes `G2Projective` and `G2Affine` points as well as a `G2Msm` method.

 ```go
-...
+package main

-cfg := GetDefaultMSMConfig()
-size := 1 << 12
-batchSize := 3
-totalSize := size * batchSize
-scalars := GenerateScalars(totalSize)
-points := G2GenerateAffinePoints(totalSize)
+import (
+  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+  bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+  g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
+)

-var p G2Projective
-var out core.DeviceSlice
-out.Malloc(batchSize*p.Size(), p.Size())
-G2Msm(scalars, points, &cfg, out)
+func main() {
+  cfg := bn254.GetDefaultMSMConfig()
+  size := 1 << 12
+  batchSize := 3
+  totalSize := size * batchSize
+  scalars := bn254.GenerateScalars(totalSize)
+  points := g2.G2GenerateAffinePoints(totalSize)
+
+  var p g2.G2Projective
+  var out core.DeviceSlice
+  out.Malloc(batchSize*p.Size(), p.Size())
+  g2.G2Msm(scalars, points, &cfg, out)
+}

-...
 ```

 `G2Msm` works the same way as normal MSM, the difference is that it uses G2 Points.
-
-Additionally when you are building your application make sure to use the g2 feature flag
-
-```bash
-go build -tags=g2
-```
--- a/docs/docs/icicle/golang-bindings/multi-gpu.md
+++ b/docs/docs/icicle/golang-bindings/multi-gpu.md
@@ -0,0 +1,155 @@
+# Multi GPU APIs
+
+To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of the documentation.
+
+Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example).
+
+## A Multi GPU example
+
+In this example we will display how you can
+
+1. Fetch the number of devices installed on a machine
+2. For every GPU launch a thread and set an active device per thread.
+3. Execute an MSM on each GPU.
+
+```go
+package main
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+)
+
+func main() {
+	numDevices, _ := cr.GetDeviceCount()
+	fmt.Println("There are ", numDevices, " devices available")
+	wg := sync.WaitGroup{}
+
+	for i := 0; i < numDevices; i++ {
+		wg.Add(1)
+		// RunOnDevice makes sure each MSM runs on a single thread
+		cr.RunOnDevice(i, func(args ...any) {
+			defer wg.Done()
+			cfg := bn254.GetDefaultMSMConfig()
+			cfg.IsAsync = true
+			for _, power := range []int{10, 18} {
+				size := 1 << power // 2^power
+
+				// generate random scalars
+				scalars := bn254.GenerateScalars(size)
+				points := bn254.GenerateAffinePoints(size)
+
+				// create a stream and allocate result pointer
+				stream, _ := cr.CreateStream()
+				var p bn254.Projective
+				var out core.DeviceSlice
+				out.MallocAsync(p.Size(), p.Size(), stream)
+				// assign stream to device context
+				cfg.Ctx.Stream = &stream
+
+				// execute MSM
+				bn254.Msm(scalars, points, &cfg, out)
+				// read result from device
+				outHost := make(core.HostSlice[bn254.Projective], 1)
+				outHost.CopyFromDeviceAsync(&out, stream)
+				out.FreeAsync(stream)
+
+				// sync the stream
+				cr.SynchronizeStream(&stream)
+			}
+		})
+	}
+	wg.Wait()
+}
+```
+
+This example demonstrates a basic pattern for distributing tasks across multiple GPUs. The `RunOnDevice` function ensures that each goroutine is executed on its designated GPU and a corresponding thread.
+
+## Device Management API
+
+To streamline device management, the `cuda_runtime` package offers methods for dealing with devices.
+
+### `RunOnDevice`
+
+Runs a given function on a specific GPU device, ensuring that all CUDA calls within the function are executed on the selected device.
+
+In Go, most concurrency can be done via Goroutines. However, there is no guarantee that a goroutine stays on a specific host thread.
+
+`RunOnDevice` was designed to solve this caveat and ensure that the goroutine will stay on a specific host thread.
+
+`RunOnDevice` locks a goroutine into a specific host thread, sets a current GPU device, runs a provided function, and unlocks the goroutine from the host thread after the provided function finishes.
+
+While the goroutine is locked to the host thread, the Go runtime will not assign other goroutines to that host thread.
+
+**Parameters:**
+
+- **`deviceId int`**: The ID of the device on which to run the provided function. Device IDs start from 0.
+- **`funcToRun func(args ...any)`**: The function to be executed on the specified device.
+- **`args ...any`**: Arguments to be passed to `funcToRun`.
+
+**Behavior:**
+
+- The function `funcToRun` is executed in a new goroutine that is locked to a specific OS thread to ensure that all CUDA calls within the function target the specified device.
+
+:::note
+Any goroutines launched within `funcToRun` are not automatically bound to the same GPU device. If necessary, `RunOnDevice` should be called again within such goroutines with the same `deviceId`.
+:::
+
+**Example:**
+
+```go
+RunOnDevice(0, func(args ...any) {
+	fmt.Println("This runs on GPU 0")
+	// CUDA-related operations here will target GPU 0
+}, nil)
+```
+
+### `SetDevice`
+
+Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
+
+:::warning
+This function should not be used directly in conjunction with goroutines. If you want to run multi-gpu scenarios with goroutines you should use [RunOnDevice](#runondevice).
+:::
+
+**Parameters:**
+
+- **`device int`**: The ID of the device to set as the current device.
+
+**Returns:**
+
+- **`CudaError`**: Error code indicating the success or failure of the operation.
+
+### `GetDeviceCount`
+
+Retrieves the number of CUDA-capable devices available on the host.
+
+**Returns:**
+
+- **`(int, CudaError)`**: The number of devices and an error code indicating the success or failure of the operation.
+
+### `GetDevice`
+
+Gets the ID of the currently active device for the calling host thread.
+
+**Returns:**
+
+- **`(int, CudaError)`**: The ID of the current device and an error code indicating the success or failure of the operation.
+
+### `GetDeviceFromPointer`
+
+Retrieves the device associated with a given pointer.
+
+**Parameters:**
+
+- **`ptr unsafe.Pointer`**: Pointer to query.
+
+**Returns:**
+
+- **`int`**: The device ID associated with the memory pointed to by `ptr`.
+
+This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.
--- a/docs/docs/icicle/golang-bindings/ntt.md
+++ b/docs/docs/icicle/golang-bindings/ntt.md
@@ -1,40 +1,54 @@
 # NTT

-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
-
 ## NTT Example

 ```go
 package main

 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+  cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+  bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+
+  "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
 )

-func Main() {
-    // Obtain the default NTT configuration with a predefined coset generator.
-    cfg := GetDefaultNttConfig()
-    
-    // Define the size of the input scalars.
-    size := 1 << 18
+func init() {
+  cfg := bn254.GetDefaultNttConfig()
+  initDomain(18, cfg)
+}

-    // Generate scalars for the NTT operation.
-    scalars := GenerateScalars(size)
+func initDomain[T any](largestTestSize int, cfg core.NTTConfig[T]) core.IcicleError {
+  rouMont, _ := fft.Generator(uint64(1 << largestTestSize))
+  rou := rouMont.Bits()
+  rouIcicle := bn254.ScalarField{}

-    // Set the direction of the NTT (forward or inverse).
-    dir := core.KForward
+  rouIcicle.FromLimbs(rou[:])
+  e := bn254.InitDomain(rouIcicle, cfg.Ctx, false)
+  return e
+}

-    // Allocate memory for the results of the NTT operation.
-    results := make(core.HostSlice[ScalarField], size)
+func main() {
+  // Obtain the default NTT configuration with a predefined coset generator.
+  cfg := bn254.GetDefaultNttConfig()

-    // Perform the NTT operation.
-    err := Ntt(scalars, dir, &cfg, results)
-    if err != cr.CudaSuccess {
-        panic("NTT operation failed")
-    }
+  // Define the size of the input scalars.
+  size := 1 << 18
+
+  // Generate scalars for the NTT operation.
+  scalars := bn254.GenerateScalars(size)
+
+  // Set the direction of the NTT (forward or inverse).
+  dir := core.KForward
+
+  // Allocate memory for the results of the NTT operation.
+  results := make(core.HostSlice[bn254.ScalarField], size)
+
+  // Perform the NTT operation.
+  err := bn254.Ntt(scalars, dir, &cfg, results)
+  if err.CudaErrorCode != cr.CudaSuccess {
+    panic("NTT operation failed")
+  }
 }
 ```

@@ -46,14 +60,14 @@ func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTCo

 ### Parameters

- **scalars**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
- **dir**: The direction of the NTT operation (`KForward` or `KInverse`).
- **cfg**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **results**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.
+- **`scalars`**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
+- **`dir`**: The direction of the NTT operation (`KForward` or `KInverse`).
+- **`cfg`**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
+- **`results`**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.

 ### Return Value

- **CudaError**: Returns a CUDA error code indicating the success or failure of the NTT operation.
+- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the NTT operation.

 ## NTT Configuration (NTTConfig)

@@ -64,22 +78,26 @@ type NTTConfig[T any] struct {
    Ctx cr.DeviceContext
    CosetGen T
    BatchSize int32
+    ColumnsBatch bool
    Ordering Ordering
    areInputsOnDevice  bool
    areOutputsOnDevice bool
    IsAsync bool
+    NttAlgorithm NttAlgorithm
 }
 ```

 ### Fields

- **Ctx**: Device context containing details like device ID and stream ID.
- **CosetGen**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
- **BatchSize**: The number of NTTs to compute in one operation, defaulting to 1.
- **Ordering**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
- **areInputsOnDevice**: Indicates if input scalars are located on the device.
- **areOutputsOnDevice**: Indicates if results are stored on the device.
- **IsAsync**: Controls whether the NTT operation runs asynchronously.
+- **`Ctx`**: Device context containing details like device ID and stream ID.
+- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
+- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
+- **`ColumnsBatch`**: If true, the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
+- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
+- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
+- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
+- **`NttAlgorithm`**: Explicitly select the NTT algorithm. Default value: Auto (the implementation selects radix-2 or mixed-radix algorithm based on heuristics).

 ### Default Configuration

@@ -98,3 +116,36 @@ func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bo
 ```

 This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.
+
+### Releasing the domain
+
+The `ReleaseDomain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
+
+```go
+func ReleaseDomain(ctx cr.DeviceContext) core.IcicleError
+```
+
+### Parameters
+
+- **`ctx`**: a reference to the `DeviceContext` object, which represents the CUDA device context.
+
+### Return Value
+
+The function returns a `core.IcicleError`, which represents the result of the operation. If the operation is successful, the function returns `core.IcicleErrorCode(0)`.
+
+### Example
+
+```go
+import (
+  "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+  bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+)
+
+func example() {
+  cfg := bn254.GetDefaultNttConfig()
+  err := bn254.ReleaseDomain(cfg.Ctx)
+  if err.IcicleErrorCode != core.IcicleErrorCode(0) {
+      // Handle the error
+  }
+}
+```
--- a/docs/docs/icicle/golang-bindings/vec-ops.md
+++ b/docs/docs/icicle/golang-bindings/vec-ops.md
@@ -2,104 +2,112 @@

 ## Overview

-The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
+Icicle exposes a number of vector operations which a user can use:

-## Example
+* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
+* The MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix.

-### Vector addition
+## VecOps API Documentation
+
+### Example
+
+#### Vector addition

 ```go
 package main

 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 )

 func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
+	testSize := 1 << 12
+	a := bn254.GenerateScalars(testSize)
+	b := bn254.GenerateScalars(testSize)
+	out := make(core.HostSlice[bn254.ScalarField], testSize)
+	cfg := core.DefaultVecOpsConfig()

-    // Perform vector addition
-    err := VecOp(a, b, out, cfg, core.Add)
-    if err != cr.CudaSuccess {
-        panic("Vector addition failed")
-    }
+	// Perform vector addition
+	err := bn254.VecOp(a, b, out, cfg, core.Add)
+	if err != cr.CudaSuccess {
+		panic("Vector addition failed")
+	}
 }
 ```

-### Vector Subtraction
+#### Vector Subtraction

 ```go
 package main

 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 )

 func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
+	testSize := 1 << 12
+	a := bn254.GenerateScalars(testSize)
+	b := bn254.GenerateScalars(testSize)
+	out := make(core.HostSlice[bn254.ScalarField], testSize)
+	cfg := core.DefaultVecOpsConfig()

-    // Perform vector subtraction
-    err := VecOp(a, b, out, cfg, core.Sub)
-    if err != cr.CudaSuccess {
-        panic("Vector subtraction failed")
-    }
+	// Perform vector subtraction
+	err := bn254.VecOp(a, b, out, cfg, core.Sub)
+	if err != cr.CudaSuccess {
+		panic("Vector subtraction failed")
+	}
 }
 ```

-### Vector Multiplication
+#### Vector Multiplication

 ```go
 package main

 import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 )

 func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
+	testSize := 1 << 12
+	a := bn254.GenerateScalars(testSize)
+	b := bn254.GenerateScalars(testSize)
+	out := make(core.HostSlice[bn254.ScalarField], testSize)
+	cfg := core.DefaultVecOpsConfig()

-    // Perform vector multiplication
-    err := VecOp(a, b, out, cfg, core.Mul)
-    if err != cr.CudaSuccess {
-        panic("Vector multiplication failed")
-    }
+	// Perform vector multiplication
+	err := bn254.VecOp(a, b, out, cfg, core.Mul)
+	if err != cr.CudaSuccess {
+		panic("Vector multiplication failed")
+	}
 }
 ```

-## VecOps Method
+### VecOps Method

 ```go
 func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
 ```

-### Parameters
+#### Parameters

- **a**: The first input vector.
- **b**: The second input vector.
- **out**: The output vector where the result of the operation will be stored.
- **config**: A `VecOpsConfig` object containing various configuration options for the vector operations.
- **op**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.
+- **`a`**: The first input vector.
+- **`b`**: The second input vector.
+- **`out`**: The output vector where the result of the operation will be stored.
+- **`config`**: A `VecOpsConfig` object containing various configuration options for the vector operations.
+- **`op`**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.

-### Return Value
+#### Return Value

- **CudaError**: Returns a CUDA error code indicating the success or failure of the vector operation.
+- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the vector operation.

-## VecOpsConfig
+### VecOpsConfig

 The `VecOpsConfig` structure holds configuration parameters for the vector operations, allowing customization of its behavior.

@@ -109,24 +117,72 @@ type VecOpsConfig struct {
    isAOnDevice bool
    isBOnDevice bool
    isResultOnDevice bool
-    IsResultMontgomeryForm bool
    IsAsync bool
 }
 ```

-### Fields
+#### Fields

 - **Ctx**: Device context containing details like device ID and stream ID.
 - **isAOnDevice**: Indicates if vector `a` is located on the device.
 - **isBOnDevice**: Indicates if vector `b` is located on the device.
 - **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
- **IsResultMontgomeryForm**: Determines if the result vector should be in Montgomery form.
 - **IsAsync**: Controls whether the vector operation runs asynchronously.

-### Default Configuration
+#### Default Configuration

 Use `DefaultVecOpsConfig` to obtain a default configuration, customizable as needed.

 ```go
 func DefaultVecOpsConfig() VecOpsConfig
 ```
+
+## MatrixTranspose API Documentation
+
+This section describes the functionality of the `TransposeMatrix` function used for matrix transposition.
+
+The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
+
+### Function
+
+```go
+func TransposeMatrix(in, out core.HostOrDeviceSlice, columnSize, rowSize int, ctx cr.DeviceContext, onDevice, isAsync bool) (ret core.IcicleError)
+```
+
+### Parameters
+
+- **`in`**: The input matrix is a `core.HostOrDeviceSlice`, stored as a 1D slice.
+- **`out`**: The output matrix is a `core.HostOrDeviceSlice`, which will be the transpose of the input matrix, stored as a 1D slice.
+- **`columnSize`**: The number of columns in the input matrix.
+- **`rowSize`**: The number of rows in the input matrix.
+- **`ctx`**: The device context `cr.DeviceContext` to be used for the matrix transpose operation.
+- **`onDevice`**: Indicates whether the input and output slices are stored on the device (GPU) or the host (CPU).
+- **`isAsync`**: Indicates whether the matrix transpose operation should be executed asynchronously.
+
+### Return Value
+
+The function returns a `core.IcicleError` value, which represents the result of the matrix transpose operation. If the operation is successful, the returned value will be `0`.
+
+### Example Usage
+
+```go
+var input = make(core.HostSlice[ScalarField], 20)
+var output = make(core.HostSlice[ScalarField], 20)
+
+// Populate the input matrix
+// ...
+
+// Get device context
+ctx, _ := cr.GetDefaultDeviceContext()
+
+// Transpose the matrix
+err := TransposeMatrix(input, output, 5, 4, ctx, false, false)
+if err.IcicleErrorCode != core.IcicleErrorCode(0) {
+    // Handle the error
+}
+
+// Use the transposed matrix
+// ...
+```
+
+In this example, the `TransposeMatrix` function is used to transpose a 5x4 matrix stored in a 1D slice. The input and output slices are stored on the host (CPU), and the operation is executed synchronously.
--- a/docs/docs/icicle/integrations.md
+++ b/docs/docs/icicle/integrations.md
@@ -1,6 +1,6 @@
 # ICICLE integrated provers

-ICICLE has been used by companies and projects such as [Celer Network](https://github.com/celer-network), [Consensys Gnark](https://github.com/Consensys/gnark), [EZKL](https://blog.ezkl.xyz/post/acceleration/) and others to accelerate their ZK proving pipeline.
+ICICLE has been used by companies and projects such as [Celer Network](https://github.com/celer-network), [Consensys Gnark](https://github.com/Consensys/gnark), [EZKL](https://blog.ezkl.xyz/post/acceleration/), [ZKWASM](https://twitter.com/DelphinusLab/status/1762604988797513915) and others to accelerate their ZK proving pipeline.

 Many of these integrations have been a collaboration between Ingonyama and the integrating company. We have learned a lot about designing GPU based ZK provers.

--- a/docs/docs/icicle/introduction.md
+++ b/docs/docs/icicle/introduction.md
@@ -8,24 +8,24 @@ This guide is oriented towards developers who want to start writing code with th

 The diagram above displays the general architecture of ICICLE and the API layers that exist. The CUDA API, which we also call ICICLE Core, is the lowest level and is comprised of CUDA kernels which implement all primitives such as MSM as well as C++ wrappers which expose these methods for different curves.

-ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or you can implement a wrapper for it in any language.
+ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or linked with your C++ project. You can also implement a wrapper for it in any other language.

-Based on this dependency architecture, the ICICLE repository has three main sections, each of which is independent from the other.
+Based on this dependency architecture, the ICICLE repository has three main sections:

- ICICLE core
- ICICLE Rust bindings
- ICICLE Golang bindings
+- [ICICLE Core](#icicle-core)
+- [ICICLE Rust bindings](#icicle-rust-and-golang-bindings)
+- [ICICLE Golang bindings](#icicle-rust-and-golang-bindings)

 ### ICICLE Core

-[ICICLE core](https://github.com/ingonyama-zk/icicle/tree/main/icicle) contains all the low level CUDA code implementing primitives such as [points](https://github.com/ingonyama-zk/icicle/tree/main/icicle/primitives) and [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/msm). There also exists higher level C++ wrappers to expose the low level CUDA primitives ([example](https://github.com/ingonyama-zk/icicle/blob/c1a32a9879a7612916e05aa3098f76144de4109e/icicle/appUtils/msm/msm.cu#L1)).
+[ICICLE Core](/icicle/core) is a library that directly works with GPU by defining CUDA kernels and algorithms that invoke them. It contains code for [fast field arithmetic](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/field/field.cuh), cryptographic primitives used in ZK such as [NTT](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/ntt/), [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/msm/), [Poseidon Hash](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/poseidon/), [Polynomials](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/polynomials/) and others.

-ICICLE Core would typically be compiled into a static library and used in a third party language such as Rust or Golang.
+ICICLE Core would typically be compiled into a static library and either used in a third party language such as Rust or Golang, or linked with your own C++ project.

 ### ICICLE Rust and Golang bindings

- [ICICLE Rust bindings](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust)
- [ICICLE Golang bindings](https://github.com/ingonyama-zk/icicle/tree/main/goicicle)
+- [ICICLE Rust bindings](/icicle/rust-bindings)
+- [ICICLE Golang bindings](/icicle/golang-bindings)

 These bindings allow you to easily use ICICLE in a Rust or Golang project. Setting up Golang bindings requires a bit of extra steps compared to the Rust bindings which utilize the `cargo build` tool.

@@ -33,6 +33,12 @@ These bindings allow you to easily use ICICLE in a Rust or Golang project. Setti

 This guide assumes that you have a Linux or Windows machine with an Nvidia GPU installed. If you don't have access to an Nvidia GPU you can access one for free on [Google Colab](https://colab.google/).

+:::info note
+
+ICICLE can only run on Linux or Windows. **MacOS is not supported**.
+
+:::
+
 ### Prerequisites

 - NVCC (version 12.0 or newer)
@@ -50,9 +56,9 @@ If you don't wish to install these prerequisites you can follow this tutorial us

 ### Setting up ICICLE and running tests

-The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to setup ICICLE and run a ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment, however, you may install the prerequisites on your machine and follow the same commands in your terminal.
+The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to set up ICICLE and run an ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment; however, you may install the prerequisites on your machine and [skip](#icicle-core-1) the docker section.

-#### Setting up our environment
+#### Setting up environment with Docker

 Lets begin by cloning the ICICLE repository:

@@ -105,29 +111,23 @@ ICICLE Core is found under [`<project_root>/icicle`](https://github.com/ingonyam
 cd icicle
 ```

-We are going to compile ICICLE for a specific curve
+For this example, we are going to compile ICICLE for the `bn254` curve. However, other compilation strategies are supported.

 ```sh
 mkdir -p build
 cmake -S . -B build -DCURVE=bn254 -DBUILD_TESTS=ON
-cmake --build build
+cmake --build build -j
 ```

-`-DBUILD_TESTS=ON` compiles the tests, without this flag `ctest` won't work.
-`-DCURVE=bn254` tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves).
+`-DBUILD_TESTS` option compiles the tests, without this flag `ctest` won't work.
+`-DCURVE` option tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/cmake/CurvesCommon.cmake#L2).

 The output in `build` folder should include the static libraries for the compiled curve.

-:::info
-
-Make sure to only use `-DBUILD_TESTS=ON` for running tests as the archive output will only be available when `-DBUILD_TESTS=ON` is not supplied.
-
-:::
-
 To run the test

 ```sh
-cd build
+cd build/tests
 ctest
 ```

@@ -165,11 +165,56 @@ cargo bench

 #### ICICLE Golang

-Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.
+The Golang bindings require compiling ICICLE Core first. We supply a [build script](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/build.sh) to help build what you need.
+
+Script usage:
+
+```sh
+./build.sh [-curve=<curve>] [-field=<field>] [-hash=<hash>] [-cuda_version=<version>] [-g2] [-ecntt] [-devmode]
+
+curve - The name of the curve to build or "all" to build all supported curves
+field - The name of the field to build or "all" to build all supported fields
+hash - The name of the hash to build or "all" to build all supported hashes
+-g2 - Optional - build with G2 enabled 
+-ecntt - Optional - build with ECNTT enabled
+-devmode - Optional - build in devmode
+```
+
+:::note
+
+If more than one curve or more than one field or more than one hash is supplied, the last one supplied will be built
+
+:::
+
+Once the library has been built, you can use and test the Golang bindings.
+
+To test a specific curve, field or hash, change to its directory and then run:
+
+```sh
+go test ./tests -count=1 -failfast -timeout 60m -p 2 -v
+```
+
+You will be able to see each test that runs, how long it takes and whether it passed or failed

 ### Running ICICLE examples

-ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle-examples) these examples cover some simple use cases using C++, rust and golang.
+ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/examples). These examples cover some simple use cases using C++, Rust and Golang.
+
+Let's run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/example.cu).
+
+```sh
+cd examples/c++/msm
+./compile.sh
+./run.sh
+```
+
+:::tip
+
+Read through the compile.sh and CMakeLists.txt to understand how to link your own C++ project with ICICLE
+
+:::
+
+#### Running with Docker

 In each example directory, ZK-container files are located in a subdirectory `.devcontainer`.

@@ -180,21 +225,6 @@ msm/
   └── Dockerfile
 ```

-Lets run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle-examples/blob/main/c%2B%2B/msm/example.cu).
-
-Clone the repository
-
-```sh
-git clone https://github.com/ingonyama-zk/icicle-examples.git
-cd icicle-examples
-```
-
-Enter the test directory
-
-```sh
-cd c++/msm
-```
-
 Now lets build our docker file and run the test inside it. Make sure you have installed the [optional prerequisites](#optional-prerequisites).

 ```sh
@@ -207,54 +237,11 @@ Lets start and enter the container
 docker run -it --rm --gpus all -v .:/icicle-example icicle-example-msm
 ```

-to run the example
+Inside the container you can run the same commands:

 ```sh
-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
-./build/example
+./compile.sh
+./run.sh
 ```

 You can now experiment with our other examples, perhaps try to run a rust or golang example next.
-
-## Writing new bindings for ICICLE
-
-Since ICICLE Core is written in CUDA / C++ its really simple to generate static libraries. These static libraries can be installed on any system and called by higher level languages such as Golang.
-
-static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules so your static library may need to compile only specific features that you want to use.
-
-Lets review the Golang bindings since its a pretty verbose example (compared to rust which hides it pretty well) of using static libraries. Golang has a library named `CGO` which can be used to link static libraries. Here's a basic example on how you can use cgo to link these libraries:
-
-```go
-/*
-#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_671
-#include "icicle.h" // make sure you use the correct header file(s)
-*/
-import "C"
-
-func main() {
-  // Now you can call the C functions from the ICICLE libraries.
-  // Note that C function calls are prefixed with 'C.' in Go code.
-
-  out := (*C.BN254_projective_t)(unsafe.Pointer(p))
-  in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
-
-  C.projective_from_affine_bn254(out, in)
-}
-```
-
-The comments on the first line tell `CGO` which libraries to import as well as which header files to include. You can then call methods which are part of the static library and defined in the header file, `C.projective_from_affine_bn254` is an example.
-
-If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
-
-### ICICLE Adapters
-
-One of the core ideas behind ICICLE is that developers can gradually accelerate their provers. Many protocols are written using other cryptographic libraries and completely replacing them may be complex and time consuming.
-
-Therefore we offer adapters for various popular libraries, these adapters allow us to convert points and scalars between different formats defined by various libraries. Here is a list:
-
-Golang adapters:
-
- [Gnark crypto adapter](https://github.com/ingonyama-zk/iciclegnark)
--- a/docs/docs/icicle/multi-gpu.md
+++ b/docs/docs/icicle/multi-gpu.md
@@ -2,7 +2,7 @@

 :::info

-If you are looking for the Multi GPU API documentation refer here for [Rust](./rust-bindings/multi-gpu.md).
+If you are looking for the Multi GPU API documentation refer [here](./rust-bindings/multi-gpu.md) for Rust and [here](./golang-bindings/multi-gpu.md) for Golang.

 :::

@@ -10,12 +10,11 @@ One common challenge with Zero-Knowledge computation is managing the large input

 Multi-GPU programming involves developing software to operate across multiple GPU devices. Lets first explore different approaches to Multi-GPU programming then we will cover how ICICLE allows you to easily develop youR ZK computations to run across many GPUs.

-
 ## Approaches to Multi GPU programming

 There are many [different strategies](https://github.com/NVIDIA/multi-gpu-programming-models) available for implementing multi GPU, however, it can be split into two categories.

-### GPU Server approach 
+### GPU Server approach

 This approach usually involves a single or multiple CPUs opening threads to read / write from multiple GPUs. You can think about it as a scaled up HOST - Device model.

@@ -23,8 +22,7 @@ This approach usually involves a single or multiple CPUs opening threads to read

 This approach won't let us tackle larger computation sizes but it will allow us to compute multiple computations which we wouldn't be able to load onto a single GPU.

-For example let's say that you had to compute two MSMs of size 2^26 on a 16GB VRAM GPU you would normally have to perform them asynchronously. However, if you double the number of GPUs in your system you can now run them in parallel. 
-
+For example let's say that you had to compute two MSMs of size 2^26 on a 16GB VRAM GPU you would normally have to perform them asynchronously. However, if you double the number of GPUs in your system you can now run them in parallel.

 ### Inter GPU approach

@@ -32,18 +30,17 @@ This approach involves a more sophisticated approach to multi GPU computation. U

 This approach requires redesigning the algorithm at the software level to be compatible with splitting amongst devices. In some cases, to lower latency to a minimum, special inter GPU connections would be installed on a server to allow direct communication between multiple GPUs.

-
-# Writing ICICLE Code for Multi GPUs
+## Writing ICICLE Code for Multi GPUs

 The approach we have taken for the moment is a GPU Server approach; we assume you have a machine with multiple GPUs and you wish to run some computation on each GPU.

 To dive deeper and learn about the API check out the docs for our different ICICLE API

 - [Rust Multi GPU APIs](./rust-bindings/multi-gpu.md)
+- [Golang Multi GPU APIs](./golang-bindings/multi-gpu.md)
 - C++ Multi GPU APIs

-
-## Best practices 
+## Best practices

 - Never hardcode device IDs, if you want your software to take advantage of all GPUs on a machine use methods such as `get_device_count` to support arbitrary number of GPUs.

@@ -57,7 +54,7 @@ Multi GPU support should work with ZK-Containers by simply defining which device
 docker run -it --gpus '"device=0,2"' zk-container-image
 ```

-If you wish to expose all GPUs 
+If you wish to expose all GPUs

 ```sh
 docker run --gpus all zk-container-image
--- a/docs/docs/icicle/overview.md
+++ b/docs/docs/icicle/overview.md
@@ -2,10 +2,6 @@

 [![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)

-![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-blue)
-
-
-
 [ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs. ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.

 ICICLE allows developers with minimal GPU experience to effortlessly accelerate their ZK application; from our experiments, even the most naive implementation may yield 10X improvement in proving times.
@@ -17,28 +13,26 @@ ICICLE has been used by many leading ZK companies such as [Celer Network](https:
 We understand that not all developers have access to a GPU and we don't want this to limit anyone from developing with ICICLE.
 Here are some ways we can help you gain access to GPUs:

+:::note
+
+If none of the following options suit your needs, contact us on [telegram](https://t.me/RealElan) for assistance. We're committed to ensuring that a lack of a GPU doesn't become a bottleneck for you. If you need help with setup or any other issues, we're here to help you.
+
+:::
+
 ### Grants

 At Ingonyama we are interested in accelerating the progress of ZK and cryptography. If you are an engineer, developer or an academic researcher we invite you to checkout [our grant program](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges). We will give you access to GPUs and even pay you to do your dream research!

 ### Google Colab

-This is a great way to get started with ICICLE instantly. Google Colab offers free GPU access to a NVIDIA T4 instance, it's acquired with 16 GB of memory which should be enough for experimenting and even prototyping with ICICLE.
+This is a great way to get started with ICICLE instantly. Google Colab offers free GPU access to an NVIDIA T4 instance with 16 GB of memory, which should be enough for experimenting and even prototyping with ICICLE.

 For an extensive guide on how to setup Google Colab with ICICLE refer to [this article](./colab-instructions.md).

-If none of these options are appropriate for you reach out to us on [telegram](https://t.me/RealElan) we will do our best to help you.
-
 ### Vast.ai

 [Vast.ai](https://vast.ai/) is a global GPU marketplace where you can rent many different types of GPUs by the hour for [competitive pricing](https://vast.ai/pricing). They provide on-demand and interruptible rentals depending on your need or use case; you can learn more about their rental types [here](https://vast.ai/faq#rental-types).

-:::note
-
-If none of these options suit your needs, contact us on [telegram](https://t.me/RealElan) for assistance. We're committed to ensuring that a lack of a GPU doesn't become a bottleneck for you. If you need help with setup or any other issues, we're here to do our best to help you.
-
-:::
-
 ## What can you do with ICICLE?

 [ICICLE](https://github.com/ingonyama-zk/icicle) can be used in the same way you would use any other cryptography library. While developing and integrating ICICLE into many proof systems, we found some use case categories:
--- a/docs/docs/icicle/polynomials/ffi.uml
+++ b/docs/docs/icicle/polynomials/ffi.uml
@@ -0,0 +1,27 @@
+@startuml
+skinparam componentStyle uml2
+
+' Define Components
+component "C++ Template\nComponent" as CppTemplate {
+  [Parameterizable Interface]
+}
+component "C API Wrapper\nComponent" as CApiWrapper {
+  [C API Interface]
+}
+component "Rust Code\nComponent" as RustCode {
+  [Macro Interface\n(Template Instantiation)]
+}
+
+' Define Artifact
+artifact "Static Library\n«artifact»" as StaticLib
+
+' Connections
+CppTemplate -down-> CApiWrapper : Instantiates
+CApiWrapper .down.> StaticLib : Compiles into
+RustCode -left-> StaticLib : Links against\nand calls via FFI
+
+' Notes
+note right of CppTemplate : Generic C++\ntemplate implementation
+note right of CApiWrapper : Exposes C API for FFI\nto Rust/Go
+note right of RustCode : Uses macros to\ninstantiate templates
+@enduml
--- a/docs/docs/icicle/polynomials/hw_backends.uml
+++ b/docs/docs/icicle/polynomials/hw_backends.uml
@@ -0,0 +1,86 @@
+@startuml
+
+' Define Interface for Polynomial Backend Operations
+interface IPolynomialBackend {
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Define Interface for Polynomial Context (State Management)
+interface IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' PolynomialAPI now uses two strategies: Backend and Context
+class PolynomialAPI {
+    -backendStrategy: IPolynomialBackend
+    -contextStrategy: IPolynomialContext
+    -setBackendStrategy(IPolynomialBackend)
+    -setContextStrategy(IPolynomialContext)
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Backend Implementations
+class GPUPolynomialBackend implements IPolynomialBackend {
+    #gpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class ZPUPolynomialBackend implements IPolynomialBackend {
+    #zpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class TracerPolynomialBackend implements IPolynomialBackend {
+    #traceData: Data
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Context Implementations (Placeholder for actual implementation)
+class GPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class ZPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class TracerContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' Relationships
+PolynomialAPI o-- IPolynomialBackend : uses
+PolynomialAPI o-- IPolynomialContext : uses
+@enduml
--- a/docs/docs/icicle/polynomials/overview.md
+++ b/docs/docs/icicle/polynomials/overview.md
@@ -0,0 +1,406 @@
+# Polynomial API Overview
+
+## Introduction
+
+The Polynomial API offers a robust framework for polynomial operations within a computational environment. It's designed for flexibility and efficiency, supporting a broad range of operations like arithmetic, evaluation, and manipulation, all while abstracting from the computation and storage specifics. This enables adaptability to various backend technologies, employing modern C++ practices.
+
+## Key Features
+
+### Backend Agnostic Architecture
+
+Our API is structured to be independent of any specific computational backend. While a CUDA backend is currently implemented, the architecture facilitates easy integration of additional backends. This capability allows users to perform polynomial operations without the need to tailor their code to specific hardware, enhancing code portability and scalability.
+
+### Templating in the Polynomial API
+
+The Polynomial API is designed with a templated structure to accommodate different data types for coefficients, the domain, and images. This flexibility allows the API to be adapted for various computational needs and types of data.
+
+```cpp
+template <typename Coeff, typename Domain = Coeff, typename Image = Coeff>
+class Polynomial {
+    // Polynomial class definition
+}
+```
+
+In this template:
+
+- **`Coeff`**: Represents the type of the coefficients of the polynomial.
+- **`Domain`**: Specifies the type for the input values over which the polynomial is evaluated. By default, it is the same as the type of the coefficients but can be specified separately to accommodate different computational contexts.
+- **`Image`**: Defines the type of the output values of the polynomial. This is typically the same as the coefficients.
+
+#### Default instantiation
+
+```cpp
+extern template class Polynomial<scalar_t>;
+```
+
+#### Extended use cases
+
+The templated nature of the Polynomial API also supports more complex scenarios. For example, coefficients and images could be points on an elliptic curve (EC points), which are useful in cryptographic applications and advanced algebraic structures. This approach allows the API to be extended easily to support new algebraic constructions without modifying the core implementation.
+
+### Supported Operations
+
+The Polynomial class encapsulates a polynomial, providing a variety of operations:
+
+- **Construction**: Create polynomials from coefficients or evaluations on roots-of-unity domains.
+- **Arithmetic Operations**: Perform addition, subtraction, multiplication, and division.
+- **Evaluation**: Directly evaluate polynomials at specific points or across a domain.
+- **Manipulation**: Features like slicing polynomials, adding or subtracting monomials inplace, and computing polynomial degrees.
+- **Memory Access**: Access internal states or obtain device-memory views of polynomials.
+
+## Usage
+
+This section outlines how to use the Polynomial API in C++. Bindings for Rust and Go are detailed under the Bindings sections.
+
+### Backend Initialization
+
+Initialization with an appropriate factory is required to configure the computational context and backend.
+
+```cpp
+#include "polynomials/polynomials.h"
+#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
+
+// Initialize with a CUDA backend
+Polynomial::initialize(std::make_shared<CUDAPolynomialFactory>());
+```
+
+:::note
+Initialization of a factory must be done per linked curve or field.
+:::
+
+### Construction
+
+Polynomials can be constructed from coefficients, from evaluations on roots-of-unity domains, or by cloning existing polynomials.
+
+```cpp
+// Construction
+static Polynomial from_coefficients(const Coeff* coefficients, uint64_t nof_coefficients);
+static Polynomial from_rou_evaluations(const Image* evaluations, uint64_t nof_evaluations);
+// Clone the polynomial
+Polynomial clone() const;
+```
+
+Example:
+
+```cpp
+auto p_from_coeffs = Polynomial_t::from_coefficients(coeff /* :scalar_t* */, nof_coeffs);
+auto p_from_rou_evals = Polynomial_t::from_rou_evaluations(rou_evals /* :scalar_t* */, nof_evals);
+auto p_cloned = p.clone(); // p_cloned and p do not share memory
+```
+
+:::note
+The coefficients or evaluations may be allocated either on host or device memory. In both cases the memory is copied to the backend device.
+:::
+
+### Arithmetic
+
+Constructed polynomials can be used for various arithmetic operations:
+
+```cpp
+// Addition
+Polynomial operator+(const Polynomial& rhs) const; 
+Polynomial& operator+=(const Polynomial& rhs); // inplace addition
+
+// Subtraction
+Polynomial operator-(const Polynomial& rhs) const;
+
+// Multiplication
+Polynomial operator*(const Polynomial& rhs) const;
+Polynomial operator*(const Domain& scalar) const; // scalar multiplication
+
+// Division A(x) = B(x)Q(x) + R(x)
+std::pair<Polynomial, Polynomial> divide(const Polynomial& rhs) const; // returns (Q(x), R(x))
+Polynomial operator/(const Polynomial& rhs) const; // returns quotient Q(x)
+Polynomial operator%(const Polynomial& rhs) const; // returns remainder R(x)
+Polynomial divide_by_vanishing_polynomial(uint64_t degree) const; // division by the vanishing polynomial V(x)=X^N-1
+```
+
+#### Example
+
+Given polynomials A(x),B(x),C(x) and V(x) the vanishing polynomial.
+
+$$
+H(x)=\frac{A(x) \cdot B(x) - C(x)}{V(x)} \quad \text{where} \quad V(x) = x^{N}-1
+$$
+
+```cpp
+auto H = (A*B-C).divide_by_vanishing_polynomial(N);
+```
+
+### Evaluation
+
+Evaluate polynomials at arbitrary domain points or across a domain.
+
+```cpp
+Image operator()(const Domain& x) const; // evaluate f(x)
+void evaluate(const Domain* x, Image* evals /*OUT*/) const;
+void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+```
+
+Example:
+
+```cpp
+Coeff x = rand();
+Image f_x = f(x); // evaluate f at x
+
+// evaluate f(x) on a domain
+uint64_t domain_size = ...;
+auto domain = /*build domain*/; // host or device memory
+auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
+f.evaluate_on_domain(domain, domain_size, evaluations);
+```
+
+:::note
+For special domains such as roots of unity, this method is not the most efficient for two reasons:
+
+- Need to build the domain of size N.
+- The implementation is not trying to identify this special domain.
+
+Therefore the computation is typically $O(n^2)$ rather than $O(n \log n)$.
+See the 'device views' section for more details.
+:::
+
+### Manipulations
+
+Beyond arithmetic, the API supports efficient polynomial manipulations:
+
+#### Monomials
+
+```cpp
+// Monomial operations
+Polynomial& add_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+Polynomial& sub_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+```
+
+The ability to add or subtract monomials directly and in-place is an efficient way to manipulate polynomials.
+
+Example:
+
+```cpp
+f.add_monomial_inplace(scalar_t::from(5)); // f(x) += 5
+f.sub_monomial_inplace(scalar_t::from(3), 8); // f(x) -= 3x^8
+```
+
+#### Computing the degree of a Polynomial
+
+```cpp
+// Degree computation
+int64_t degree();
+```
+
+The degree of a polynomial is a fundamental characteristic that describes the highest power of the variable in the polynomial expression with a non-zero coefficient.
+The `degree()` function in the API returns the degree of the polynomial, corresponding to the highest exponent with a non-zero coefficient.
+
+- For the polynomial $f(x) = x^5 + 2x^3 + 4$, the degree is 5 because the highest power of $x$ with a non-zero coefficient is 5.
+- For a scalar value such as a constant term (e.g., $f(x) = 7$), the degree is considered 0, as it corresponds to $x^0$.
+- The degree of the zero polynomial, $f(x) = 0$, where there are no non-zero coefficients, is defined as -1. This special case often represents an "empty" or undefined state in many mathematical contexts.
+
+Example:
+
+```cpp
+auto f = /*some expression*/;
+auto degree_of_f = f.degree();
+```
+
+#### Slicing
+
+```cpp
+// Slicing and selecting even or odd components.
+Polynomial slice(uint64_t offset, uint64_t stride, uint64_t size = 0 /*0 means take all elements*/);
+Polynomial even();
+Polynomial odd();
+```
+
+The Polynomial API provides methods for slicing polynomials and selecting specific components, such as even or odd indexed terms. Slicing allows extracting specific sections of a polynomial based on an offset, stride, and size.
+
+The following examples demonstrate folding a polynomial's even and odd parts and arbitrary slicing:
+
+```cpp
+// folding a polynomial's even and odd parts with randomness
+auto x = rand();
+auto even = f.even();
+auto odd = f.odd();
+auto fold_poly = even + odd * x;
+
+// arbitrary slicing (first quarter)
+auto first_quarter = f.slice(0 /*offset*/, 1 /*stride*/, f.degree()/4 /*size*/);
+```
+
+### Memory access (copy/view)
+
+Access to the polynomial's internal state can be vital for operations like commitment schemes or when more efficient custom operations are necessary. This can be done either by copying or viewing the polynomial.
+
+#### Copying
+
+Copies the polynomial coefficients to either host or device allocated memory.
+
+:::note
+Copying to host memory is backend agnostic while copying to device memory requires the memory to be allocated on the corresponding backend.
+:::
+
+```cpp
+Coeff get_coeff(uint64_t idx) const; // copy single coefficient to host
+uint64_t copy_coeffs(Coeff* coeffs, uint64_t start_idx, uint64_t end_idx) const;
+```
+
+Example:
+
+```cpp
+auto coeffs_device = /*allocate CUDA or host memory*/
+f.copy_coeffs(coeffs_device, 0/*start*/, f.degree());
+  
+MSMConfig cfg = msm::defaultMSMConfig();
+cfg.are_scalars_on_device = true; // assuming the coefficients (MSM scalars) were copied to device memory
+auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);
+```
+
+#### Views
+
+The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state, such as coefficients or evaluations without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
+
+##### What is a Memory View?
+
+A memory view is essentially a pointer to data stored in device memory. By providing a direct access pathway to the data, it eliminates the need for data duplication, thus conserving both time and system resources. This is especially beneficial in high-performance computing environments where data size and operation speed are critical factors.
+
+##### Applications of Memory Views
+
+Memory views are extremely versatile and can be employed in various computational contexts such as:
+
+- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM), or for constructing Merkle trees without duplicating the underlying data.
+- **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.
+
+##### Obtaining and Using Views
+
+To create and use views within the Polynomial API, functions are provided to obtain pointers to both coefficients and evaluation data. Here’s how they are generally structured:
+
+```cpp
+// Obtain a view of the polynomial's coefficients
+std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
+// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
+std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
+get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
+```
+
+Example usage:
+
+```cpp
+auto [coeffs_view, size, device_id] = polynomial.get_coefficients_view();
+
+// Use coeffs_view in a computational routine that requires direct access to polynomial coefficients
+// Example: Passing the view to a GPU-accelerated function
+gpu_accelerated_function(coeffs_view.get(),...);
+```
+
+##### Integrity-Pointer: Managing Memory Views
+
+Within the Polynomial API, memory views are managed through a specialized tool called the Integrity-Pointer. This pointer type is designed to safeguard operations by monitoring the validity of the memory it points to. It can detect if the memory has been modified or released, thereby preventing unsafe access to stale or non-existent data.
+The Integrity-Pointer not only acts as a regular pointer but also provides additional functionality to ensure the integrity of the data it references. Here are its key features:
+
+```cpp
+// Checks whether the pointer is still considered valid
+bool isValid() const;
+
+// Retrieves the raw pointer or nullptr if pointer is invalid
+const T* get() const;
+
+// Dereferences the pointer. Throws exception if the pointer is invalid.
+const T& operator*() const;
+
+//Provides access to the member of the pointed-to object. Throws exception if the pointer is invalid.
+const T* operator->() const;
+```
+
+Consider the following case:
+
+```cpp
+auto [coeff_view, size, device] = f.get_coefficients_view();
+
+// Use the coefficients view to perform external operations
+commit_to_polynomial(coeff_view.get(), size);
+
+// Modification of the original polynomial
+f += g; // Any operation that modifies 'f' potentially invalidates 'coeff_view'
+
+// Check if the view is still valid before using it further
+if (coeff_view.isValid()) {
+    perform_additional_computation(coeff_view.get(), size);
+} else {
+    handle_invalid_data();
+}
+```
+
+#### Evaluations View: Accessing Polynomial Evaluations Efficiently
+
+The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
+This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
+
+:::warning
+Invalid request: requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
+:::
+
+```cpp
+// Assume a polynomial `p` of degree N
+auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
+
+// Use the evaluations view to perform further computations or visualizations
+process_polynomial_evaluations(evals_view.get(), size, device_id);
+```
+
+## Multi-GPU Support with CUDA Backend
+
+The Polynomial API includes comprehensive support for multi-GPU environments, a crucial feature for leveraging the full computational power of systems equipped with multiple NVIDIA GPUs. This capability is part of the API's CUDA backend, which is designed to efficiently manage polynomial computations across different GPUs.
+
+### Setting the CUDA Device
+
+Like other components of the ICICLE framework, the Polynomial API allows explicit setting of the current CUDA device:
+
+```cpp
+cudaSetDevice(int deviceID);
+```
+
+This function sets the active CUDA device. All subsequent operations that allocate or deal with polynomial data will be performed on this device.
+
+### Allocation Consistency
+
+Polynomials are always allocated on the current CUDA device at the time of their creation. It is crucial to ensure that the device context is correctly set before initiating any operation that involves memory allocation:
+
+```cpp
+// Set the device before creating polynomials
+cudaSetDevice(0);
+Polynomial p1 = Polynomial::from_coefficients(coeffs, size);
+
+cudaSetDevice(1);
+Polynomial p2 = Polynomial::from_coefficients(coeffs, size);
+```
+
+### Matching Devices for Operations
+
+When performing operations that result in the creation of new polynomials (such as addition or multiplication), it is imperative that both operands are on the same CUDA device. If the operands reside on different devices, an exception is thrown:
+
+```cpp
+// Ensure both operands are on the same device
+cudaSetDevice(0);
+auto p3 = p1 + p2; // Throws an exception if p1 and p2 are not on the same device
+```
+
+### Device-Agnostic Operations
+
+Operations that do not involve the creation of new polynomials, such as computing the degree of a polynomial or performing in-place modifications, can be executed regardless of the current device setting:
+
+```cpp
+// 'degree' and in-place operations do not depend on the current device setting
+int deg = p1.degree();
+p1 += p2; // Valid if p1 and p2 are on the same device, throws otherwise
+```
+
+### Error Handling
+
+The API is designed to throw exceptions if operations are attempted across polynomials that are not located on the same GPU. This ensures that all polynomial operations are performed consistently and without data integrity issues due to device mismatches.
+
+### Best Practices
+
+To maximize the performance and avoid runtime errors in a multi-GPU setup, always ensure that:
+
+- The CUDA device is set correctly before polynomial allocation.
+- Operations involving new polynomial creation are performed with operands on the same device.
+
+By adhering to these guidelines, developers can effectively harness the power of multiple GPUs to handle large-scale polynomial computations efficiently.
--- a/docs/docs/icicle/primitives/msm.md
+++ b/docs/docs/icicle/primitives/msm.md
@@ -49,13 +49,6 @@ Accelerating MSM is crucial to a ZK protocol's performance due to the [large per

 You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).

-## Supported curves
-
-MSM supports the following curves:
-
-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
-
-
 ## Supported Bindings

 - [Golang](../golang-bindings/msm.md)
@@ -81,16 +74,16 @@ Large Triangle Accumulation is a method for optimizing MSM which focuses on redu

 #### When should I use Large triangle accumulation?

-The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a <b>large batch of smaller MSM computations</b>.
+The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.

 ## MSM Modes

 ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`

-Batch MSM allows you to run many MSMs with a single API call, Single MSM will launch a single MSM computation.
+Batch MSM allows you to run many MSMs with a single API call, while Single MSM launches a single MSM computation.

 ### Which mode should I use?

-This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.
+This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs simultaneously, leveraging the parallel processing capabilities of GPUs.

 Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
--- a/docs/docs/icicle/primitives/ntt.md
+++ b/docs/docs/icicle/primitives/ntt.md
@@ -11,24 +11,19 @@ A_k = \sum_{n=0}^{N-1} a_n \cdot \omega^{nk} \mod p
 $$

 where:
+
 - $N$ is the size of the input sequence and is a power of 2,
 - $p$ is a prime number such that $p = kN + 1$ for some integer $k$, ensuring that $p$ supports the existence of $N$th roots of unity,
 - $\omega$ is a primitive $N$th root of unity modulo $p$, meaning $\omega^N \equiv 1 \mod p$ and no smaller positive power of $\omega$ is congruent to 1 modulo $p$,
 - $k$ ranges from 0 to $N-1$, and it indexes the output sequence.

-The NTT is particularly useful because it enables efficient polynomial multiplication under modulo arithmetic, crucial for algorithms in cryptographic protocols, and other areas requiring fast modular arithmetic operations. 
+NTT is particularly useful because it enables efficient polynomial multiplication under modulo arithmetic, crucial for algorithms in cryptographic protocols and other areas requiring fast modular arithmetic operations.

 There exists also INTT which is the inverse operation of NTT. INTT can take as input an output sequence of integers from an NTT and reconstruct the original sequence.

-# Using NTT
+## Using NTT

-### Supported curves
-
-NTT supports the following curves:
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
-
-## Supported Bindings
+### Supported Bindings

 - [Golang](../golang-bindings/ntt.md)
 - [Rust](../rust-bindings/ntt.md)
@@ -61,19 +56,17 @@ Choosing an algorithm is heavily dependent on your use case. For example Cooley-

 NTT also supports two different modes `Batch NTT` and `Single NTT`

-Batch NTT allows you to run many NTTs with a single API call, Single MSM will launch a single MSM computation.
-
 Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.

-**Single NTT Mode**
+#### Single NTT

- Choose this mode when your application requires processing individual NTT operations in isolation.
+Single NTT will launch a single NTT computation.

-**Batch NTT Mode**
+Choose this mode when your application requires processing individual NTT operations in isolation.

- Batch NTT mode can significantly reduce read/write as well as computation overhead by executing multiple NTT operations in parallel.
+#### Batch NTT Mode

- Batch mode may also offer better utilization of computational resources (memory and compute).
+Batch NTT allows you to run many NTTs with a single API call. Batch NTT mode can significantly reduce read/write times as well as computation overhead by executing multiple NTT operations in parallel. Batch mode may also offer better utilization of computational resources (memory and compute).

 ## Supported algorithms

@@ -90,8 +83,8 @@ At its core, the Radix-2 NTT algorithm divides the problem into smaller sub-prob
   The algorithm recursively divides the input sequence into smaller sequences. At each step, it separates the sequence into even-indexed and odd-indexed elements, forming two subsequences that are then processed independently.

 3. **Butterfly Operations:**
-   The core computational element of the Radix-2 NTT is the "butterfly" operation, which combines pairs of elements from the sequences obtained in the decomposition step. 
-   
+   The core computational element of the Radix-2 NTT is the "butterfly" operation, which combines pairs of elements from the sequences obtained in the decomposition step.
+
   Each butterfly operation involves multiplication by a "twiddle factor," which is a root of unity in the finite field, and addition or subtraction of the results, all performed modulo the prime modulus.

   $$
@@ -108,7 +101,6 @@ At its core, the Radix-2 NTT algorithm divides the problem into smaller sub-prob

   $k$ - The index of the current operation within the butterfly or the transform stage

-
   The twiddle factors are precomputed to save runtime and improve performance.

 4. **Bit-Reversal Permutation:**
@@ -116,7 +108,7 @@ At its core, the Radix-2 NTT algorithm divides the problem into smaller sub-prob

 ### Mixed Radix

-The Mixed Radix NTT algorithm extends the concepts of the Radix-2 algorithm by allowing the decomposition of the input sequence based on various factors of its length. Specifically ICICLEs implementation splits the input into blocks of sizes 16,32,64 compared to radix2 which is always splitting such that we end with NTT of size 2. This approach offers enhanced flexibility and efficiency, especially for input sizes that are composite numbers, by leveraging the "divide and conquer" strategy across multiple radixes.
+The Mixed Radix NTT algorithm extends the concepts of the Radix-2 algorithm by allowing the decomposition of the input sequence based on various factors of its length. Specifically, ICICLE's implementation splits the input into blocks of size 16, 32, or 64, whereas Radix-2 always splits the input until it ends with NTTs of size 2. This approach offers enhanced flexibility and efficiency, especially for input sizes that are composite numbers, by leveraging the "divide and conquer" strategy across multiple radices.

 The NTT blocks in Mixed Radix are implemented more efficiently based on winograd NTT but also optimized memory and register usage is better compared to Radix-2.

@@ -126,11 +118,11 @@ Mixed Radix can reduce the number of stages required to compute for large inputs
   The input to the Mixed Radix NTT is a sequence of integers $a_0, a_1, \ldots, a_{N-1}$, where $N$ is not strictly required to be a power of two. Instead, $N$ can be any composite number, ideally factorized into primes or powers of primes.

 2. **Factorization and Decomposition:**
-   Unlike the Radix-2 algorithm, which strictly divides the computational problem into halves, the Mixed Radix NTT algorithm implements a flexible decomposition approach which isn't limited to prime factorization. 
-   
+   Unlike the Radix-2 algorithm, which strictly divides the computational problem into halves, the Mixed Radix NTT algorithm implements a flexible decomposition approach which isn't limited to prime factorization.
+
   For example, an NTT of size 256 can be decomposed into two stages of $16 \times \text{NTT}_{16}$, leveraging a composite factorization strategy rather than decomposing into eight stages of $\text{NTT}_{2}$. This exemplifies the use of composite factors (in this case, $256 = 16 \times 16$) to apply smaller NTT transforms, optimizing computational efficiency by adapting the decomposition strategy to the specific structure of $N$.

-3. **Butterfly Operations with Multiple Radixes:**
+3. **Butterfly Operations with Multiple Radices:**
   The Mixed Radix algorithm utilizes butterfly operations for various radix sizes. Each sub-transform involves specific butterfly operations characterized by multiplication with twiddle factors appropriate for the radix in question.

   The generalized butterfly operation for a radix-$r$ element can be expressed as:
@@ -139,7 +131,15 @@ Mixed Radix can reduce the number of stages required to compute for large inputs
   X_{k,r} = \sum_{j=0}^{r-1} (A_{j,k} \cdot W^{jk}) \mod p
   $$

-   where $X_{k,r}$ is the output of the $radix-r$ butterfly operation for the $k-th$ set of inputs, $A_{j,k}$ represents the $j-th$ input element for the $k-th$ operation, $W$ is the twiddle factor, and $p$ is the prime modulus.
+   where:
+
+   $X_{k,r}$ - is the output of the radix-$r$ butterfly operation for the $k$-th set of inputs
+
+   $A_{j,k}$ - represents the $j$-th input element for the $k$-th operation
+
+   $W$ - is the twiddle factor
+
+   $p$ - is the prime modulus

 4. **Recombination and Reordering:**
   After applying the appropriate butterfly operations across all decomposition levels, the Mixed Radix algorithm recombines the results into a single output sequence. Due to the varied sizes of the sub-transforms, a more complex reordering process may be required compared to Radix-2. This involves digit-reversal permutations to ensure that the final output sequence is correctly ordered.
@@ -154,6 +154,6 @@ Mixed radix on the other hand works better for larger NTTs with larger input siz

 Performance really depends on logn size, batch size, ordering, inverse, coset, coeff-field and which GPU you are using.

-For this reason we implemented our [heuristic auto-selection](https://github.com/ingonyama-zk/icicle/blob/774250926c00ffe84548bc7dd97aea5227afed7e/icicle/appUtils/ntt/ntt.cu#L474) which should choose the most efficient algorithm in most cases. 
+For this reason we implemented our [heuristic auto-selection](https://github.com/ingonyama-zk/icicle/blob/main/icicle/src/ntt/ntt.cu#L573) which should choose the most efficient algorithm in most cases.

 We still recommend you benchmark for your specific use case if you think a different configuration would yield better results.
--- a/docs/docs/icicle/primitives/poseidon.md
+++ b/docs/docs/icicle/primitives/poseidon.md
@@ -8,39 +8,38 @@ Poseidon has been used in many popular ZK protocols such as Filecoin and [Plonk]

 Our implementation of Poseidon is implemented in accordance with the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/).

-Let understand how Poseidon works.
+Let's understand how Poseidon works.

-### Initialization
+## Initialization

-Poseidon starts with the initialization of its internal state, which is composed of the input elements and some pregenerated constants. An initial round constant is added to each element of the internal state. Adding The round constants ensure the state is properly mixed from the outset.
+Poseidon starts with the initialization of its internal state, which is composed of the input elements and some pre-generated constants. An initial round constant is added to each element of the internal state. Adding the round constants ensures the state is properly mixed from the beginning.

 This is done to prevent collisions and to prevent certain cryptographic attacks by ensuring that the internal state is sufficiently mixed and unpredictable.

 ![Alt text](image.png)

-### Applying full and partial rounds
+## Applying full and partial rounds

-To generate a secure hash output, the algorithm goes through a series of "full rounds" and "partial rounds" as well as transformations between these sets of rounds.
+To generate a secure hash output, the algorithm goes through a series of "full rounds" and "partial rounds" as well as transformations between these sets of rounds in the following order:

-First full rounds => apply SBox and Round constants => partial rounds => Last full rounds => Apply SBox
+```First full rounds -> apply S-box and Round constants -> partial rounds -> Last full rounds -> Apply S-box```

-#### Full rounds
+### Full rounds

 ![Alt text](image-1.png)

-**Uniform Application of S-Box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The functions S-box involves raising each element of the state to a certain power denoted by `α` a member of the finite field defined by the prime `p`, `α` can be different depending on the the implementation and user configuration.
+**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The S-box function involves raising each element of the state to a certain power denoted by `α`, a member of the finite field defined by the prime `p`; `α` can be different depending on the implementation and user configuration.

 **Linear Transformation:** After applying the S-box, a linear transformation is performed on the state. This involves multiplying the state by an MDS (Maximum Distance Separable) matrix, which further diffuses the transformations applied by the S-box across the entire state.

 **Addition of Round Constants:** Each element of the state is then modified by adding a unique round constant. These constants are different for each round and are precomputed as part of the hash function's initialization. The addition of round constants ensures that even minor changes to the input produce significant differences in the output.

-#### Partial Rounds
+### Partial Rounds

 **Selective Application of S-Box:** Partial rounds apply the S-box transformation to only one element of the internal state per round, rather than to all elements. This selective application significantly reduces the computational complexity of the hash function without compromising its security. The choice of which element to apply the S-box to can follow a specific pattern or be fixed, depending on the design of the hash function.

 **Linear Transformation and Round Constants:** A linear transformation is performed and round constants are added. The linear transformation in partial rounds can be designed to be less computationally intensive (this is done by using a sparse matrix) than in full rounds, further optimizing the function's efficiency.

-
 The user of Poseidon can often choose how many partial or full rounds he wishes to apply; more full rounds will increase security but degrade performance. The choice and balance is highly dependent on the use case.

 ![Alt text](image-2.png)
@@ -52,25 +51,20 @@ What that means is we calculate multiple hash-sums over multiple pre-images in p

 So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elements of output. Which means each block would be of size 2 and that would result in 1024 Poseidon hashes being performed.

-### Supported API
+### Supported Bindings

-[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon), [`C++`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/poseidon)
-
-### Supported curves
-
-Poseidon supports the following curves:
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
+[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)

 ### Constants

 Poseidon is extremely customizable and using different constants will produce different hashes, security levels and performance results.

-We support pre-calculated and optimized constants for each of the [supported curves](#supported-curves).The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
+We support pre-calculated and optimized constants for each of the [supported curves](#supported-curves). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.

-If you wish to generate your own constants you can use our python script which can be found [here](https://github.com/ingonyama-zk/icicle/blob/b6dded89cdef18348a5d4e2748b71ce4211c63ad/icicle/appUtils/poseidon/constants/generate_parameters.py#L1).
+If you wish to generate your own constants you can use our python script which can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants/generate_parameters.py).

 Prerequisites:
+
 - Install python 3
 - `pip install poseidon-hash`
 - `pip install galois==0.3.7`
@@ -97,7 +91,7 @@ primitive_element = 7 # bls12-381
 # primitive_element = 15 # bw6-761
 ```

-We only support `alpha = 5` so if you want to use another alpha for SBox please reach out on discord or open a github issue.
+We only support `alpha = 5`, so if you want to use another alpha for the S-box, please reach out on Discord or open a GitHub issue.

 ### Rust API

@@ -128,8 +122,7 @@ poseidon_hash_many::<F>(

 The `PoseidonConfig::default()` can be modified, by default the inputs and outputs are set to be on `Host` for example.

-
-```
+```rust
 impl<'a> Default for PoseidonConfig<'a> {
    fn default() -> Self {
        let ctx = get_default_device_context();
@@ -174,11 +167,10 @@ let ctx = get_default_device_context();
    )
    .unwrap();
 ```
-For more examples using different configurations refer here.

 ## The Tree Builder

-The tree builder allows you to build Merkle trees using Poseidon. 
+The tree builder allows you to build Merkle trees using Poseidon.

 You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.

@@ -206,9 +198,9 @@ Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig:
 - `are_inputs_on_device`: Have the inputs been loaded to device memory ?
 - `is_async`: Should the TreeBuilder run asynchronously? `False` will block the current CPU thread. `True` will require you call `cudaStreamSynchronize` or `cudaDeviceSynchronize` to retrieve the result.

-### Benchmarks 
+### Benchmarks

-We ran the Poseidon tree builder on: 
+We ran the Poseidon tree builder on:

 **CPU**: 12th Gen Intel(R) Core(TM) i9-12900K/

@@ -218,9 +210,8 @@ We ran the Poseidon tree builder on:

 The benchmarks include copying data from and to the device.

-
 | Rows to keep parameter      | Run time, Icicle | Supranational PC2
-| ----------- | ----------- | ----------- |  
+| ----------- | ----------- | -----------
 | 10          | 9.4 seconds       |    13.6 seconds
 | 20          | 9.5 seconds       |    13.6 seconds
 | 29          | 13.7 seconds       |    13.6 seconds
--- a/docs/docs/icicle/rust-bindings.md
+++ b/docs/docs/icicle/rust-bindings.md
@@ -12,7 +12,7 @@ Rust bindings allow you to use ICICLE as a rust library.

 Simply add the following to your `Cargo.toml`.

-```
+```toml
 # GPU Icicle integration
 icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git" }
 icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git" }
@@ -25,7 +25,7 @@ If you wish to point to a specific ICICLE branch add `branch = "<name_of_branch>

 When you build your project ICICLE will be built as part of the build command.

-# How do the rust bindings work?
+## How do the rust bindings work?

 The rust bindings are just rust wrappers for ICICLE Core static libraries which can be compiled. We integrate the compilation of the static libraries into rusts toolchain to make usage seamless and easy. This is achieved by [extending rusts build command](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs).

@@ -55,3 +55,33 @@ fn main() {
    println!("cargo:rustc-link-lib=cudart");
 }
 ```
+
+## Supported curves, fields and operations
+
+### Supported curves and operations
+
+| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
+| --- | :---: | :---: | :---: | :---: | :---: |
+| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
+| G2  | ✅ | ✅ | ✅ | ✅ | ❌ |
+| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
+| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
+| Poseidon | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Merkle Tree | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+### Supported fields and operations
+
+| Operation\Field | babybear | stark252 |
+| --- | :---: | :---: |
+| VecOps | ✅ | ✅ |
+| Polynomials | ✅ | ✅ |
+| NTT | ✅ | ✅ |
+| Extension Field | ✅ | ❌ |
+
+### Supported hashes
+
+| Hash | Sizes |
+| --- | :---: |
+| Keccak | 256, 512 |
--- a/docs/docs/icicle/rust-bindings/ecntt.md
+++ b/docs/docs/icicle/rust-bindings/ecntt.md
@@ -0,0 +1,31 @@
+# ECNTT
+
+## ECNTT Method
+
+The `ecntt` function computes the Elliptic Curve Number Theoretic Transform (EC-NTT) or its inverse on a batch of points of a curve.
+
+```rust
+pub fn ecntt<C: Curve>(
+    input: &(impl HostOrDeviceSlice<Projective<C>> + ?Sized),
+    dir: NTTDir,
+    cfg: &NTTConfig<C::ScalarField>,
+    output: &mut (impl HostOrDeviceSlice<Projective<C>> + ?Sized),
+) -> IcicleResult<()>
+where
+    C::ScalarField: FieldImpl,
+    <C::ScalarField as FieldImpl>::Config: ECNTT<C>,
+{
+    // ... function implementation ...
+}
+```
+
+## Parameters
+
+- **`input`**: The input data as a slice of `Projective<C>`. This represents points on a specific elliptic curve `C`.
+- **`dir`**: The direction of the NTT. It can be `NTTDir::kForward` for forward NTT or `NTTDir::kInverse` for inverse NTT.
+- **`cfg`**: The NTT configuration object of type `NTTConfig<C::ScalarField>`. This object specifies parameters for the NTT computation, such as the batch size and algorithm to use.
+- **`output`**: The output buffer to write the results into. This should be a slice of `Projective<C>` with the same size as the input.
+
+## Return Value
+
+- **`IcicleResult<()>`**: This function returns an `IcicleResult` which is a wrapper type that indicates success or failure of the ECNTT computation. On success, it contains `Ok(())`.
--- a/docs/docs/icicle/rust-bindings/msm-pre-computation.md
+++ b/docs/docs/icicle/rust-bindings/msm-pre-computation.md
@@ -0,0 +1,58 @@
+# MSM Pre computation
+
+To understand the theory behind the MSM pre-computation technique, refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
+
+## `precompute_bases`
+
+Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
+
+```rust
+pub fn precompute_bases<C: Curve + MSM<C>>(
+    points: &HostOrDeviceSlice<Affine<C>>,
+    precompute_factor: i32,
+    _c: i32,
+    ctx: &DeviceContext,
+    output_bases: &mut HostOrDeviceSlice<Affine<C>>,
+) -> IcicleResult<()>
+```
+
+### Parameters
+
+- **`points`**: The original set of affine points ($P_1, P_2, ..., P_n$) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
+- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
+- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
+- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
+- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
+
+#### Returns
+
+`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
+
+#### Description
+
+This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(\text{precompute\_factor} - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
+
+The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
+
+#### Example Usage
+
+```rust
+let device_context = DeviceContext::default_for_device(0); // Use the default device
+let precompute_factor = 4; // Number of points to precompute
+let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
+
+// Precompute the bases using the specified factor
+precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
+    .expect("Failed to precompute bases");
+```
+
+### Benchmarks
+
+Benchmarks were performed on an Nvidia RTX 3090 Ti.
+
+| Pre-computation factor | bn254 size `2^20` MSM, ms.  | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
+| ------------- | ------------- | ------------- | ------------- | ------------- |
+| 1  | 14.1  | 82.8  | 25.5  | 136.7  |
+| 2  | 11.8  | 76.6  | 20.3  | 123.8  |
+| 4  | 10.9  | 73.8  | 18.1  | 117.8  |
+| 8  | 10.6  | 73.7  | 17.2  | 116.0  |
--- a/docs/docs/icicle/rust-bindings/msm.md
+++ b/docs/docs/icicle/rust-bindings/msm.md
@@ -1,9 +1,5 @@
 # MSM

-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
-
 ## Example

 ```rust
@@ -84,7 +80,7 @@ pub struct MSMConfig<'a> {
 ```

 - **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
- **`point_size: i32`**: 
+- **`point_size: i32`**:
 - **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
 - **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
 - **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
@@ -120,7 +116,6 @@ msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();

 You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).

-
 ## How do I toggle between MSM modes?

 Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
@@ -136,7 +131,6 @@ msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();

 In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.

-
 In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.

 ```rust
@@ -152,7 +146,7 @@ Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961

 ## Support for G2 group

-MSM also supports G2 group. 
+MSM also supports G2 group.

 Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.

--- a/docs/docs/icicle/rust-bindings/multi-gpu.md
+++ b/docs/docs/icicle/rust-bindings/multi-gpu.md
@@ -62,11 +62,11 @@ Sets the current CUDA device by its ID, when calling `set_device` it will set th

 **Parameters:**

- `device_id: usize`: The ID of the device to set as the current device. Device IDs start from 0.
+- **`device_id: usize`**: The ID of the device to set as the current device. Device IDs start from 0.

 **Returns:**

- `CudaResult<()>`: An empty result indicating success if the device is set successfully. In case of failure, returns a `CudaError`.
+- **`CudaResult<()>`**: An empty result indicating success if the device is set successfully. In case of failure, returns a `CudaError`.

 **Errors:**

@@ -88,7 +88,7 @@ Retrieves the number of CUDA devices available on the machine.

 **Returns:**

- `CudaResult<usize>`: The number of available CUDA devices. On success, contains the count of CUDA devices. On failure, returns a `CudaError`.
+- **`CudaResult<usize>`**: The number of available CUDA devices. On success, contains the count of CUDA devices. On failure, returns a `CudaError`.

 **Errors:**

@@ -109,7 +109,7 @@ Retrieves the ID of the current CUDA device.

 **Returns:**

- `CudaResult<usize>`: The ID of the current CUDA device. On success, contains the device ID. On failure, returns a `CudaError`.
+- **`CudaResult<usize>`**: The ID of the current CUDA device. On success, contains the device ID. On failure, returns a `CudaError`.

 **Errors:**

@@ -191,7 +191,7 @@ Validates that the specified `device_id` matches the ID of the currently active

 #### Behavior

- **Panics** if the `device_id` does not match the active device's ID, preventing cross-device operation errors.
+- **`Panics`** if the `device_id` does not match the active device's ID, preventing cross-device operation errors.

 #### Example

--- a/docs/docs/icicle/rust-bindings/ntt.md
+++ b/docs/docs/icicle/rust-bindings/ntt.md
@@ -1,10 +1,6 @@
 # NTT

-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
-
-## Example 
+## Example

 ```rust
 use icicle_bn254::curve::{ScalarCfg, ScalarField};
@@ -29,7 +25,7 @@ fn main() {
    // Create a CUDA stream
    let stream = CudaStream::create().expect("Failed to create CUDA stream");
    let ctx = DeviceContext::default(); // Assuming default device context
-    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();

    // Configure NTT
    let mut cfg = ntt::NTTConfig::default();
@@ -61,14 +57,13 @@ pub fn ntt<F>(

 `ntt::ntt` expects:

-`input` - buffer to read the inputs of the NTT from. <br/>
-`dir` - whether to compute forward or inverse NTT. <br/>
-`cfg` - config used to specify extra arguments of the NTT. <br/>
-`output` - buffer to write the NTT outputs into. Must be of the same  size as input.
+- **`input`** - buffer to read the inputs of the NTT from.
+- **`dir`** - whether to compute forward or inverse NTT.
+- **`cfg`** - config used to specify extra arguments of the NTT.
+- **`output`** - buffer to write the NTT outputs into. Must be of the same size as input.

 The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.

-
 ### NTT Config

 ```rust
@@ -76,6 +71,7 @@ pub struct NTTConfig<'a, S> {
    pub ctx: DeviceContext<'a>,
    pub coset_gen: S,
    pub batch_size: i32,
+    pub columns_batch: bool,
    pub ordering: Ordering,
    are_inputs_on_device: bool,    
    are_outputs_on_device: bool,
@@ -94,6 +90,8 @@ The `NTTConfig` struct is a configuration object used to specify parameters for

 - **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.

+- **`columns_batch`**: If true, the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+
 - **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.

 - **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
@@ -104,8 +102,7 @@ The `NTTConfig` struct is a configuration object used to specify parameters for

 - **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
 `Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
-`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to 
-
+`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to use that specific algorithm.

 #### Usage

@@ -122,6 +119,7 @@ let custom_config = NTTConfig {
    ctx: custom_device_context,
    coset_gen: my_coset_generator,
    batch_size: 10,
+    columns_batch: false,
    ordering: Ordering::kRN,
    are_inputs_on_device: true,
    are_outputs_on_device: true,
@@ -130,7 +128,6 @@ let custom_config = NTTConfig {
 };
 ```

-
 ### Modes

 NTT supports two different modes `Batch NTT` and `Single NTT`
@@ -151,13 +148,13 @@ Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your
 Before performing NTT operations, its necessary to initialize the NTT domain, It only needs to be called once per GPU since the twiddles are cached.

 ```rust
-ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
 ```

 ### `initialize_domain`

 ```rust
-pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
+pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext, fast_twiddles: bool) -> IcicleResult<()>
 where
    F: FieldImpl,
    <F as FieldImpl>::Config: NTT<F>;
@@ -173,23 +170,31 @@ where

 - **`IcicleResult<()>`**: Will return an error if the operation fails.

-### `initialize_domain_fast_twiddles_mode`
+#### Parameters

-Similar to `initialize_domain`, `initialize_domain_fast_twiddles_mode` is a faster implementation and can be used for larger NTTs.
+- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
+
+- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
+
+#### Returns
+
+- **`IcicleResult<()>`**: Will return an error if the operation fails.
+
+### Releasing the domain
+
+The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.

 ```rust
-pub fn initialize_domain_fast_twiddles_mode<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
+pub fn release_domain<F>(ctx: &DeviceContext) -> IcicleResult<()>
 where
    F: FieldImpl,
-    <F as FieldImpl>::Config: NTT<F>;
+    <F as FieldImpl>::Config: NTT<F>
 ```

 #### Parameters

- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
-
 - **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.

 #### Returns

- **`IcicleResult<()>`**: Will return an error if the operation fails.
+The function returns an `IcicleResult<()>`, which represents the result of the operation. If the operation is successful, the function returns `Ok(())`, otherwise it returns an error.
--- a/docs/docs/icicle/rust-bindings/polynomials.md
+++ b/docs/docs/icicle/rust-bindings/polynomials.md
@@ -0,0 +1,279 @@
+# Rust FFI Bindings for Univariate Polynomial
+
+:::note
+Please refer to the Polynomials overview page for an in-depth overview. This section is a brief description of the Rust FFI bindings.
+:::
+
+This documentation is designed to provide developers with a clear understanding of how to utilize the Rust bindings for polynomial operations efficiently and effectively, leveraging the robust capabilities of both Rust and C++ in their applications.
+
+## Introduction
+
+The Rust FFI bindings for the Univariate Polynomial serve as a "shallow wrapper" around the underlying C++ implementation. These bindings provide a straightforward Rust interface that directly calls functions from a C++ library, effectively bridging Rust and C++ operations. The Rust layer handles simple interface translations without delving into complex logic or data structures, which are managed on the C++ side. This design ensures efficient data handling, memory management, and execution of polynomial operations directly via C++.
+Currently, these bindings are tailored specifically for polynomials where the coefficients, domain, and images are represented as scalar fields.
+
+## Initialization Requirements
+
+Before utilizing any functions from the polynomial API, it is mandatory to initialize the appropriate polynomial backend (e.g., CUDA). Additionally, the NTT (Number Theoretic Transform) domain must also be initialized, as the CUDA backend relies on this for certain operations. Failing to properly initialize these components can result in errors.
+
+:::note
+**Field-Specific Initialization Requirement**
+
+The ICICLE library is structured such that each field or curve has its dedicated library implementation. As a result, initialization must be performed individually for each field or curve to ensure the correct setup and functionality of the library.
+:::
+
+## Core Trait: `UnivariatePolynomial`
+
+The `UnivariatePolynomial` trait encapsulates the essential functionalities required for managing univariate polynomials in the Rust ecosystem. This trait standardizes the operations that can be performed on polynomials, regardless of the underlying implementation details. It allows for a unified approach to polynomial manipulation, providing a suite of methods that are fundamental to polynomial arithmetic.
+
+### Trait Definition
+
+```rust
+pub trait UnivariatePolynomial
+where
+    Self::Field: FieldImpl,
+    Self::FieldConfig: FieldConfig,
+{
+    type Field: FieldImpl;
+    type FieldConfig: FieldConfig;
+
+    // Methods to create polynomials from coefficients or roots-of-unity evaluations.
+    fn from_coeffs<S: HostOrDeviceSlice<Self::Field> + ?Sized>(coeffs: &S, size: usize) -> Self;
+    fn from_rou_evals<S: HostOrDeviceSlice<Self::Field> + ?Sized>(evals: &S, size: usize) -> Self;
+
+    // Method to divide this polynomial by another, returning quotient and remainder.
+    fn divide(&self, denominator: &Self) -> (Self, Self) where Self: Sized;
+
+    // Method to divide this polynomial by the vanishing polynomial 'X^N-1'.
+    fn div_by_vanishing(&self, degree: u64) -> Self;
+
+    // Methods to add or subtract a monomial in-place.
+    fn add_monomial_inplace(&mut self, monomial_coeff: &Self::Field, monomial: u64);
+    fn sub_monomial_inplace(&mut self, monomial_coeff: &Self::Field, monomial: u64);
+
+    // Method to slice the polynomial, creating a sub-polynomial.
+    fn slice(&self, offset: u64, stride: u64, size: u64) -> Self;
+
+    // Methods to return new polynomials containing only the even or odd terms.
+    fn even(&self) -> Self;
+    fn odd(&self) -> Self;
+
+    // Method to evaluate the polynomial at a given domain point.
+    fn eval(&self, x: &Self::Field) -> Self::Field;
+
+    // Method to evaluate the polynomial over a domain and store the results.
+    fn eval_on_domain<D: HostOrDeviceSlice<Self::Field> + ?Sized, E: HostOrDeviceSlice<Self::Field> + ?Sized>(
+        &self,
+        domain: &D,
+        evals: &mut E,
+    );
+
+    // Method to retrieve a coefficient at a specific index.
+    fn get_coeff(&self, idx: u64) -> Self::Field;
+
+    // Method to copy coefficients into a provided slice.
+    fn copy_coeffs<S: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, start_idx: u64, coeffs: &mut S);
+
+    // Method to get the degree of the polynomial.
+    fn degree(&self) -> i64;
+}
+```
+
+## `DensePolynomial` Struct
+
+The DensePolynomial struct represents a dense univariate polynomial in Rust, leveraging a handle to manage its underlying memory within the CUDA device context. This struct acts as a high-level abstraction over complex C++ memory management practices, facilitating the integration of high-performance polynomial operations through Rust's Foreign Function Interface (FFI) bindings.
+
+```rust
+pub struct DensePolynomial {
+    handle: PolynomialHandle,
+}
+```
+
+### Traits implementation and methods
+
+#### `Drop`
+
+Ensures proper resource management by releasing the CUDA memory when a DensePolynomial instance goes out of scope. This prevents memory leaks and ensures that resources are cleaned up correctly, adhering to Rust's RAII (Resource Acquisition Is Initialization) principles.
+
+#### `Clone`
+
+Provides a way to create a new instance of a DensePolynomial with its own unique handle, thus duplicating the polynomial data in the CUDA context. Cloning is essential since the DensePolynomial manages external resources, which cannot be safely shared across instances without explicit duplication.
+
+#### Operator Overloading: `Add`, `Sub`, `Mul`, `Rem`, `Div`
+
+These traits are implemented for references to DensePolynomial (i.e., &DensePolynomial), enabling natural mathematical operations such as addition (+), subtraction (-), multiplication (*), division (/), and remainder (%). This syntactic convenience allows users to compose complex polynomial expressions in a way that is both readable and expressive.
+
+#### Key Methods
+
+In addition to the traits, the following methods are implemented:
+
+```rust
+impl DensePolynomial {
+    pub fn init_cuda_backend() -> bool {...}
+    // Returns a mutable slice of the polynomial coefficients on the device
+    pub fn coeffs_mut_slice(&mut self) -> &mut DeviceSlice<F> {...}
+}      
+```
+
+## Flexible Memory Handling With `HostOrDeviceSlice`
+
+The DensePolynomial API is designed to accommodate a wide range of computational environments by supporting both host and device memory through the `HostOrDeviceSlice` trait. This approach ensures that polynomial operations can be seamlessly executed regardless of where the data resides, making the API highly adaptable and efficient for various hardware configurations.
+
+### Overview of `HostOrDeviceSlice`
+
+The HostOrDeviceSlice is a Rust trait that abstracts over slices of memory that can either be on the host (CPU) or the device (GPU), as managed by CUDA. This abstraction is crucial for high-performance computing scenarios where data might need to be moved between different memory spaces depending on the operations being performed and the specific hardware capabilities available.
+
+### Usage in API Functions
+
+Functions within the DensePolynomial API that deal with polynomial coefficients or evaluations use the HostOrDeviceSlice trait to accept inputs. This design allows the functions to be agnostic of the actual memory location of the data, whether it's in standard system RAM accessible by the CPU or in GPU memory accessible by CUDA cores.
+
+```rust
+// Assume `coeffs` could either be in host memory or CUDA device memory
+let coeffs: DeviceSlice<F> = DeviceVec::<F>::cuda_malloc(coeffs_len).unwrap();
+let p_from_coeffs = PolynomialBabyBear::from_coeffs(&coeffs, coeffs.len());
+
+// Similarly for evaluations from roots of unity
+let evals: HostSlice<F> = HostSlice::from_slice(&host_memory_evals);
+let p_from_evals = PolynomialBabyBear::from_rou_evals(&evals, evals.len());
+
+// Same applies for any API that accepts HostOrDeviceSlice
+```
+
+## Usage
+
+This section outlines practical examples demonstrating how to utilize the `DensePolynomial` Rust API. The API is flexible, supporting multiple scalar fields. Below are examples showing how to use polynomials defined over different fields and perform a variety of operations.
+
+### Initialization and Basic Operations
+
+First, choose the appropriate field implementation for your polynomial operations, initializing the CUDA backend if necessary:
+
+```rust
+use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
+
+// Initialize the CUDA backend for polynomial operations
+PolynomialBabyBear::init_cuda_backend();
+let f = PolynomialBabyBear::from_coeffs(...);
+
+// now use f by calling the implemented traits
+
+// For operations over another field, such as BN254
+use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
+// Use PolynomialBn254 similarly
+```
+
+### Creation
+
+Polynomials can be created from coefficients or evaluations:
+
+```rust
+let coeffs = ...;
+let p_from_coeffs = PolynomialBabyBear::from_coeffs(HostSlice::from_slice(&coeffs), size);
+
+let evals = ...;
+let p_from_evals = PolynomialBabyBear::from_rou_evals(HostSlice::from_slice(&evals), size);
+
+```
+
+### Arithmetic Operations
+
+Utilize overloaded operators for intuitive mathematical expressions:
+
+```rust
+let add = &f + &g;  // Addition
+let sub = &f - &g;  // Subtraction
+let mul = &f * &g;  // Multiplication
+let mul_scalar = &f * &scalar;  // Scalar multiplication
+```
+
+### Division and Remainder
+
+Compute quotient and remainder or perform division by a vanishing polynomial:
+
+```rust
+let (q, r) = f.divide(&g);  // Compute both quotient and remainder
+let q = &f / &g;  // Quotient
+let r = &f % &g;  // Remainder
+
+let h = f.div_by_vanishing(N);  // Division by V(x) = X^N - 1
+
+```
+
+### Monomial Operations
+
+Add or subtract monomials in-place for efficient polynomial manipulation:
+
+```rust
+f.add_monomial_inplace(&three, 1 /*monomial*/); // Adds 3*x to f
+f.sub_monomial_inplace(&one, 0 /*monomial*/);   // Subtracts 1 from f
+```
+
+### Slicing
+
+Extract specific components:
+
+```rust
+let even = f.even();  // Polynomial of even-indexed terms
+let odd = f.odd();    // Polynomial of odd-indexed terms
+let arbitrary_slice = f.slice(offset, stride, size);
+```
+
+### Evaluate
+
+Evaluate the polynomial:
+
+```rust
+let x = rand();  // Random field element
+let f_x = f.eval(&x);  // Evaluate f at x
+
+// Evaluate on a predefined domain
+let domain = [one, two, three];
+let mut host_evals = vec![ScalarField::zero(); domain.len()];
+f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));
+```
+
+### Read coefficients
+
+Read or copy polynomial coefficients for further processing:
+
+```rust
+let x_squared_coeff = f.get_coeff(2);  // Coefficient of x^2
+
+// Copy coefficients to a device-specific memory space
+let mut device_mem = DeviceVec::<Field>::cuda_malloc(coeffs.len()).unwrap();
+f.copy_coeffs(0, &mut device_mem[..]);
+```
+
+### Polynomial Degree
+
+Determine the highest power of the variable with a non-zero coefficient:
+
+```rust
+let deg = f.degree();  // Degree of the polynomial
+```
+
+### Memory Management: Views (rust slices)
+
+Rust enforces correct usage of views at compile time, eliminating the need for runtime checks:
+
+```rust
+let mut f = Poly::from_coeffs(HostSlice::from_slice(&coeffs), size);
+
+// Obtain a mutable slice of coefficients as a DeviceSlice
+let coeffs_slice_dev = f.coeffs_mut_slice();
+
+// Operations on f are restricted here due to mutable borrow of coeffs_slice_dev
+
+// Compute evaluations or perform other operations directly using the slice
+// example: evaluate f on a coset of roots-of-unity. Computing from GPU to HOST/GPU
+let mut config: NTTConfig<'_, F> = NTTConfig::default();
+config.coset_gen = /*some coset gen*/;
+let mut coset_evals = vec![F::zero(); coeffs_slice_dev.len()];
+ntt(
+    coeffs_slice_dev,
+    NTTDir::kForward,
+    &config,
+    HostSlice::from_mut_slice(&mut coset_evals),
+)
+.unwrap();
+
+// now f can be borrowed once again
+```
--- a/docs/docs/icicle/rust-bindings/vec-ops.md
+++ b/docs/docs/icicle/rust-bindings/vec-ops.md
@@ -1,13 +1,6 @@
 # Vector Operations API

-Our vector operations API which is part of `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. 
-
-
-## Supported curves
-
-Vector operations are supported on the following curves:
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
+Our vector operations API which is part of `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.

 ## Examples

@@ -59,7 +52,6 @@ let cfg = VecOpsConfig::default();
 mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
 ```

-
 ## Vector Operations Configuration

 The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
@@ -74,7 +66,6 @@ pub struct VecOpsConfig<'a> {
    is_a_on_device: bool,
    is_b_on_device: bool,
    is_result_on_device: bool,
-    is_result_montgomery_form: bool,
    pub is_async: bool,
 }
 ```
@@ -85,14 +76,13 @@ pub struct VecOpsConfig<'a> {
 - **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
 - **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
 - **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
- **`is_result_montgomery_form`**: Determines if the result should be in Montgomery form.
 - **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.

 ### Default Configuration

 `VecOpsConfig` can be initialized with default settings tailored for a specific device:

-```
+```rust
 let cfg = VecOpsConfig::default();
 ```

@@ -112,7 +102,6 @@ impl<'a> VecOpsConfig<'a> {
            is_a_on_device: false,
            is_b_on_device: false,
            is_result_on_device: false,
-            is_result_montgomery_form: false,
            is_async: false,
        }
    }
@@ -121,7 +110,7 @@ impl<'a> VecOpsConfig<'a> {

 ## Vector Operations

-Vector operations are implemented through the `VecOps` trait, these traits are implemented for all [supported curves](#supported-curves) providing methods for addition, subtraction, and multiplication of vectors.
+Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors.

 ### `VecOps` Trait

@@ -157,3 +146,62 @@ All operations are element-wise operations, and the results placed into the `res
 - **`add`**: Computes the element-wise sum of two vectors.
 - **`sub`**: Computes the element-wise difference between two vectors.
 - **`mul`**: Performs element-wise multiplication of two vectors.
+
+## MatrixTranspose API Documentation
+
+This section describes the functionality of the `transpose_matrix` function used for matrix transposition.
+
+The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
+
+### Function
+
+```rust
+pub fn transpose_matrix<F>(
+    input: &HostOrDeviceSlice<F>,
+    row_size: u32,
+    column_size: u32,
+    output: &mut HostOrDeviceSlice<F>,
+    ctx: &DeviceContext,
+    on_device: bool,
+    is_async: bool,
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: VecOps<F>
+```
+
+### Parameters
+
+- **`input`**: A slice representing the input matrix. The slice can be stored on either the host or the device.
+- **`row_size`**: The number of rows in the input matrix.
+- **`column_size`**: The number of columns in the input matrix.
+- **`output`**: A mutable slice to store the transposed matrix. The slice can be stored on either the host or the device.
+- **`ctx`**: A reference to the `DeviceContext`, which provides information about the device where the operation will be performed.
+- **`on_device`**: A boolean flag indicating whether the inputs and outputs are on the device.
+- **`is_async`**: A boolean flag indicating whether the operation should be performed asynchronously.
+
+### Return Value
+
+Returns an `IcicleResult<()>`: `Ok(())` if the operation is successful, or an error otherwise.
+
+### Example
+
+```rust
+use icicle::HostOrDeviceSlice;
+use icicle::DeviceContext;
+use icicle::FieldImpl;
+use icicle::VecOps;
+
+let input: HostOrDeviceSlice<i32> = // ...;
+let mut output: HostOrDeviceSlice<i32> = // ...;
+let ctx: DeviceContext = // ...;
+
+transpose_matrix(&input, 5, 4, &mut output, &ctx, true, false)
+    .expect("Failed to transpose matrix");
+```
+
+The function takes a matrix represented as a 1D slice, transposes it, and stores the result in another 1D slice. The input and output slices can be stored on either the host or the device, and the operation can be performed synchronously or asynchronously.
+
+The function is generic and can work with any type `F` that implements the `FieldImpl` trait. The `<F as FieldImpl>::Config` type must also implement the `VecOps<F>` trait, which provides the `transpose` method used to perform the actual transposition.
+
+The function returns an `IcicleResult<()>`, indicating whether the operation was successful or not.
--- a/docs/docs/icicle/supporting-additional-curves.md
+++ b/docs/docs/icicle/supporting-additional-curves.md
@@ -1,117 +0,0 @@
-# Supporting Additional Curves
-
-We understand the need for ZK developers to use different curves, some common some more exotic. For this reason we designed ICICLE to allow developers to add any curve they desire.
-
-## ICICLE Core
-
-ICICLE core is very generic by design so all algorithms and primitives are designed to work based of configuration files [selected during compile](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh) time. This is why we compile ICICLE Core per curve.
-
-To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
-
-### Adding curve_name_params.cuh
-
-Start by copying `bn254_params.cuh` contents in your params file. Params should include:
- - **fq_config** - parameters of the Base field.
-    - **limbs_count** - `ceil(field_byte_size / 4)`.
-    - **modulus_bit_count** - bit-size of the modulus.
-    - **num_of_reductions** - the number of times to reduce in reduce function. Use 2 if not sure.
-    - **modulus** - modulus of the field.
-    - **modulus_2** - modulus * 2.
-    - **modulus_4** - modulus * 4. 
-    - **neg_modulus** - negated modulus. 
-    - **modulus_wide** - modulus represented as a double-sized integer.
-    - **modulus_squared** - modulus**2 represented as a double-sized integer.
-    - **modulus_squared_2** - 2 * modulus**2 represented as a double-sized integer.
-    - **modulus_squared_4** - 4 * modulus**2 represented as a double-sized integer.
-    - **m** - value used in multiplication. Can be computed as `2**(2*modulus_bit_count) // modulus`. 
-    - **one** - multiplicative identity. 
-    - **zero** - additive identity. 
-    - **montgomery_r** - `2 ** M % modulus` where M is a closest (larger than) bitsize multiple of 32. E.g. 384 or 768 for bls and bw curves respectively
-    - **montgomery_r_inv** - `2 ** (-M) % modulus`
- - **fp_config** - parameters of the Scalar field.
-    Same as fq_config, but with additional arguments:
-    - **omegas_count** - [two-adicity](https://cryptologie.net/article/559/whats-two-adicity/) of the field. And thus the maximum size of NTT.
-    - **omegas** - an array of omegas for NTTs. An array of size `omegas_count`. The ith element is equal to `1.nth_root(2**(2**(omegas_count-i)))`.
-    - **inv** - an array of inverses of powers of two in a field. Ith element is equal to `(2 ** (i+1)) ** -1`.
- - **G1 generators points** - affine coordinates of the generator point.
- - **G2 generators points** - affine coordinates of the extension generator. Remove these if `G2` is not supported.
- - **Weierstrass b value** - base field element equal to value of `b` in the curve equation.
- - **Weierstrass b value G2** - base field element equal to value of `b` for the extension. Remove this if `G2` is not supported.
- 
- :::note
-
- All the params are not in Montgomery form.
- 
- :::
- 
- :::note
-
- To convert number values into `storage` type you can use the following python function
-
-```python
-import struct
-
-def unpack(x, field_size):
-    return ', '.join(["0x" + format(x, '08x') for x in struct.unpack('I' * (field_size) // 4, int(x).to_bytes(field_size, 'little'))])
-```
-
-:::
-
-We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29), we need to add a new curve id.
-
-```
-...
-
-#define BN254     1
-#define BLS12_381 2
-#define BLS12_377 3
-#define BW6_761   4
-#define GRUMPKIN  5
-#define <curve_name> 6
-
-...
-```
-
-Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/icicle/curves/curve_config.cuh#L16-L29) accordingly.
-
-Finally we must modify the [`make` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.
-
-```
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin;<curve_name>)
-```
-
-### Adding Poseidon support
-
-If you want your curve to implement a Poseidon hash function or a tree builder, you will need to pre-calculate its optimized parameters.  
-Copy [constants_template.h](https://github.com/ingonyama-zk/icicle/blob/main/icicle/appUtils/poseidon/constants/constants_template.h) into `icicle/appUtils/poseidon/constants/<CURVE>_poseidon.h`. Run the [constants generation script](https://dev.ingonyama.com/icicle/primitives/poseidon#constants). The script will print the number of partial rounds and generate a `constants.bin` file. Use `xxd -i constants.bin` to parse the file into C declarations. Copy the `unsigned char constants_bin[]` contents inside your new file. Repeat this process for arities 2, 4, 8 and 11.
-
-After you've generated the constants, add your curve in this [SUPPORTED_CURVES_WITH_POSEIDON](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L72) in the `CMakeLists.txt`.
-
-## Bindings
-
-In order to support a new curve in the binding libraries you first must support it in ICICLE core.
-
-### Rust
-
-Go to [rust curves folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves) and copy `icicle-curve-template` to a new folder named `icicle-<curve_name>`.
-
-Find all the occurrences of `<CURVE>` placeholder inside the crate. (You can use `Ctrl+Shift+F` in VS Code or `grep -nr "<CURVE>"` in bash). You will then need to replace each occurrence with your new curve name.
-
-#### Limbs
-
-Go to your curve's `curve.rs` file and set `SCALAR_LIMBS`, `BASE_LIMBS` and `G2_BASE_LIMBS` (if G2 is needed) to a minimum number of `u64` required to store a single scalar field / base field element respectively.  
-e.g. for bn254, scalar field is 254 bit so `SCALAR_LIMBS` is set to 4.
-
-#### Primitives
-
-If your curve doesn't support some of the primitives (ntt/msm/poseidon/merkle tree/), or you simply don't want to include it, just remove a corresponding module from `src` and then from `lib.rs`
-
-#### G2
-
-If your curve doesn't support G2 - remove all the code under `#[cfg(feature = "g2")]` and remove the feature from [Cargo.toml](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml#L29) and [build.rs](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs#L15).
-
-After this is done, add your new crate in the [global Cargo.toml](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/Cargo.toml).
-
-### Golang
-
-Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.
--- a/docs/docs/introduction.md
+++ b/docs/docs/introduction.md
@@ -11,7 +11,7 @@ Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge
 Currently our flagship products are:

 - **ICICLE**:
-  [ICICLE](https://github.com/ingonyama-zk/icicle) is a fully featured GPU accelerated cryptography library for building ZK provers. ICICLE allows you to accelerate your ZK existing protocols in a matter of hours or implement your protocol from scratch on GPU.
+  [ICICLE](https://github.com/ingonyama-zk/icicle) is a fully featured GPU accelerated cryptography library for building ZK provers. ICICLE allows you to accelerate your existing ZK protocols in a matter of hours or implement your protocol from scratch on GPU.

 ---

@@ -39,7 +39,7 @@ Learn more about ICICLE and GPUs [here][ICICLE-OVERVIEW].

 ## Get in Touch

-If you have any questions, ideas, or are thinking of building something in this space join the discussion on [Discord]. You can explore our code on [github](https://github.com/ingonyama-zk) or read some of [our research papers](https://github.com/ingonyama-zk/papers).
+If you have any questions, ideas, or are thinking of building something in this space, join the discussion on [Discord]. You can explore our code on [github](https://github.com/ingonyama-zk) or read some of [our research papers](https://github.com/ingonyama-zk/papers).

 Follow us on [Twitter](https://x.com/Ingo_zk) and [YouTube](https://www.youtube.com/@ingo_ZK) and sign up for our [mailing list](https://wkf.ms/3LKCbdj) to get our latest announcements.

--- a/docs/package-lock.json
+++ b/docs/package-lock.json
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -21,65 +21,8 @@ module.exports = {
        },
        {
          type: "doc",
-          label: "ICICLE Provers",
-          id: "icicle/integrations"
-        },
-        {
-          type: "category",
-          label: "Golang bindings",
-          link: {
-            type: `doc`,
-            id: "icicle/golang-bindings",
-          },
-          collapsed: true,
-          items: [
-            {
-              type: "doc",
-              label: "MSM",
-              id: "icicle/golang-bindings/msm",
-            },
-            {
-              type: "doc",
-              label: "NTT",
-              id: "icicle/golang-bindings/ntt",
-            },
-            {
-              type: "doc",
-              label: "Vector operations",
-              id: "icicle/golang-bindings/vec-ops",
-            },
-          ]
-        },
-        {
-          type: "category",
-          label: "Rust bindings",
-          link: {
-            type: `doc`,
-            id: "icicle/rust-bindings",
-          },
-          collapsed: true,
-          items: [
-            {
-              type: "doc",
-              label: "MSM",
-              id: "icicle/rust-bindings/msm",
-            },
-            {
-              type: "doc",
-              label: "NTT",
-              id: "icicle/rust-bindings/ntt",
-            },
-            {
-              type: "doc",
-              label: "Vector operations",
-              id: "icicle/rust-bindings/vec-ops",
-            },
-            {
-              type: "doc",
-              label: "Multi GPU Support",
-              id: "icicle/rust-bindings/multi-gpu",
-            },
-          ],
+          label: "ICICLE Core",
+          id: "icicle/core",
        },
        {
          type: "category",
@@ -107,21 +50,125 @@ module.exports = {
            },
          ],
        },
+        {
+          type: "doc",
+          label: "Polynomials",
+          id: "icicle/polynomials/overview",
+        },
        {
          type: "doc",
          label: "Multi GPU Support",
          id: "icicle/multi-gpu",
        },
        {
-          type: "doc",
-          label: "Supporting additional curves",
-          id: "icicle/supporting-additional-curves",
+          type: "category",
+          label: "Golang bindings",
+          link: {
+            type: `doc`,
+            id: "icicle/golang-bindings",
+          },
+          collapsed: true,
+          items: [
+            {
+              type: "category",
+              label: "MSM",
+              link: {
+                type: `doc`,
+                id: "icicle/golang-bindings/msm",
+              },
+              collapsed: true,
+              items: [
+                {
+                  type: "doc",
+                  label: "MSM pre computation",
+                  id: "icicle/golang-bindings/msm-pre-computation",
+                }
+              ]
+            },
+            {
+              type: "doc",
+              label: "NTT",
+              id: "icicle/golang-bindings/ntt",
+            },
+            {
+              type: "doc",
+              label: "EC-NTT",
+              id: "icicle/golang-bindings/ecntt",
+            },
+            {
+              type: "doc",
+              label: "Vector operations",
+              id: "icicle/golang-bindings/vec-ops",
+            },
+            {
+              type: "doc",
+              label: "Multi GPU Support",
+              id: "icicle/golang-bindings/multi-gpu",
+            },
+          ]
+        },
+        {
+          type: "category",
+          label: "Rust bindings",
+          link: {
+            type: `doc`,
+            id: "icicle/rust-bindings",
+          },
+          collapsed: true,
+          items: [
+            {
+              type: "category",
+              label: "MSM",
+              link: {
+                type: `doc`,
+                id: "icicle/rust-bindings/msm",
+              },
+              collapsed: true,
+              items: [
+                {
+                  type: "doc",
+                  label: "MSM pre computation",
+                  id: "icicle/rust-bindings/msm-pre-computation",
+                }
+              ]
+            },
+            {
+              type: "doc",
+              label: "NTT",
+              id: "icicle/rust-bindings/ntt",
+            },
+            {
+              type: "doc",
+              label: "EC-NTT",
+              id: "icicle/rust-bindings/ecntt",
+            },
+            {
+              type: "doc",
+              label: "Vector operations",
+              id: "icicle/rust-bindings/vec-ops",
+            },
+            {
+              type: "doc",
+              label: "Multi GPU Support",
+              id: "icicle/rust-bindings/multi-gpu",
+            },
+            {
+              type: "doc",
+              label: "Polynomials",
+              id: "icicle/rust-bindings/polynomials",
+            },
+          ],
        },
        {
          type: "doc",
          label: "Google Colab Instructions",
          id: "icicle/colab-instructions",
        },
+        {
+          type: "doc",
+          label: "ICICLE Provers",
+          id: "icicle/integrations"
+        },
      ]
    },
    {
@@ -143,6 +190,7 @@ module.exports = {
      type: "category",
      label: "Additional Resources",
      collapsed: false,
+      collapsible: false,
      items: [
        {
          type: "link",
--- a/examples/c++/msm/CMakeLists.txt
+++ b/examples/c++/msm/CMakeLists.txt
@@ -8,18 +8,16 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
 else()
    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
 endif ()
-project(icicle LANGUAGES CUDA CXX)
+project(example LANGUAGES CUDA CXX)

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
+
 add_executable(
  example
  example.cu
 )
-
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/msm/compile.sh
+++ b/examples/c++/msm/compile.sh
@@ -3,7 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -2,11 +2,8 @@
 #include <iostream>
 #include <iomanip>

-#define G2_DEFINED
-#define CURVE_ID 1
-// include MSM template
-#include "appUtils/msm/msm.cu"
-using namespace curve_config;
+#include "api/bn254.h"
+using namespace bn254;

 int main(int argc, char* argv[])
 {
@@ -24,11 +21,10 @@ int main(int argc, char* argv[])
  scalar_t* scalars = new scalar_t[N];
  affine_t* points = new affine_t[N];
  projective_t result;
-  scalar_t::RandHostMany(scalars, N);
-  projective_t::RandHostManyAffine(points, N);
+  scalar_t::rand_host_many(scalars, N);
+  projective_t::rand_host_many_affine(points, N);

  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
-  // auto config = msm::DefaultMSMConfig();
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  msm::MSMConfig config = {
    ctx,   // ctx
@@ -49,28 +45,9 @@ int main(int argc, char* argv[])
  config.batch_size = batch_size;
  
  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
-  // Create two events to time the MSM kernel
  cudaStream_t stream = config.ctx.stream;
-  cudaEvent_t start, stop;
-  float time;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  // Record the start event on the stream
-  cudaEventRecord(start, stream);
  // Execute the MSM kernel
-  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, &result);
-  // Record the stop event on the stream
-  cudaEventRecord(stop, stream);
-  // Wait for the stop event to complete
-  cudaEventSynchronize(stop);
-  // Calculate the elapsed time between the start and stop events
-  cudaEventElapsedTime(&time, start, stop);
-  // Destroy the events
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  // Print the elapsed time
-  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
-  // Print the result
+  bn254_msm_cuda(scalars, points, msm_size, config, &result);
  std::cout << projective_t::to_affine(result) << std::endl;

  std::cout << "Copying inputs on-device" << std::endl;
@@ -89,24 +66,9 @@ int main(int argc, char* argv[])
  config.are_points_on_device = true;

  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
-  // Create two events to time the MSM kernel
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  // Record the start event on the stream
-  cudaEventRecord(start, stream);
  // Execute the MSM kernel
-  msm::MSM<scalar_t, affine_t, projective_t>(scalars_d, points_d, msm_size, config, result_d);
-  // Record the stop event on the stream
-  cudaEventRecord(stop, stream);
-  // Wait for the stop event to complete
-  cudaEventSynchronize(stop);
-  // Calculate the elapsed time between the start and stop events
-  cudaEventElapsedTime(&time, start, stop);
-  // Destroy the events
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  // Print the elapsed time
-  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
+
  // Copy the result back to the host
  cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
  // Print the result
@@ -123,23 +85,14 @@ int main(int argc, char* argv[])
  std::cout << "Generating random inputs on-host" << std::endl;
  // use the same scalars
  g2_affine_t* g2_points = new g2_affine_t[N];
-  g2_projective_t::RandHostManyAffine(g2_points, N);
+  g2_projective_t::rand_host_many_affine(g2_points, N);

  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
  config.are_results_on_device = false;
  config.are_scalars_on_device = false;
  config.are_points_on_device = false;
  g2_projective_t g2_result;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, stream);
-  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, g2_points, msm_size, config, &g2_result);
-  cudaEventRecord(stop, stream);
-  cudaEventSynchronize(stop);
-  cudaEventElapsedTime(&time, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;

  std::cout << "Copying inputs on-device" << std::endl;
@@ -157,16 +110,7 @@ int main(int argc, char* argv[])
  config.are_points_on_device = true;

  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, stream);
-  msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars_d, g2_points_d, msm_size, config, g2_result_d);
-  cudaEventRecord(stop, stream);
-  cudaEventSynchronize(stop);
-  cudaEventElapsedTime(&time, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
+  bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;

--- a/examples/c++/msm/run.sh
+++ b/examples/c++/msm/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/c++/multi-gpu-poseidon/CMakeLists.txt
+++ b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
@@ -14,11 +14,13 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
 # change the path to your Icicle location
-include_directories("../../../icicle")
 add_executable(
  example
  example.cu
 )
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
 find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/multi-gpu-poseidon/compile.sh
+++ b/examples/c++/multi-gpu-poseidon/compile.sh
@@ -3,7 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -1,16 +1,13 @@
 #include <iostream>
 #include <thread>
 #include <chrono>
-
 #include <nvml.h>

-// select the curve
-#define CURVE_ID 2
-#include "appUtils/poseidon/poseidon.cu"
-#include "utils/error_handler.cuh"
+#include "api/bn254.h"
+#include "gpu-utils/error_handler.cuh"

 using namespace poseidon;
-using namespace curve_config;
+using namespace bn254;

 void checkCudaError(cudaError_t error) {
    if (error != cudaSuccess) {
@@ -39,7 +36,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition,
        false, // loop_state
        false, // is_async
        };
-    cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
+    cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
    checkCudaError(err);
 }

@@ -109,13 +106,13 @@ int main() {
    CHECK_ALLOC(column_hash1);

    PoseidonConstants<scalar_t> column_constants0, column_constants1;
-    init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
    if (err_result != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
        return; 
    }
-    init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
+    bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);

    std::cout << "Parallel execution of Poseidon threads" << std::endl;
    START_TIMER(parallel);
--- a/examples/c++/multi-gpu-poseidon/run.sh
+++ b/examples/c++/multi-gpu-poseidon/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/c++/multiply/CMakeLists.txt
+++ b/examples/c++/multiply/CMakeLists.txt
@@ -8,17 +8,17 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
 else()
    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
 endif ()
-project(icicle LANGUAGES CUDA CXX)
+project(example LANGUAGES CUDA CXX)

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
 add_executable(
  example
  example.cu
 )
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
 find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/multiply/compile.sh
+++ b/examples/c++/multiply/compile.sh
@@ -3,7 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -3,22 +3,21 @@
 #include <chrono>
 #include <nvml.h>

-#define CURVE_ID 1
-#include "curves/curve_config.cuh"
-#include "utils/device_context.cuh"
-#include "utils/vec_ops.cu"
+#include "api/bn254.h"
+#include "vec_ops/vec_ops.cuh"

-using namespace curve_config;
+using namespace vec_ops;
+using namespace bn254;

 typedef scalar_t T;

 int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
 {
-  vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
+  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
  config.is_a_on_device = true;
  config.is_b_on_device = true;
  config.is_result_on_device = true;
-  cudaError_t err =  vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
+  cudaError_t err =  bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
@@ -63,8 +62,8 @@ int main(int argc, char** argv)
  T* host_in1 = (T*)malloc(vector_size * sizeof(T));
  T* host_in2 = (T*)malloc(vector_size * sizeof(T));
  std::cout << "Initializing vectors with random data" << std::endl;
-  T::RandHostMany(host_in1, vector_size);
-  T::RandHostMany(host_in2, vector_size);
+  T::rand_host_many(host_in1, vector_size);
+  T::rand_host_many(host_in2, vector_size);
  // device data
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  T* device_in1;
--- a/examples/c++/multiply/run.sh
+++ b/examples/c++/multiply/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/c++/ntt/CMakeLists.txt
+++ b/examples/c++/ntt/CMakeLists.txt
@@ -8,19 +8,16 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
 else()
    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
 endif ()
-project(icicle LANGUAGES CUDA CXX)
+project(example LANGUAGES CUDA CXX)

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
+
 add_executable(
  example
  example.cu
 )
-
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/ntt/compile.sh
+++ b/examples/c++/ntt/compile.sh
@@ -3,9 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle

+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle

+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/ntt/example.cu
+++ b/examples/c++/ntt/example.cu
@@ -1,12 +1,11 @@
 #include <chrono>
 #include <iostream>

-// select the curve
-#define CURVE_ID 1
 // include NTT template
-#include "appUtils/ntt/ntt.cu"
-#include "appUtils/ntt/kernel_ntt.cu"
-using namespace curve_config;
+
+#include "curves/params/bn254.cuh"
+#include "api/bn254.h"
+using namespace bn254;
 using namespace ntt;

 // Operate on scalars
@@ -86,14 +85,14 @@ int main(int argc, char* argv[])
  std::cout << "Running NTT with on-host data" << std::endl;
  // Create a device context
  auto ctx = device_context::get_default_device_context();
-  const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
-  InitDomain(basic_root, ctx);
+  S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
+  bn254_initialize_domain(&basic_root, ctx, true);
  // Create an NTTConfig instance
-  NTTConfig<S> config = DefaultNTTConfig<S>();
+  NTTConfig<S> config = default_ntt_config<S>();
  config.ntt_algorithm = NttAlgorithm::MixedRadix; 
  config.batch_size = nof_ntts;
  START_TIMER(MixedRadix);
-  cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
+  cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
  END_TIMER(MixedRadix, "MixedRadix NTT");
  
  std::cout << "Validating output" << std::endl;
@@ -101,7 +100,7 @@ int main(int argc, char* argv[])

  config.ntt_algorithm = NttAlgorithm::Radix2; 
  START_TIMER(Radix2);
-  err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
+  err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
  END_TIMER(Radix2, "Radix2 NTT");

  std::cout << "Validating output" << std::endl;
--- a/examples/c++/ntt/run.sh
+++ b/examples/c++/ntt/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/c++/pedersen-commitment/CMakeLists.txt
+++ b/examples/c++/pedersen-commitment/CMakeLists.txt
@@ -8,18 +8,19 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
 else()
    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
 endif ()
-project(icicle LANGUAGES CUDA CXX)
+project(example LANGUAGES CUDA CXX)

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
 add_executable(
  example
  example.cu
 )
+
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
 find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
--- a/examples/c++/pedersen-commitment/compile.sh
+++ b/examples/c++/pedersen-commitment/compile.sh
@@ -3,7 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/pedersen-commitment/example.cu
+++ b/examples/c++/pedersen-commitment/example.cu
@@ -4,9 +4,9 @@
 #include <cassert>
 #include <nvml.h>

-#define CURVE_ID BN254
-#include "appUtils/msm/msm.cu"
-using namespace curve_config;
+#include "api/bn254.h"
+#include "msm/msm.cuh"
+using namespace bn254;

 typedef point_field_t T;

@@ -88,7 +88,7 @@ void point_near_x(T x, affine_t *point) {
 }

 static int seed = 0;
-static HOST_INLINE T rand_host_seed()
+static T rand_host_seed()
  {
    std::mt19937_64 generator(seed++);
    std::uniform_int_distribution<unsigned> distribution;
@@ -138,15 +138,15 @@ int main(int argc, char** argv)
  std::cout << "Generating commitment vector" << std::endl;
  projective_t result;
  scalar_t* scalars = new scalar_t[N+1];
-  scalar_t::RandHostMany(scalars, N);
+  scalar_t::rand_host_many(scalars, N);

  std::cout << "Generating salt" << std::endl;
  scalars[N] = scalar_t::rand_host();

  std::cout << "Executing MSM" << std::endl;
-  auto config = msm::DefaultMSMConfig<scalar_t>();
+  auto config = msm::default_msm_config();
  START_TIMER(msm);
-  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, N+1, config, &result);
+  bn254_msm_cuda(scalars, points, N+1, config, &result);
  END_TIMER(msm, "Time to execute MSM");

  std::cout << "Computed commitment: " << result << std::endl;
--- a/examples/c++/pedersen-commitment/run.sh
+++ b/examples/c++/pedersen-commitment/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/c++/polynomial-api/CMakeLists.txt
+++ b/examples/c++/polynomial-api/CMakeLists.txt
@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.18) # build script for the polynomial-api example executable
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # older CMake: architecture must be supplied via CUDA_ARCH
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(example LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -DCURVE_ID=BN254")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+
+add_executable(
+  example
+  example.cu
+)
+
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_include_directories(example PRIVATE "../../../icicle/include")
+
+# can link to another curve/field by changing the following lib and FIELD_ID
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a) # static lib built by compile.sh into build/icicle
+target_compile_definitions(example PUBLIC FIELD_ID=BN254) # NAME=VALUE form; "FIELD_ID BN254" would define two separate macros
--- a/examples/c++/polynomial-api/README.md
+++ b/examples/c++/polynomial-api/README.md
@@ -0,0 +1,49 @@
+# ICICLE examples: computations with polynomials
+
+## Best-Practices
+
+We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save you time and mental energy.
+
+## Key-Takeaway
+
+Polynomials are crucial for Zero-Knowledge Proofs (ZKPs): they enable efficient representation and verification of computational statements, facilitate privacy-preserving protocols, and support complex mathematical operations essential for constructing and verifying proofs without revealing underlying data. The Polynomial API is documented [here](https://dev.ingonyama.com/icicle/polynomials/overview).
+
+## Running the example
+
+To run the example, from the project root directory:
+
+```sh
+cd examples/c++/polynomial-api
+./compile.sh
+./run.sh
+```
+
+To change the scalar field, modify `compile.sh` to build the corresponding lib and `CMakeLists.txt` to link to that lib and set `FIELD_ID` correspondingly.
+
+## What's in the examples
+
+- `example_evaluate`: Make a polynomial from coefficients and evaluate it at a random point.
+
+- `example_clone`: Make a separate copy of a polynomial.
+
+- `example_from_rou`: Reconstruct polynomial from values at the roots of unity. This operation is a cornerstone in the efficient implementation of zero-knowledge proofs, particularly in the areas of proof construction, verification, and polynomial arithmetic. By leveraging the algebraic structure and computational properties of roots of unity, ZKP protocols can achieve the scalability, efficiency, and privacy necessary for practical applications in blockchain, secure computation, and beyond.
+
+- `example_addition`, `example_addition_inplace`: Different flavors of polynomial addition.
+
+- `example_multiplication`: A product of two polynomials.
+
+- `example_multiplicationScalar`: A product of scalar and a polynomial.
+
+- `example_monomials`: Add/subtract a monomial to/from a polynomial. A monomial is a single term, which is the product of a constant coefficient and a variable raised to a non-negative integer power.
+
+- `example_ReadCoeffsToHost`: Download coefficients of a polynomial to a host. `ICICLE` keeps all polynomials on GPU, for on-host operation one needs such an operation.
+
+- `example_divisionSmall`, `example_divisionLarge`: Different flavors of division.
+
+- `example_divideByVanishingPolynomial`: A vanishing polynomial over a set S is a polynomial that evaluates to zero for every element in S. For a simple case, consider the set S={a}, a single element. The polynomial f(x)=x−a vanishes over S because f(a)=0. Mathematically, dividing a polynomial P(x) by a vanishing polynomial V(x) typically involves finding another polynomial Q(x) and possibly a remainder R(x) such that P(x)=Q(x)V(x)+R(x), where R(x) has a lower degree than V(x). In many cryptographic applications, the focus is on ensuring that P(x) is exactly divisible by V(x), meaning R(x)=0.
+
+- `example_EvenOdd`: even (odd) methods keep even (odd) coefficients of the original polynomial. For $f(x) = 1+2x+3x^2+4x^3$, even polynomial is $1+3x$, odd polynomial is $2+4x$.
+
+- `example_Slice`: extends even/odd methods and keeps coefficients for a given offset and stride. For $f(x) = 1+2x+3x^2+4x^3$, origin 0 stride 3 slice gives $1+4x$.
+
+- `example_DeviceMemoryView`: device-memory views of polynomials allow passing polynomials to other GPU functions. In this example the coefficients of a polynomial are committed to a Merkle tree, bypassing the host.
--- a/examples/c++/polynomial-api/compile.sh
+++ b/examples/c++/polynomial-api/compile.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/polynomial-api/example.cu
+++ b/examples/c++/polynomial-api/example.cu
@@ -0,0 +1,333 @@
+#include <iostream>
+
+#include "polynomials/polynomials.h"
+#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
+#include "ntt/ntt.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+// using namespace field_config;
+using namespace polynomials;
+using namespace merkle;
+
+// polynomial type used by all examples below (coefficients are scalar_t)
+typedef Polynomial<scalar_t> Polynomial_t;
+
+// small field constants shared by the examples
+const auto zero = scalar_t::zero();
+const auto one = scalar_t::one();
+const auto two = scalar_t::from(2);
+const auto three = scalar_t::from(3);
+const auto four = scalar_t::from(4);
+const auto five = scalar_t::from(5);
+const auto minus_one = zero - one; // additive inverse of one, computed as 0 - 1
+
+// Builds f(x) = 1 + 2x + 3x^2 and prints its value at a random field element.
+void example_evaluate()
+{
+  std::cout << std::endl << "Example: Polynomial evaluation on random value" << std::endl;
+  const scalar_t coefficients[3] = {one, two, three}; // 1+2x+3x^2
+  auto poly = Polynomial_t::from_coefficients(coefficients, 3);
+  std::cout << "f = " << poly << std::endl;
+  // random evaluation point
+  const scalar_t point = scalar_t::rand_host();
+  std::cout << "x = " << point << std::endl;
+  const auto value = poly(point);
+  std::cout << "f(x) = " << value << std::endl;
+}
+
+// Builds a random polynomial f with `size` coefficients, evaluates it at the
+// 2^ceil(log2(size)) successive powers of the root of unity omega, reconstructs a
+// polynomial fr from those evaluations and prints the degree of f - fr.
+void example_from_rou(const int size)
+{
+  std::cout << std::endl << "Example: Reconstruct polynomial from values at roots of unity" << std::endl;
+  // number of evaluation points: size rounded up to a power of two
+  const int log_size = (int)ceil(log2(size));
+  const int nof_evals = 1 << log_size;
+  auto coeff = std::make_unique<scalar_t[]>(size);
+  for (int i = 0; i < size; i++)
+    coeff[i] = scalar_t::rand_host();
+  auto f = Polynomial_t::from_coefficients(coeff.get(), size);
+  // rou: root of unity
+  auto omega = scalar_t::omega(log_size);
+  // nof_evals is only known at run time, so the buffer lives on the heap; a stack
+  // array of that size would be a non-standard variable-length array
+  auto evals = std::make_unique<scalar_t[]>(nof_evals);
+  auto x = scalar_t::one();
+  for (int i = 0; i < nof_evals; ++i) {
+    evals[i] = f(x); // value of f at omega^i
+    x = x * omega;
+  }
+  // reconstruct f from evaluations
+  auto fr = Polynomial_t::from_rou_evaluations(evals.get(), nof_evals);
+  // check for equality f-fr==0
+  auto h = f - fr;
+  std::cout << "degree of f - fr = " << h.degree() << std::endl;
+}
+
+// Returns a polynomial built from `size` random coefficients generated on the host.
+static Polynomial_t randomize_polynomial(uint32_t size)
+{
+  auto coeff = std::make_unique<scalar_t[]>(size);
+  // index has the same unsigned type as size to avoid a signed/unsigned comparison
+  for (uint32_t i = 0; i < size; i++)
+    coeff[i] = scalar_t::rand_host();
+  return Polynomial_t::from_coefficients(coeff.get(), size);
+}
+
+// Returns the polynomial whose i-th coefficient is i+1, i.e. 1 + 2x + 3x^2 + ...
+static Polynomial_t incremental_values(uint32_t size)
+{
+  auto coeff = std::make_unique<scalar_t[]>(size);
+  // index has the same unsigned type as size to avoid a signed/unsigned comparison
+  for (uint32_t i = 0; i < size; i++) {
+    // first coefficient is one, every following one is its predecessor plus one
+    coeff[i] = i ? coeff[i - 1] + scalar_t::one() : scalar_t::one();
+  }
+  return Polynomial_t::from_coefficients(coeff.get(), size);
+}
+
+// Returns true when both polynomials have the same degree and identical coefficients.
+// A polynomial of degree d has d+1 coefficients (indices 0..d), and all of them are
+// compared, including the constant term and the leading coefficient.
+static bool is_equal(Polynomial_t& lhs, Polynomial_t& rhs)
+{
+  const int deg_lhs = lhs.degree();
+  const int deg_rhs = rhs.degree();
+  if (deg_lhs != deg_rhs) { return false; }
+  // NOTE(review): a negative degree presumably denotes the zero polynomial; there is
+  // nothing to copy in that case and equal degrees already imply equality — confirm
+  if (deg_lhs < 0) { return true; }
+  const int nof_coeffs = deg_lhs + 1;
+  auto lhs_coeffs = std::make_unique<scalar_t[]>(nof_coeffs);
+  auto rhs_coeffs = std::make_unique<scalar_t[]>(nof_coeffs);
+  // copy_coeffs takes an inclusive [start, end] index range
+  lhs.copy_coeffs(lhs_coeffs.get(), 0, deg_lhs);
+  rhs.copy_coeffs(rhs_coeffs.get(), 0, deg_rhs);
+  return memcmp(lhs_coeffs.get(), rhs_coeffs.get(), nof_coeffs * sizeof(scalar_t)) == 0;
+}
+
+// Compares f(x) + g(x) with (f + g)(x) for two random polynomials at a random point.
+void example_addition(const int size0, const int size1)
+{
+  std::cout << std::endl << "Example: Polynomial addition" << std::endl;
+  auto lhs = randomize_polynomial(size0);
+  auto rhs = randomize_polynomial(size1);
+  const auto point = scalar_t::rand_host();
+  // evaluate each operand, then add the two field elements
+  const auto lhs_value = lhs(point);
+  const auto rhs_value = rhs(point);
+  const auto sum_of_values = lhs_value + rhs_value;
+  // add the polynomials, then evaluate the sum
+  auto sum = lhs + rhs;
+  const auto value_of_sum = sum(point);
+  std::cout << "evaluate and add: " << sum_of_values << std::endl;
+  std::cout << "add and evaluate: " << value_of_sum << std::endl;
+}
+
+// Same comparison as example_addition, but the sum is accumulated into f with +=.
+void example_addition_inplace(const int size0, const int size1)
+{
+  std::cout << std::endl << "Example: Polynomial inplace addition" << std::endl;
+  auto accumulator = randomize_polynomial(size0);
+  auto addend = randomize_polynomial(size1);
+
+  const auto point = scalar_t::rand_host();
+  // both evaluations are taken before the accumulator is modified
+  const auto accumulator_value = accumulator(point);
+  const auto addend_value = addend(point);
+  const auto sum_of_values = accumulator_value + addend_value;
+  accumulator += addend;
+  const auto value_of_sum = accumulator(point);
+  std::cout << "evaluate and add: " << sum_of_values << std::endl;
+  std::cout << "add and evaluate: " << value_of_sum << std::endl;
+}
+
+// Compares f(x) * g(x) with (f * g)(x) for random polynomials of 2^log0 and 2^log1 coefficients.
+void example_multiplication(const int log0, const int log1)
+{
+  std::cout << std::endl << "Example: Polynomial multiplication" << std::endl;
+  const int nof_coeffs_f = 1 << log0;
+  const int nof_coeffs_g = 1 << log1;
+  auto f = randomize_polynomial(nof_coeffs_f);
+  auto g = randomize_polynomial(nof_coeffs_g);
+  const scalar_t point = scalar_t::rand_host();
+  // evaluate each factor, then multiply the two field elements
+  const auto f_value = f(point);
+  const auto g_value = g(point);
+  const auto product_of_values = f_value * g_value;
+  // multiply the polynomials, then evaluate the product
+  auto product = f * g;
+  const auto value_of_product = product(point);
+  std::cout << "evaluate and multiply: " << product_of_values << std::endl;
+  std::cout << "multiply and evaluate: " << value_of_product << std::endl;
+}
+
+// Compares (2*f)(x) with 2*f(x) for a random polynomial of 2^log0 coefficients.
+void example_multiplicationScalar(const int log0)
+{
+  std::cout << std::endl << "Example: Scalar by Polynomial multiplication" << std::endl;
+  auto f = randomize_polynomial(1 << log0);
+  const auto factor = scalar_t::from(2);
+  // scale the polynomial first
+  auto scaled = factor * f;
+  const auto point = scalar_t::rand_host();
+  // scale the evaluation of the unscaled polynomial
+  const auto f_value = f(point);
+  const auto scaled_value_expected = factor * f_value;
+  const auto scaled_value = scaled(point);
+  std::cout << "Compare (2*f)(x) and 2*f(x): " << std::endl;
+  std::cout << scaled_value << std::endl;
+  std::cout << scaled_value_expected << std::endl;
+}
+
+// Adds the monomial 3x to f(x) = 1 + 2x^2 in place and checks the result at x = 3.
+void example_monomials()
+{
+  std::cout << std::endl << "Example: Monomials" << std::endl;
+  const scalar_t coefficients[3] = {one, zero, two}; // 1+2x^2
+  auto poly = Polynomial_t::from_coefficients(coefficients, 3);
+  const auto point = three;
+  // value before the monomial is added, used to build the expected result
+  const auto value_before = poly(point);
+  poly.add_monomial_inplace(three, 1); // add 3x
+  const auto expected_value = value_before + three * point;
+  const auto computed_value = poly(point);
+  std::cout << "Computed f'(x) = " << computed_value << std::endl;
+  std::cout << "Expected f'(x) = " << expected_value << std::endl;
+}
+
+// Reads the coefficients of h = f + g back to the host, first one at a time with
+// get_coeff() and then as a range with copy_coeffs().
+void example_ReadCoeffsToHost()
+{
+  std::cout << std::endl << "Example: Read coefficients to host" << std::endl;
+  const scalar_t coeffs_f[3] = {zero, one, two}; // 0+1x+2x^2
+  auto f = Polynomial_t::from_coefficients(coeffs_f, 3);
+  const scalar_t coeffs_g[3] = {one, one, one}; // 1+x+x^2
+  auto g = Polynomial_t::from_coefficients(coeffs_g, 3);
+  auto h = f + g; // 1+2x+3x^2
+  std::cout << "Get one coefficient of h() at a time: " << std::endl;
+  const auto h0 = h.get_coeff(0);
+  const auto h1 = h.get_coeff(1);
+  const auto h2 = h.get_coeff(2);
+  std::cout << "Coefficients of h: " << std::endl;
+  std::cout << "0:" << h0 << " expected: " << one << std::endl;
+  std::cout << "1:" << h1 << " expected: " << two << std::endl;
+  std::cout << "2:" << h2 << " expected: " << three << std::endl;
+  std::cout << "Get all coefficients of h() at a time: " << std::endl;
+
+  scalar_t h_coeffs[3] = {0};
+  // fetch the coefficients for a given range (indices 0..2, both inclusive)
+  auto nof_coeffs = h.copy_coeffs(h_coeffs, 0, 2);
+  // fixed size: sizing this array by the runtime value nof_coeffs would make it a
+  // non-standard variable-length array, which may not carry an initializer
+  const scalar_t expected_h_coeffs[3] = {one, two, three};
+  for (int i = 0; i < nof_coeffs; ++i) {
+    std::cout << i << ":" << h_coeffs[i] << " expected: " << expected_h_coeffs[i] << std::endl;
+  }
+}
+
+// Divides a(x) = 3x^3+4x^2+5 by b(x) = x^2-1 and prints quotient and remainder
+// coefficients next to their expected values (q = 3x+4, r = 3x+9).
+void example_divisionSmall()
+{
+  std::cout << std::endl << "Example: Polynomial division (small)" << std::endl;
+  const scalar_t coeffs_a[4] = {five, zero, four, three}; // 3x^3+4x^2+5
+  const scalar_t coeffs_b[3] = {minus_one, zero, one};    // x^2-1
+  auto a = Polynomial_t::from_coefficients(coeffs_a, 4);
+  auto b = Polynomial_t::from_coefficients(coeffs_b, 3);
+  auto [q, r] = a.divide(b);
+  scalar_t q_coeffs[2] = {0}; // 3x+4
+  scalar_t r_coeffs[2] = {0}; // 3x+9
+  // copy coefficients 0..1 of each result to the host; the returned counts are not needed
+  q.copy_coeffs(q_coeffs, 0, 1);
+  r.copy_coeffs(r_coeffs, 0, 1);
+  std::cout << "Quotient: 0:" << q_coeffs[0] << " expected: " << scalar_t::from(4) << std::endl;
+  std::cout << "Quotient: 1:" << q_coeffs[1] << " expected: " << scalar_t::from(3) << std::endl;
+  std::cout << "Remainder: 0:" << r_coeffs[0] << " expected: " << scalar_t::from(9) << std::endl;
+  std::cout << "Remainder: 1:" << r_coeffs[1] << " expected: " << scalar_t::from(3) << std::endl;
+}
+
+// Divides a random polynomial of 2^log0 coefficients by one of 2^log1 coefficients
+// and checks a(x) == b(x)*q(x)+r(x) at a random point.
+void example_divisionLarge(const int log0, const int log1)
+{
+  std::cout << std::endl << "Example: Polynomial division (large)" << std::endl;
+  const int nof_coeffs_a = 1 << log0;
+  const int nof_coeffs_b = 1 << log1;
+  auto dividend = randomize_polynomial(nof_coeffs_a);
+  auto divisor = randomize_polynomial(nof_coeffs_b);
+  auto [quotient, remainder] = dividend.divide(divisor);
+  const scalar_t point = scalar_t::rand_host();
+  const auto dividend_value = dividend(point);
+  const auto divisor_value = divisor(point);
+  const auto quotient_value = quotient(point);
+  const auto remainder_value = remainder(point);
+  // both sides of the division identity, evaluated at the same point
+  const auto rhs_value = divisor_value * quotient_value + remainder_value;
+  std::cout << "a(x) == b(x)*q(x)+r(x)" << std::endl;
+  std::cout << "lhs = " << dividend_value << std::endl;
+  std::cout << "rhs = " << rhs_value << std::endl;
+}
+
+// Multiplies h by the vanishing polynomial x^4-1 and recovers h again, once with the
+// generic divide() and once with divide_by_vanishing_polynomial().
+void example_divideByVanishingPolynomial()
+{
+  std::cout << std::endl << "Example: Polynomial division by vanishing polynomial" << std::endl;
+  // x^4-1 vanishes on 4th roots of unity
+  const scalar_t vanishing_coeffs[5] = {minus_one, zero, zero, zero, one};
+  auto vanishing = Polynomial_t::from_coefficients(vanishing_coeffs, 5);
+  auto h = incremental_values(1 << 11);
+  auto product = h * vanishing;
+
+  // generic division; the remainder is not inspected
+  auto [quotient, remainder] = product.divide(vanishing);
+  std::cout << "h_div == h: " << is_equal(quotient, h) << std::endl;
+
+  // dedicated division by the vanishing polynomial, called with argument 4
+  auto quotient_by_vanishing = product.divide_by_vanishing_polynomial(4);
+  std::cout << "h_div_by_vanishing == h: " << is_equal(quotient_by_vanishing, h) << std::endl;
+}
+
+// Clones a random polynomial, doubles the clone in place and clones the result again.
+void example_clone(const int log0)
+{
+  std::cout << std::endl << "Example: clone polynomial" << std::endl;
+  auto original = randomize_polynomial(1 << log0);
+  const auto point = scalar_t::rand_host();
+  const auto original_value = original(point);
+  // default-construct, then assign a clone
+  Polynomial_t doubled;
+  doubled = original.clone();
+  doubled += original; // doubled = 2 * original
+  auto doubled_copy = doubled.clone();
+  std::cout << "g(x) = " << doubled(point) << " expected: " << two * original_value << std::endl;
+  std::cout << "h(x) = " << doubled_copy(point) << " expected: " << doubled(point) << std::endl;
+}
+
+// Splits f(x) = 1+2x+3x^2+4x^3 into its even-indexed and odd-indexed coefficients.
+void example_EvenOdd()
+{
+  std::cout << std::endl << "Example: Split into even and odd powers " << std::endl;
+  const scalar_t coefficients[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
+  auto poly = Polynomial_t::from_coefficients(coefficients, 4);
+  auto even_part = poly.even();
+  auto odd_part = poly.odd();
+  scalar_t even_host[2] = {0};
+  scalar_t odd_host[2] = {0};
+  // copy coefficients 0..1 of each half to the host
+  even_part.copy_coeffs(even_host, 0, 1);
+  odd_part.copy_coeffs(odd_host, 0, 1);
+  std::cout << "Even: 0:" << even_host[0] << " expected: " << one << std::endl;
+  std::cout << "Even: 1:" << even_host[1] << " expected: " << three << std::endl;
+  std::cout << "Odd: 0:" << odd_host[0] << " expected: " << two << std::endl;
+  std::cout << "Odd: 1:" << odd_host[1] << " expected: " << four << std::endl;
+}
+
+// Takes the slice of f(x) = 1+2x+3x^2+4x^3 with offset 0, stride 3 and size 2,
+// i.e. the coefficients at indices 0 and 3, giving 1+4x.
+void example_Slice()
+{
+  std::cout << std::endl << "Example: Slice polynomial " << std::endl;
+  const scalar_t coeffs[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
+  auto f = Polynomial_t::from_coefficients(coeffs, 4);
+  auto f_slice = f.slice(0 /*=offset*/, 3 /*=stride*/, 2 /*=size*/); // 1+4x
+  scalar_t slice_coeffs[2] = {0};
+  // copy coefficients 0..1 of the slice to the host; the returned count is not needed
+  f_slice.copy_coeffs(slice_coeffs, 0, 1);
+  std::cout << "Slice: 0:" << slice_coeffs[0] << " expected: " << one << std::endl;
+  std::cout << "Slice: 1:" << slice_coeffs[1] << " expected: " << four << std::endl;
+}
+
+// Commits the coefficients of a random polynomial to a Merkle tree, handing the
+// coefficient view straight to build_merkle_tree with are_inputs_on_device set.
+void example_DeviceMemoryView()
+{
+  const int log_size = 6;
+  auto f = randomize_polynomial(1 << log_size);
+  // only the coefficient pointer of the view is used below
+  auto [d_coeff, N, device_id] = f.get_coefficients_view();
+
+  // Poseidon constants used for the tree hash
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  PoseidonConstants<scalar_t> constants;
+  init_optimized_poseidon_constants<scalar_t>(2, ctx, &constants);
+
+  // tree configuration
+  TreeBuilderConfig config = default_merkle_config();
+  config.keep_rows = 0; // keep all rows
+  config.are_inputs_on_device = true;
+
+  uint32_t tree_height = log_size + 1;
+  // NOTE(review): with keep_rows = 0 ("keep all rows") a buffer of log_size - 1
+  // digests looks small for a tree of this height — confirm against build_merkle_tree
+  const size_t digests_len = log_size - 1;
+  scalar_t* digests = static_cast<scalar_t*>(malloc(sizeof(scalar_t) * digests_len));
+
+  // commit coefficients to Merkle tree
+  build_merkle_tree<scalar_t, (2+1)>(d_coeff.get(), digests, tree_height, constants, config);
+  std::cout << "Merkle tree root: " << digests[0] << std::endl;
+  free(digests);
+}
+
+// Entry point: sets up the NTT domain and the CUDA polynomial backend, then runs
+// every example in turn.
+int main(int argc, char** argv)
+{
+  // Initialize NTT. TODO: can we hide this in the library?
+  constexpr int max_ntt_log_size = 24;
+  auto ntt_config = ntt::default_ntt_config<scalar_t>();
+  const scalar_t domain_root = scalar_t::omega(max_ntt_log_size);
+  ntt::init_domain(domain_root, ntt_config.ctx);
+
+  // Virtual factory design pattern: register the CUDA backend as the polynomial factory
+  Polynomial_t::initialize(std::make_unique<CUDAPolynomialFactory<>>());
+
+  // construction and evaluation
+  example_evaluate();
+  example_clone(10);
+  example_from_rou(100);
+
+  // arithmetic
+  example_addition(12, 17);
+  example_addition_inplace(2, 2);
+  example_multiplication(15, 12);
+  example_multiplicationScalar(15);
+  example_monomials();
+
+  // host access to coefficients
+  example_ReadCoeffsToHost();
+
+  // division
+  example_divisionSmall();
+  example_divisionLarge(12, 2);
+  example_divideByVanishingPolynomial();
+
+  // slicing and device-memory views
+  example_EvenOdd();
+  example_Slice();
+  example_DeviceMemoryView();
+
+  return 0;
+}
--- a/examples/c++/polynomial-api/run.sh
+++ b/examples/c++/polynomial-api/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example/example # relative path: run from the directory that contains ./build
--- a/examples/c++/polynomial_multiplication/CMakeLists.txt
+++ b/examples/c++/polynomial_multiplication/CMakeLists.txt
@@ -8,7 +8,7 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
 else()
    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
 endif ()
-project(icicle LANGUAGES CUDA CXX)
+project(example LANGUAGES CUDA CXX)

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
@@ -20,7 +20,8 @@ add_executable(
  example.cu
 )

-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/polynomial_multiplication/compile.sh
+++ b/examples/c++/polynomial_multiplication/compile.sh
@@ -3,9 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle

+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle

+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -1,18 +1,14 @@
-#define CURVE_ID BLS12_381
-
 #include <chrono>
 #include <iostream>
 #include <vector>
-
-#include "curves/curve_config.cuh"
-#include "appUtils/ntt/ntt.cu"
-#include "appUtils/ntt/kernel_ntt.cu"
-#include "utils/vec_ops.cu"
-#include "utils/error_handler.cuh"
 #include <memory>

-typedef curve_config::scalar_t test_scalar;
-typedef curve_config::scalar_t test_data;
+#include "api/bn254.h"
+#include "gpu-utils/error_handler.cuh"
+
+using namespace bn254;
+typedef scalar_t test_scalar;
+typedef scalar_t test_data;

 void random_samples(test_data* res, uint32_t count)
 {
@@ -45,7 +41,7 @@ int main(int argc, char** argv)
  CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context

  // init domain
-  auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
+  auto ntt_config = ntt::default_ntt_config<test_scalar>();
  const bool is_radix2_alg = (argc > 1) ? atoi(argv[1]) : false;
  ntt_config.ntt_algorithm = is_radix2_alg ? ntt::NttAlgorithm::Radix2 : ntt::NttAlgorithm::MixedRadix;

@@ -55,8 +51,8 @@ int main(int argc, char** argv)
  CHK_IF_RETURN(cudaEventCreate(&start));
  CHK_IF_RETURN(cudaEventCreate(&stop));

-  const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
-  ntt::InitDomain(basic_root, ntt_config.ctx, true /*=fast_twidddles_mode*/);
+  test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
+  bn254_initialize_domain(&basic_root, ntt_config.ctx, true /*=fast_twidddles_mode*/);

  // (1) cpu allocation
  auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
@@ -79,26 +75,25 @@ int main(int argc, char** argv)
      ntt_config.are_inputs_on_device = false;
      ntt_config.are_outputs_on_device = true;
      ntt_config.ordering = ntt::Ordering::kNM;
-      CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
-      CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
+      CHK_IF_RETURN(bn254_ntt_cuda(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
+      CHK_IF_RETURN(bn254_ntt_cuda(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));

      // (4) multiply A,B
      CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
-      vec_ops::VecOpsConfig<test_data> config {
+      vec_ops::VecOpsConfig config{
        ntt_config.ctx,
        true,  // is_a_on_device
        true,  // is_b_on_device
        true,  // is_result_on_device
        false  // is_async
      };
-      CHK_IF_RETURN(
-        vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
+      CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));

      // (5) INTT (in place)
      ntt_config.are_inputs_on_device = true;
      ntt_config.are_outputs_on_device = true;
      ntt_config.ordering = ntt::Ordering::kMN;
-      CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
+      CHK_IF_RETURN(bn254_ntt_cuda(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));

      CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
      CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
@@ -117,6 +112,7 @@ int main(int argc, char** argv)
  benchmark(false); // warmup
  benchmark(true, 20);

+  bn254_release_domain(ntt_config.ctx);
  CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));

  return 0;
--- a/examples/c++/polynomial_multiplication/run.sh
+++ b/examples/c++/polynomial_multiplication/run.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-./build/example 1 # radix2
-./build/example 0 # mixed-radix
+./build/example/example 1 # radix2
+./build/example/example 0 # mixed-radix
--- a/examples/c++/poseidon/CMakeLists.txt
+++ b/examples/c++/poseidon/CMakeLists.txt
@@ -13,13 +13,11 @@ project(icicle LANGUAGES CUDA CXX)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS_RELEASE "")
 set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
+
 add_executable(
  example
  example.cu
 )
-
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/poseidon/compile.sh
+++ b/examples/c++/poseidon/compile.sh
@@ -3,7 +3,13 @@
 # Exit immediately on error
 set -e

-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
--- a/examples/c++/poseidon/example.cu
+++ b/examples/c++/poseidon/example.cu
@@ -2,14 +2,12 @@
 #include <fstream>
 #include <iostream>

-// select the curve
-#define CURVE_ID 2
-// include Poseidon template
-#include "appUtils/poseidon/poseidon.cu"
+#include "api/bn254.h"
+#include "curves/params/bn254.cuh"
 using namespace poseidon;
-using namespace curve_config;
+using namespace bn254;

-device_context::DeviceContext ctx= device_context::get_default_device_context();
+device_context::DeviceContext ctx = device_context::get_default_device_context();

 // location of a tree node in the array for a given level and offset
 inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }
@@ -21,8 +19,7 @@ void build_tree(
  for (uint32_t level = tree_height - 1; level > 0; level--) {
    const uint32_t next_level = level - 1;
    const uint32_t next_level_width = 1 << next_level;
-    poseidon_hash<scalar_t, 2+1>(
-      &tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, *constants, config);
+    bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
  }
 }

@@ -85,7 +82,7 @@ uint32_t validate_proof(
      hashes_in[1] = level_hash;
    }
    // next level hash
-    poseidon_hash<scalar_t, 2+1>(hashes_in, hash_out, 1, *constants, config);
+    bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
    level_hash = hash_out[0];
  }
  return proof_hash[0] == level_hash;
@@ -116,14 +113,14 @@ int main(int argc, char* argv[])
  }
  std::cout << "Hashing blocks into tree leaves..." << std::endl;
  PoseidonConstants<scalar_t> constants;
-  init_optimized_poseidon_constants<scalar_t>(data_arity, ctx, &constants);
-  PoseidonConfig config = default_poseidon_config<scalar_t>(data_arity+1); 
-  poseidon_hash<curve_config::scalar_t, data_arity+1>(data, &tree[tree_index(leaf_level, 0)], tree_width, constants, config);
+  bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
+  PoseidonConfig config = default_poseidon_config(data_arity+1); 
+  bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);

  std::cout << "3. Building Merkle tree" << std::endl;
  PoseidonConstants<scalar_t> tree_constants;
-  init_optimized_poseidon_constants<scalar_t>(tree_arity, ctx, &tree_constants);
-  PoseidonConfig tree_config = default_poseidon_config<scalar_t>(tree_arity+1);
+  bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
+  PoseidonConfig tree_config = default_poseidon_config(tree_arity+1);
  build_tree(tree_height, tree, &tree_constants, tree_config);

  std::cout << "4. Generate membership proof" << std::endl;
--- a/examples/c++/poseidon/run.sh
+++ b/examples/c++/poseidon/run.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./build/example
+./build/example/example
--- a/examples/rust/ntt/src/main.rs
+++ b/examples/rust/ntt/src/main.rs
@@ -9,7 +9,7 @@ use icicle_cuda_runtime::{
 };

 use icicle_core::{
-    ntt::{self, NTT},
+    ntt::{self, initialize_domain},
    traits::{FieldImpl, GenerateRandom},
 };

@@ -60,11 +60,11 @@ fn main() {
    )
    .unwrap();
    let ctx = DeviceContext::default();
-    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+    initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();

    println!("Configuring bn254 NTT...");
    let stream = CudaStream::create().unwrap();
-    let mut cfg = ntt::NTTConfig::default();
+    let mut cfg = ntt::NTTConfig::<'_, ScalarField>::default();
    cfg.ctx
        .stream = &stream;
    cfg.is_async = true;
@@ -76,11 +76,11 @@ fn main() {
    )
    .unwrap();
    // reusing ctx from above
-    BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+    initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();

    println!("Configuring bls12377 NTT...");
    let stream_bls12377 = CudaStream::create().unwrap();
-    let mut cfg_bls12377 = ntt::NTTConfig::default();
+    let mut cfg_bls12377 = ntt::NTTConfig::<'_, BLS12377ScalarField>::default();
    cfg_bls12377
        .ctx
        .stream = &stream_bls12377;
--- a/examples/rust/polynomials/Cargo.toml
+++ b/examples/rust/polynomials/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "polynomials"
+version = "1.2.0"
+edition = "2018"
+
+[dependencies]
+icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
+icicle-core = { path = "../../../wrappers/rust/icicle-core" }
+icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254" }
+icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" }
+clap = { version = "<=4.4.12", features = ["derive"] }
+
+[features]
+profile = []
--- a/examples/rust/polynomials/src/main.rs
+++ b/examples/rust/polynomials/src/main.rs
@@ -0,0 +1,101 @@
+use icicle_babybear::field::ScalarField as babybearScalar;
+use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
+use icicle_bn254::curve::ScalarField as bn254Scalar;
+use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
+
+use icicle_cuda_runtime::{
+    device_context::DeviceContext,
+    memory::{DeviceVec, HostSlice},
+};
+
+use icicle_core::{
+    ntt::{get_root_of_unity, initialize_domain},
+    polynomials::UnivariatePolynomial,
+    traits::{FieldImpl, GenerateRandom},
+};
+
+#[cfg(feature = "profile")]
+use std::time::Instant;
+
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// Size of NTT to run (20 for 2^20)
+    #[arg(short, long, default_value_t = 20)]
+    max_ntt_log_size: u8, // main() passes 1 << max_ntt_log_size to init() for the NTT domains
+    #[arg(short, long, default_value_t = 15)]
+    poly_log_size: u8, // main() uses 1 << poly_log_size as the size of the largest polynomials
+}
+
+/// Initializes an NTT domain for each field used by the example (polynomial
+/// operations rely on NTT) and then the CUDA polynomial backend for each field.
+fn init(max_ntt_size: u64) {
+    // bn254 scalar field domain
+    let bn254_root: bn254Scalar = get_root_of_unity(max_ntt_size);
+    let device_ctx = DeviceContext::default();
+    initialize_domain(bn254_root, &device_ctx, false /*=fast twiddles mode*/).unwrap();
+
+    // babybear field domain, on the same device context
+    let babybear_root: babybearScalar = get_root_of_unity(max_ntt_size);
+    initialize_domain(babybear_root, &device_ctx, false /*=fast twiddles mode*/).unwrap();
+
+    // the CUDA backend for polynomials must be initialized once per field
+    PolynomialBn254::init_cuda_backend();
+    PolynomialBabyBear::init_cuda_backend();
+}
+
+/// Builds a polynomial of type `P` from `size` random field elements, interpreted
+/// either as coefficients (`from_coeffs == true`) or as evaluations on roots of unity.
+fn randomize_poly<P>(size: usize, from_coeffs: bool) -> P
+where
+    P: UnivariatePolynomial,
+    P::Field: FieldImpl,
+    P::FieldConfig: GenerateRandom<P::Field>,
+{
+    let random_values = P::FieldConfig::generate_random(size);
+    let host_values = HostSlice::from_slice(&random_values);
+    if from_coeffs {
+        P::from_coeffs(host_values, size)
+    } else {
+        P::from_rou_evals(host_values, size)
+    }
+}
+
+/// Walks through the polynomial API: construction, arithmetic, division,
+/// evaluation (single point and domain) and even/odd slicing.
+fn main() {
+    let args = Args::parse();
+    init(1 << args.max_ntt_log_size);
+
+    // three random polynomials over the bn254 scalar field:
+    // f and g from random coefficients, h from random evaluations on roots of unity
+    let poly_size = 1 << args.poly_log_size;
+    let f = randomize_poly::<PolynomialBn254>(poly_size, true /*from random coeffs*/);
+    let g = randomize_poly::<PolynomialBn254>(poly_size / 2, true /*from random coeffs*/);
+    let h = randomize_poly::<PolynomialBn254>(poly_size / 4, false /*from random evaluations on rou*/);
+
+    // two random polynomials over the babybear field
+    let f_babybear = randomize_poly::<PolynomialBabyBear>(poly_size, true /*from random coeffs*/);
+    let g_babybear = randomize_poly::<PolynomialBabyBear>(poly_size / 2, true /*from random coeffs*/);
+
+    // arithmetic on bn254 polynomials
+    let f_plus_g = &f + &g;
+    let f_times_h = &f * &h;
+    // quotient and remainder with f_times_h(x) = quotient(x) * f_plus_g(x) + remainder(x)
+    let (quotient, remainder) = f_times_h.divide(&f_plus_g);
+
+    // arithmetic on babybear polynomials
+    let _product_babybear = &f_babybear * &g_babybear;
+
+    // degree query
+    let _remainder_degree = remainder.degree();
+
+    // evaluation at a single point
+    let five = bn254Scalar::from_u32(5);
+    let quotient_at_five = quotient.eval(&five);
+
+    // evaluation on a domain. Note: domain and image can be either Host or Device slice.
+    // Here the domain is on the host and the evaluations are written to the device.
+    let domain_points = [five, bn254Scalar::from_u32(30)];
+    let mut evals_on_device = DeviceVec::<bn254Scalar>::cuda_malloc(domain_points.len()).unwrap();
+    f_times_h.eval_on_domain(HostSlice::from_slice(&domain_points), &mut evals_on_device[..]);
+
+    // slicing into odd and even parts, then folding them with a scalar
+    let h_odd = h.odd();
+    let h_even = h.even();
+    let folded = &h_even + &(&h_odd * &quotient_at_five); // e(x) + o(x)*scalar
+
+    let _folded_coeff = folded.get_coeff(2); // coeff of x^2
+}
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/ingonyama-zk/icicle
+module github.com/ingonyama-zk/icicle/v2

 go 1.20

--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -1,152 +1,62 @@
 cmake_minimum_required(VERSION 3.18)

-# GoogleTest requires at least C++14
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-
-if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
-  message(WARNING "Note that PIC (position-independent code) is disabled.")
-else()
-  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-endif()
-
-# add the target cuda architectures
-# each additional architecture increases the compilation time and output file size
-if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
-  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
-  find_program(_nvidia_smi "nvidia-smi")
-
-  if(_nvidia_smi)
-    set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
-
-    # execute nvidia-smi -L to get a short list of GPUs available
-    exec_program(${_nvidia_smi_path} ARGS -L
-      OUTPUT_VARIABLE _nvidia_smi_out
-      RETURN_VALUE _nvidia_smi_ret)
-
-    # process the stdout of nvidia-smi
-    if(_nvidia_smi_ret EQUAL 0)
-      # convert string with newlines to list of strings
-      string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
-
-      foreach(_line ${_nvidia_smi_out})
-        if(_line MATCHES "^GPU [0-9]+:")
-          math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
-
-          # the UUID is not very useful for the user, remove it
-          string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
-
-          if(NOT _gpu_info STREQUAL "")
-            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
-          endif()
-        endif()
-      endforeach()
-
-      check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
-      set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
-    endif()
-  endif()
-
-  # ##
-  if(DETECT_GPU_COUNT GREATER 0)
-    set(CMAKE_CUDA_ARCHITECTURES native) # do native
-  else()
-    # no GPUs found, like on Github CI runners
-    set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
-  endif()
-endif()
-
 project(icicle LANGUAGES CUDA CXX)

-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-include_directories("${CMAKE_SOURCE_DIR}")
+include(cmake/Common.cmake)
+include(cmake/FieldsCommon.cmake)
+include(cmake/CurvesCommon.cmake)

+set_env()
+set_gpu_env()

-# when adding a new curve/field, append its name to the end of this list
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-SET(SUPPORTED_CURVES_WITHOUT_NTT grumpkin)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

-set(IS_CURVE_SUPPORTED FALSE)
-set(I 0)
-foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
-  math(EXPR I "${I} + 1")
-  if (CURVE STREQUAL SUPPORTED_CURVE)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I}")
-    set(IS_CURVE_SUPPORTED TRUE)
-  endif ()
-endforeach()
+option(DEVMODE "Enable development mode" OFF)
+option(EXT_FIELD "Build extension field" OFF)
+option(G2 "Build G2" OFF)
+option(MSM "Build MSM" ON)
+option(ECNTT "Build ECNTT" OFF)
+option(BUILD_HASH "Build hash functions" OFF)
+option(BUILD_TESTS "Build unit tests" OFF)
+option(BUILD_BENCHMARKS "Build benchmarks" OFF)
+# add options here

-if (NOT IS_CURVE_SUPPORTED)
-  message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+if((DEFINED CURVE) AND (DEFINED FIELD))
+  message( FATAL_ERROR "CURVE and FIELD cannot be defined at the same time" )
 endif ()

-if (G2_DEFINED STREQUAL "ON")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
+if (DEVMODE)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O0 --ptxas-options=-O0 --ptxas-options=-allow-expensive-optimizations=false -DDEVMODE=ON")
 endif ()

-option(BUILD_TESTS "Build tests" OFF)
-
-if (NOT BUILD_TESTS)
-
-  message(STATUS "Building without tests.")
-
-  if (CURVE IN_LIST SUPPORTED_CURVES_WITH_POSEIDON)
-    list(APPEND ICICLE_SOURCES appUtils/poseidon/poseidon.cu)
-    list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
-  endif()
-
-  if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
-  endif()
-
-  add_library(
-    icicle
-    utils/vec_ops.cu
-    utils/mont.cu
-    primitives/field.cu
-    primitives/projective.cu
-    appUtils/msm/msm.cu
-    ${ICICLE_SOURCES}
-  )
-  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
-  target_compile_definitions(icicle PRIVATE CURVE=${CURVE})  
-
-else()
-
-  message(STATUS "Building tests.")
-
-  include(FetchContent)
-  FetchContent_Declare(
-    googletest
-    URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
-  )
-  # For Windows: Prevent overriding the parent project's compiler/linker settings
-
-  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-  FetchContent_MakeAvailable(googletest)
-
-  enable_testing()
-
-  add_executable(
-    runner
-    tests/runner.cu
-  )
-
-  target_link_libraries(
-    runner
-    GTest::gtest_main
-  )
-
-  include(GoogleTest)
-  set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-  gtest_discover_tests(runner)
-
+if(DEFINED FIELD)
+  check_field()
+  add_subdirectory(src/fields)
 endif ()
+
+if(DEFINED CURVE)
+  check_curve()
+  set(FIELD ${CURVE})
+  add_subdirectory(src/fields)
+  add_subdirectory(src/curves)
+endif ()
+
+if (G2)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
+endif ()
+
+if (EXT_FIELD)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
+endif ()
+
+if(BUILD_HASH)
+  add_subdirectory(src/hash)
+endif ()
+
+if (BUILD_TESTS)
+  add_subdirectory(tests)
+endif()
+
+if (BUILD_BENCHMARKS)
+  add_subdirectory(benchmarks)
+endif()
--- a/icicle/appUtils/msm/Makefile
+++ b/icicle/appUtils/msm/Makefile
@@ -1,4 +0,0 @@
-test_msm:
-	mkdir -p work
-	nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu
-	work/test_msm
--- a/icicle/appUtils/ntt/Makefile
+++ b/icicle/appUtils/ntt/Makefile
@@ -1,6 +0,0 @@
-build_verification:
-	mkdir -p work
-	nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17
-
-test_verification: build_verification
-	work/test_verification
--- a/icicle/appUtils/ntt/tests/verification.cu
+++ b/icicle/appUtils/ntt/tests/verification.cu
@@ -1,199 +0,0 @@
-
-#define CURVE_ID BLS12_381
-
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include <chrono>
-#include <iostream>
-#include <vector>
-
-#include "curves/curve_config.cuh"
-#include "ntt/ntt.cu"
-#include "ntt/ntt_impl.cuh"
-#include <memory>
-
-typedef curve_config::scalar_t test_scalar;
-typedef curve_config::scalar_t test_data;
-#include "kernel_ntt.cu"
-
-void random_samples(test_data* res, uint32_t count)
-{
-  for (int i = 0; i < count; i++)
-    res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
-}
-
-void incremental_values(test_scalar* res, uint32_t count)
-{
-  for (int i = 0; i < count; i++) {
-    res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
-  }
-}
-
-__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
-{
-  int tid = blockDim.x * blockIdx.x + threadIdx.x;
-  if (tid >= row_size * column_size) return;
-  out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
-}
-
-int main(int argc, char** argv)
-{
-  cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
-  float icicle_time, new_time;
-
-  int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
-  int NTT_SIZE = 1 << NTT_LOG_SIZE;
-  bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
-  int INV = (argc > 3) ? atoi(argv[3]) : false;
-  int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
-  bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
-  int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
-  const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
-  bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;
-
-  // Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
-  const char* ordering_str = ordering == ntt::Ordering::kNN   ? "NN"
-                             : ordering == ntt::Ordering::kNR ? "NR"
-                             : ordering == ntt::Ordering::kRN ? "RN"
-                             : ordering == ntt::Ordering::kRR ? "RR"
-                             : ordering == ntt::Ordering::kNM ? "NM"
-                                                              : "MN";
-
-  printf(
-    "running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
-    NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);
-
-  CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
-
-  // init domain
-  auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
-  ntt_config.ordering = ordering;
-  ntt_config.are_inputs_on_device = true;
-  ntt_config.are_outputs_on_device = true;
-  ntt_config.batch_size = BATCH_SIZE;
-  ntt_config.columns_batch = COLUMNS_BATCH;
-
-  CHK_IF_RETURN(cudaEventCreate(&icicle_start));
-  CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
-  CHK_IF_RETURN(cudaEventCreate(&new_start));
-  CHK_IF_RETURN(cudaEventCreate(&new_stop));
-
-  auto start = std::chrono::high_resolution_clock::now();
-  const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
-  ntt::InitDomain(basic_root, ntt_config.ctx, FAST_TW);
-  auto stop = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
-  std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;
-
-  // cpu allocation
-  auto CpuScalars = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
-  auto CpuOutputOld = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
-  auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
-
-  // gpu allocation
-  test_data *GpuScalars, *GpuOutputOld, *GpuOutputNew;
-  test_data* GpuScalarsTransposed;
-  CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
-  CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
-  CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
-  CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
-
-  // init inputs
-  // incremental_values(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
-  random_samples(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
-  CHK_IF_RETURN(
-    cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
-
-  if (COLUMNS_BATCH) {
-    transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
-      GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
-  }
-
-  // inplace
-  if (INPLACE) {
-    CHK_IF_RETURN(cudaMemcpy(
-      GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
-      cudaMemcpyDeviceToDevice));
-  }
-
-  for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
-    ntt_config.coset_gen = ntt_config.coset_gen * basic_root;
-  }
-
-  auto benchmark = [&](bool is_print, int iterations) -> cudaError_t {
-    // NEW
-    CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
-    ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
-    for (size_t i = 0; i < iterations; i++) {
-      CHK_IF_RETURN(ntt::NTT(
-        INPLACE         ? GpuOutputNew
-        : COLUMNS_BATCH ? GpuScalarsTransposed
-                        : GpuScalars,
-        NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
-    }
-    CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
-    CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
-    CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
-
-    // OLD
-    CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
-    ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
-    for (size_t i = 0; i < iterations; i++) {
-      CHK_IF_RETURN(
-        ntt::NTT(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
-    }
-    CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
-    CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
-    CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
-
-    if (is_print) {
-      printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
-      printf("New Runtime=%0.3f MS\n", new_time / iterations);
-    }
-
-    return CHK_LAST();
-  };
-
-  CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
-  int count = INPLACE ? 1 : 10;
-  if (INPLACE) {
-    CHK_IF_RETURN(cudaMemcpy(
-      GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
-      cudaMemcpyDeviceToDevice));
-  }
-  CHK_IF_RETURN(benchmark(true /*=print*/, count));
-
-  if (COLUMNS_BATCH) {
-    transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
-      GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
-    CHK_IF_RETURN(cudaMemcpy(
-      GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
-  }
-
-  // verify
-  CHK_IF_RETURN(
-    cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
-  CHK_IF_RETURN(
-    cudaMemcpy(CpuOutputOld.get(), GpuOutputOld, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
-
-  bool success = true;
-  for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
-    // if (i%64==0) printf("\n");
-    if (CpuOutputNew[i] != CpuOutputOld[i]) {
-      success = false;
-      // std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
-      // break;
-    } else {
-      // std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
-      // break;
-    }
-  }
-  const char* success_str = success ? "SUCCESS!" : "FAIL!";
-  printf("%s\n", success_str);
-
-  CHK_IF_RETURN(cudaFree(GpuScalars));
-  CHK_IF_RETURN(cudaFree(GpuOutputOld));
-  CHK_IF_RETURN(cudaFree(GpuOutputNew));
-
-  return CHK_LAST();
-}
--- a/icicle/appUtils/ntt/thread_ntt.cu
+++ b/icicle/appUtils/ntt/thread_ntt.cu
@@ -1,724 +0,0 @@
-#ifndef T_NTT
-#define T_NTT
-#pragma once
-
-#include <stdio.h>
-#include <stdint.h>
-#include "curves/curve_config.cuh"
-
-struct stage_metadata {
-  uint32_t th_stride;
-  uint32_t ntt_block_size;
-  uint32_t batch_id;
-  uint32_t ntt_block_id;
-  uint32_t ntt_inp_id;
-};
-
-#define STAGE_SIZES_DATA                                                                                               \
-  {                                                                                                                    \
-    {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
-      {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0},            \
-      {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
-      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6},            \
-      {6, 6, 6, 6, 6},                                                                                                 \
-  }
-uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
-__device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
-
-// construction for fast-twiddles
-uint32_t constexpr STAGE_PREV_SIZES[31] = {0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  5,  6,  6,  9,  9, 10,
-                                           11, 11, 12, 15, 15, 16, 16, 18, 18, 20, 21, 21, 22, 23, 24};
-
-#define STAGE_SIZES_DATA_FAST_TW                                                                                       \
-  {                                                                                                                    \
-    {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0},              \
-      {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0},            \
-      {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0},            \
-      {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0},            \
-      {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6},            \
-      {6, 6, 6, 6, 6},                                                                                                 \
-  }
-uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
-__device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
-
-template <typename E, typename S>
-class NTTEngine
-{
-public:
-  E X[8];
-  S WB[3];
-  S WI[7];
-  S WE[8];
-
-  __device__ __forceinline__ void loadBasicTwiddles(S* basic_twiddles)
-  {
-#pragma unroll
-    for (int i = 0; i < 3; i++) {
-      WB[i] = basic_twiddles[i];
-    }
-  }
-
-  __device__ __forceinline__ void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
-  {
-#pragma unroll
-    for (int i = 0; i < 3; i++) {
-      WB[i] = basic_twiddles[inv ? i + 3 : i];
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddles64(S* data, bool stride)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddles32(S* data, bool stride)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddles16(S* data, bool stride)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
-      WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
-      WI[i] = data[(inv && exp) ? 64 - exp : exp];
-    }
-  }
-
-  __device__ __forceinline__ void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
-  {
-#pragma unroll
-    for (int i = 0; i < 7; i++) {
-      uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
-      WI[i] = data[(inv && exp) ? 64 - exp : exp];
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
-  {
-    data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
-  {
-    data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
-  {
-    data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void loadExternalTwiddlesGeneric64(
-    S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
-  {
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
-                     << (tw_log_size - tw_log_order - 6);
-      WE[i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
-    }
-  }
-
-  __device__ __forceinline__ void loadExternalTwiddlesGeneric32(
-    S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
-  {
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
-                       << (tw_log_size - tw_log_order - 5);
-        WE[4 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void loadExternalTwiddlesGeneric16(
-    S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
-  {
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
-                       << (tw_log_size - tw_log_order - 4);
-        WE[2 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
-    }
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      X[i] = data[s_meta.th_stride * i * data_stride];
-    }
-  }
-
-  __device__ __forceinline__ void loadGlobalDataColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
-    }
-  }
-
-  __device__ __forceinline__ void
-  storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
-    }
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      data[s_meta.th_stride * i * data_stride] = X[i];
-    }
-  }
-
-  __device__ __forceinline__ void storeGlobalDataColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
-    }
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        X[4 * j + i] = data[(8 * i + j) * data_stride];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void loadGlobalData32ColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void
-  storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
-    }
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        data[(8 * i + j) * data_stride] = X[4 * j + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void storeGlobalData32ColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void
-  loadGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
-    }
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        X[2 * j + i] = data[(8 * i + j) * data_stride];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void loadGlobalData16ColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void
-  storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
-  {
-    if (strided) {
-      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
-              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
-    } else {
-      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
-    }
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        data[(8 * i + j) * data_stride] = X[2 * j + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void storeGlobalData16ColumnBatch(
-    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
-  {
-    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
-             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
-              batch_size +
-            s_meta.batch_id;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void ntt4_2()
-  {
-#pragma unroll
-    for (int i = 0; i < 2; i++) {
-      ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
-    }
-  }
-
-  __device__ __forceinline__ void ntt2_4()
-  {
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-      ntt2(X[2 * i], X[2 * i + 1]);
-    }
-  }
-
-  __device__ __forceinline__ void ntt2(E& X0, E& X1)
-  {
-    E T;
-
-    T = X0 + X1;
-    X1 = X0 - X1;
-    X0 = T;
-  }
-
-  __device__ __forceinline__ void ntt4(E& X0, E& X1, E& X2, E& X3)
-  {
-    E T;
-
-    T = X0 + X2;
-    X2 = X0 - X2;
-    X0 = X1 + X3;
-    X1 = X1 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
-
-    X1 = X1 * WB[0];
-
-    X3 = X2 - X1;
-    X1 = X2 + X1;
-    X2 = T - X0;
-    X0 = T + X0;
-  }
-
-  // rbo version
-  __device__ __forceinline__ void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
-  {
-    E T;
-
-    T = X0 - X1;
-    X0 = X0 + X1;
-    X1 = X2 + X3;
-    X3 = X2 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
-
-    X3 = X3 * WB[0];
-
-    X2 = X0 - X1;
-    X0 = X0 + X1;
-    X1 = T + X3;
-    X3 = T - X3;
-  }
-
-  __device__ __forceinline__ void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
-  {
-    E T;
-
-    // out of 56,623,104 possible mappings, we have:
-    T = X3 - X7;
-    X7 = X3 + X7;
-    X3 = X1 - X5;
-    X5 = X1 + X5;
-    X1 = X2 + X6;
-    X2 = X2 - X6;
-    X6 = X0 + X4;
-    X0 = X0 - X4;
-
-    T = T * WB[1];
-    X2 = X2 * WB[1];
-
-    X4 = X6 + X1;
-    X6 = X6 - X1;
-    X1 = X3 + T;
-    X3 = X3 - T;
-    T = X5 + X7;
-    X5 = X5 - X7;
-    X7 = X0 + X2;
-    X0 = X0 - X2;
-
-    X1 = X1 * WB[0];
-    X5 = X5 * WB[1];
-    X3 = X3 * WB[2];
-
-    X2 = X6 + X5;
-    X6 = X6 - X5;
-    X5 = X7 - X1;
-    X1 = X7 + X1;
-    X7 = X0 - X3;
-    X3 = X0 + X3;
-    X0 = X4 + T;
-    X4 = X4 - T;
-  }
-
-  __device__ __forceinline__ void ntt8win()
-  {
-    E T;
-
-    T = X[3] - X[7];
-    X[7] = X[3] + X[7];
-    X[3] = X[1] - X[5];
-    X[5] = X[1] + X[5];
-    X[1] = X[2] + X[6];
-    X[2] = X[2] - X[6];
-    X[6] = X[0] + X[4];
-    X[0] = X[0] - X[4];
-
-    X[2] = X[2] * WB[0];
-
-    X[4] = X[6] + X[1];
-    X[6] = X[6] - X[1];
-    X[1] = X[3] + T;
-    X[3] = X[3] - T;
-    T = X[5] + X[7];
-    X[5] = X[5] - X[7];
-    X[7] = X[0] + X[2];
-    X[0] = X[0] - X[2];
-
-    X[1] = X[1] * WB[1];
-    X[5] = X[5] * WB[0];
-    X[3] = X[3] * WB[2];
-
-    X[2] = X[6] + X[5];
-    X[6] = X[6] - X[5];
-
-    X[5] = X[1] + X[3];
-    X[3] = X[1] - X[3];
-
-    X[1] = X[7] + X[5];
-    X[5] = X[7] - X[5];
-    X[7] = X[0] - X[3];
-    X[3] = X[0] + X[3];
-    X[0] = X[4] + T;
-    X[4] = X[4] - T;
-  }
-
-  __device__ __forceinline__ void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
-    uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 64 + i * 8 + column_id];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
-    uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 64 + row_id * 8 + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
-    uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 32 + i * 4 + column_id];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
-    uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 32 + row_id * 8 + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
-    uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        if (store) {
-          shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
-        } else {
-          X[4 * j + i] = shmem[ntt_id * 32 + i * 8 + column_id + j];
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
-    uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 2; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 4; i++) {
-        if (store) {
-          shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
-        } else {
-          X[4 * j + i] = shmem[ntt_id * 32 + row_id * 4 + 4 * j + i];
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
-    uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 16 + i * 2 + column_id];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
-    uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
-
-#pragma unroll
-    for (uint32_t i = 0; i < 8; i++) {
-      if (store) {
-        shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
-      } else {
-        X[i] = shmem[ntt_id * 16 + row_id * 8 + i];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
-    uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        if (store) {
-          shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
-        } else {
-          X[2 * j + i] = shmem[ntt_id * 16 + i * 8 + column_id + j];
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
-  {
-    uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
-    uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
-
-#pragma unroll
-    for (uint32_t j = 0; j < 4; j++) {
-#pragma unroll
-      for (uint32_t i = 0; i < 2; i++) {
-        if (store) {
-          shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
-        } else {
-          X[2 * j + i] = shmem[ntt_id * 16 + row_id * 2 + 2 * j + i];
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void twiddlesInternal()
-  {
-#pragma unroll
-    for (int i = 1; i < 8; i++) {
-      X[i] = X[i] * WI[i - 1];
-    }
-  }
-
-  __device__ __forceinline__ void twiddlesExternal()
-  {
-#pragma unroll
-    for (int i = 0; i < 8; i++) {
-      X[i] = X[i] * WE[i];
-    }
-  }
-};
-
-#endif
--- a/icicle/appUtils/poseidon/kernels.cu
+++ b/icicle/appUtils/poseidon/kernels.cu
@@ -1,175 +0,0 @@
-#include "poseidon.cuh"
-
-namespace poseidon {
-  template <typename S, int T>
-  __global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
-  {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int state_number = idx / T;
-    if (state_number >= number_of_states) { return; }
-    int element_number = idx % T;
-
-    S prepared_element;
-
-    // Domain separation
-    if (element_number == 0) {
-      prepared_element = domain_tag;
-    } else {
-      if (aligned) {
-        prepared_element = states[idx];
-      } else {
-        prepared_element = states[idx - 1];
-      }
-    }
-
-    // We need __syncthreads here if the state is not aligned
-    // because then we need to shift the vector [A, B, 0] -> [D, A, B]
-    if (!aligned) { __syncthreads(); }
-
-    // Store element in state
-    states[idx] = prepared_element;
-  }
-
-  template <typename S>
-  __device__ __forceinline__ S sbox_alpha_five(S element)
-  {
-    S result = S::sqr(element);
-    result = S::sqr(result);
-    return result * element;
-  }
-
-  template <typename S, int T>
-  __device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, S* shared_states)
-  {
-    __syncthreads();
-    shared_states[threadIdx.x] = element;
-    __syncthreads();
-
-    typename S::Wide element_wide = S::mul_wide(shared_states[vec_number * T], matrix[element_number]);
-#pragma unroll
-    for (int i = 1; i < T; i++) {
-      element_wide = element_wide + S::mul_wide(shared_states[vec_number * T + i], matrix[i * T + element_number]);
-    }
-
-    return S::reduce(element_wide);
-  }
-
-  template <typename S, int T>
-  __device__ S full_round(
-    S element,
-    size_t rc_offset,
-    int local_state_number,
-    int element_number,
-    bool multiply_by_mds,
-    bool add_pre_round_constants,
-    bool skip_rc,
-    S* shared_states,
-    const PoseidonConstants<S>& constants)
-  {
-    if (add_pre_round_constants) {
-      element = element + constants.round_constants[rc_offset + element_number];
-      rc_offset += T;
-    }
-    element = sbox_alpha_five(element);
-    if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }
-
-    // Multiply all the states by mds matrix
-    S* matrix = multiply_by_mds ? constants.mds_matrix : constants.non_sparse_matrix;
-    return vecs_mul_matrix<S, T>(element, matrix, element_number, local_state_number, shared_states);
-  }
-
-  template <typename S, int T>
-  __global__ void full_rounds(
-    S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConstants<S> constants)
-  {
-    extern __shared__ S shared_states[];
-
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int state_number = idx / T;
-    if (state_number >= number_of_states) { return; }
-    int local_state_number = threadIdx.x / T;
-    int element_number = idx % T;
-
-    S new_el = states[idx];
-    bool add_pre_round_constants = first_half;
-    for (int i = 0; i < constants.full_rounds_half; i++) {
-      new_el = full_round<S, T>(
-        new_el, rc_offset, local_state_number, element_number, !first_half || (i < (constants.full_rounds_half - 1)),
-        add_pre_round_constants, !first_half && (i == constants.full_rounds_half - 1), shared_states, constants);
-      rc_offset += T;
-
-      if (add_pre_round_constants) {
-        rc_offset += T;
-        add_pre_round_constants = false;
-      }
-    }
-    states[idx] = new_el;
-  }
-
-  template <typename S, int T>
-  __device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
-  {
-    S element = state[0];
-    element = sbox_alpha_five(element);
-    element = element + constants.round_constants[rc_offset];
-
-    S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];
-
-    typename S::Wide state_0_wide = S::mul_wide(element, sparse_matrix[0]);
-
-#pragma unroll
-    for (int i = 1; i < T; i++) {
-      state_0_wide = state_0_wide + S::mul_wide(state[i], sparse_matrix[i]);
-    }
-
-    state[0] = S::reduce(state_0_wide);
-
-#pragma unroll
-    for (int i = 1; i < T; i++) {
-      state[i] = state[i] + (element * sparse_matrix[T + i - 1]);
-    }
-  }
-
-  template <typename S, int T>
-  __global__ void
-  partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConstants<S> constants)
-  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) { return; }
-
-    S state[T];
-#pragma unroll
-    for (int i = 0; i < T; i++) {
-      state[i] = states[idx * T + i];
-    }
-
-    for (int i = 0; i < constants.partial_rounds; i++) {
-      partial_round<S, T>(state, rc_offset, i, constants);
-      rc_offset++;
-    }
-
-#pragma unroll
-    for (int i = 0; i < T; i++) {
-      states[idx * T + i] = state[i];
-    }
-  }
-
-  // These function is just doing copy from the states to the output
-  template <typename S, int T>
-  __global__ void get_hash_results(S* states, size_t number_of_states, S* out)
-  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) { return; }
-
-    out[idx] = states[idx * T + 1];
-  }
-
-  template <typename S, int T>
-  __global__ void copy_recursive(S* state, size_t number_of_states, S* out)
-  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) { return; }
-
-    state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
-  }
-} // namespace poseidon
--- a/icicle/appUtils/tree/Makefile
+++ b/icicle/appUtils/tree/Makefile
@@ -1,3 +0,0 @@
-test_merkle:
-	nvcc -o test_merkle -I. -I../.. test.cu
-	./test_merkle
--- a/icicle/benchmarks/CMakeLists.txt
+++ b/icicle/benchmarks/CMakeLists.txt
@@ -0,0 +1,5 @@
+
+find_package(benchmark REQUIRED) # resolve Google Benchmark before the benchmark::benchmark target is referenced
+add_executable(benches benches.cu)
+target_link_libraries(benches benchmark::benchmark)
+target_include_directories(benches PUBLIC ${CMAKE_SOURCE_DIR}/include/)
--- a/icicle/benchmarks/README.md
+++ b/icicle/benchmarks/README.md
@@ -0,0 +1,25 @@
+# How to use benchmarks
+
+ICICLE uses [google benchmarks](https://github.com/google/benchmark) to measure the performance of primitives.
+
+To run benchmarks, make sure you have everything installed to run ICICLE (see top-level README for that). Next, you need to install google benchmarks library as described in their [documentation](https://github.com/google/benchmark?tab=readme-ov-file#installation). When running benchmarks, export the path to this installation:
+
+```
+export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:<path-to-google-benchmarks-build-folder>
+```
+
+Then to benchmark field arithmetic, say, on `babybear` field, run:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DFIELD=babybear -DEXT_FIELD=ON -S . -B build;
+cmake --build build;
+build/benches --benchmark_counters_tabular=true
+```
+
+`-U` parameters are needed to clear variables from previous runs and `EXT_FIELD` can be disabled if benchmarking the extension field is not needed. To benchmark a curve, say, `bn254`, change the first `cmake` call to:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DCURVE=bn254 -S . -B build;
+```
+
+Benchmarks measure throughput of very cheap operations like field multiplication or EC addition by repeating them very many times in parallel, so throughput is the main metric to look at.
--- a/icicle/benchmarks/benches.cu
+++ b/icicle/benchmarks/benches.cu
@@ -0,0 +1,6 @@
+#include "field_benchmarks.cu"
+#ifdef CURVE_ID
+#include "curve_benchmarks.cu"
+#endif
+
+BENCHMARK_MAIN();
--- a/icicle/benchmarks/curve_benchmarks.cu
+++ b/icicle/benchmarks/curve_benchmarks.cu
@@ -0,0 +1,79 @@
+#include <benchmark/benchmark.h>
+#include "utils/test_functions.cuh"
+#include "curves/curve_config.cuh"
+
+using namespace curve_config;
+using namespace benchmark;
+
+static void BM_MixedECAdd(State& state) // times vec_add on projective + affine device points using CUDA events
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  affine_t* points2;
+  assert(!cudaMalloc(&points1, n * sizeof(projective_t)));
+  assert(!cudaMalloc(&points2, n * sizeof(affine_t)));
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  affine_t* h_points2 = (affine_t*)malloc(n * sizeof(affine_t));
+  projective_t::rand_host_many(h_points1, n);
+  projective_t::rand_host_many_affine(h_points2, n);
+  cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+  cudaMemcpy(points2, h_points2, sizeof(affine_t) * n, cudaMemcpyHostToDevice);
+  free(h_points1); // host staging buffers are not used again after the copies above
+  free(h_points2);
+  cudaEvent_t start, stop; // one event pair, reused by every iteration and destroyed after the loop
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  for (auto _ : state) {
+    cudaEventRecord(start);
+    assert((vec_add<projective_t, affine_t, N>(points1, points2, points1, n)) == cudaSuccess);
+    cudaEventRecord(stop);
+    assert(cudaStreamSynchronize(0) == cudaSuccess); // sync after recording stop so its timestamp is ready
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+
+static void BM_FullECAdd(benchmark::State& state) // times vec_add on two projective device buffers using CUDA events
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  projective_t* points2;
+  assert(!cudaMalloc(&points1, n * sizeof(projective_t)));
+  assert(!cudaMalloc(&points2, n * sizeof(projective_t)));
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t* h_points2 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t::rand_host_many(h_points1, n);
+  projective_t::rand_host_many(h_points2, n);
+  cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+  cudaMemcpy(points2, h_points2, sizeof(projective_t) * n, cudaMemcpyHostToDevice);
+  free(h_points1); // host staging buffers are not used again after the copies above
+  free(h_points2);
+  cudaEvent_t start, stop; // one event pair, reused by every iteration and destroyed after the loop
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  for (auto _ : state) {
+    cudaEventRecord(start);
+    assert((vec_add<projective_t, projective_t, N>(points1, points2, points1, n)) == cudaSuccess);
+    cudaEventRecord(stop);
+    assert(cudaStreamSynchronize(0) == cudaSuccess); // sync after recording stop so its timestamp is ready
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+// UseManualTime(): report the CUDA-event durations passed to SetIterationTime() instead of host-measured time.
+BENCHMARK(BM_FullECAdd)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_MixedECAdd)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(benchmark::kMillisecond);
--- a/icicle/benchmarks/field_benchmarks.cu
+++ b/icicle/benchmarks/field_benchmarks.cu
@@ -0,0 +1,108 @@
+#include <benchmark/benchmark.h>
+#include "utils/test_functions.cuh"
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+using namespace benchmark;
+
+// Times vec_add on two device buffers filled by device_populate_random; each iteration is measured with CUDA events.
+template <class T>
+static void BM_FieldAdd(State& state)
+{
+  constexpr int N = 256;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  assert(!cudaMalloc(&scalars1, n * sizeof(T)));
+  assert(!cudaMalloc(&scalars2, n * sizeof(T)));
+  assert(device_populate_random<T>(scalars1, n) == cudaSuccess);
+  assert(device_populate_random<T>(scalars2, n) == cudaSuccess);
+
+  cudaEvent_t start, stop; // one event pair, reused by every iteration and destroyed after the loop
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  for (auto _ : state) {
+    cudaEventRecord(start);
+    assert((vec_add<T, T, N>(scalars1, scalars2, scalars1, n)) == cudaSuccess);
+    cudaEventRecord(stop);
+    assert(cudaStreamSynchronize(0) == cudaSuccess); // sync after recording stop so its timestamp is ready
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+// Times vec_mul on two device buffers filled by device_populate_random; each iteration is measured with CUDA events.
+template <class T>
+static void BM_FieldMul(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  assert(!cudaMalloc(&scalars1, n * sizeof(T)));
+  assert(!cudaMalloc(&scalars2, n * sizeof(T)));
+  assert(device_populate_random<T>(scalars1, n) == cudaSuccess);
+  assert(device_populate_random<T>(scalars2, n) == cudaSuccess);
+
+  cudaEvent_t start, stop; // one event pair, reused by every iteration and destroyed after the loop
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  for (auto _ : state) {
+    cudaEventRecord(start);
+    assert((vec_mul<T, T, N>(scalars1, scalars2, scalars1, n)) == cudaSuccess);
+    cudaEventRecord(stop);
+    assert(cudaStreamSynchronize(0) == cudaSuccess); // sync after recording stop so its timestamp is ready
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+// Times field_vec_sqr in place on one device buffer filled by device_populate_random, using CUDA events.
+template <class T>
+static void BM_FieldSqr(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars;
+  assert(!cudaMalloc(&scalars, n * sizeof(T)));
+  assert(device_populate_random<T>(scalars, n) == cudaSuccess);
+
+  cudaEvent_t start, stop; // one event pair, reused by every iteration and destroyed after the loop
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  for (auto _ : state) {
+    cudaEventRecord(start);
+    assert((field_vec_sqr<T, N>(scalars, scalars, n)) == cudaSuccess);
+    cudaEventRecord(stop);
+    assert(cudaStreamSynchronize(0) == cudaSuccess); // sync after recording stop so its timestamp is ready
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaFree(scalars);
+}
+
+// UseManualTime(): report the CUDA-event durations passed to SetIterationTime() instead of host-measured time.
+BENCHMARK(BM_FieldAdd<scalar_t>)->Range(1 << 28, 1 << 28)->UseManualTime()->Unit(kMicrosecond);
+BENCHMARK(BM_FieldMul<scalar_t>)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(kMicrosecond);
+BENCHMARK(BM_FieldSqr<scalar_t>)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(kMicrosecond);
+#ifdef EXT_FIELD
+BENCHMARK(BM_FieldAdd<extension_t>)->Range(1 << 28, 1 << 28)->UseManualTime()->Unit(kMicrosecond);
+BENCHMARK(BM_FieldMul<extension_t>)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(kMicrosecond);
+BENCHMARK(BM_FieldSqr<extension_t>)->Range(1 << 27, 1 << 27)->UseManualTime()->Unit(kMicrosecond);
+#endif
--- a/icicle/cmake/Common.cmake
+++ b/icicle/cmake/Common.cmake
@@ -0,0 +1,72 @@
+function(set_env)
+    # Require C++17 for C++ and CUDA in the caller's scope, and enable PIC unless ICICLE_PIC is "OFF".
+    set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+    set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+    if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
+        message(WARNING "Note that PIC (position-independent code) is disabled.")
+    else()
+        set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE) # a plain set() would be dropped when the function returns
+    endif()
+endfunction()
+
+function(set_gpu_env)
+    # Selects CMAKE_CUDA_ARCHITECTURES and the CUDA compile flags and exports them to the caller's scope.
+    # Each additional architecture increases the compilation time and output file size.
+    if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
+        set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
+    else()
+        find_program(_nvidia_smi "nvidia-smi")
+        if(_nvidia_smi)
+            set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
+
+            # Run `nvidia-smi -L` to get a short list of the available GPUs. The executable is the
+            # find_program() result; no `_nvidia_smi_path` variable is set anywhere in this function.
+            execute_process(COMMAND ${_nvidia_smi} -L
+                OUTPUT_VARIABLE _nvidia_smi_out
+                RESULT_VARIABLE _nvidia_smi_ret OUTPUT_STRIP_TRAILING_WHITESPACE)
+            # process the stdout of nvidia-smi
+            if(_nvidia_smi_ret EQUAL 0)
+                # convert string with newlines to list of strings
+                string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
+
+                foreach(_line ${_nvidia_smi_out})
+                    if(_line MATCHES "^GPU [0-9]+:")
+                        math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
+                        # the UUID is not very useful for the user, remove it
+                        string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
+                        if(NOT _gpu_info STREQUAL "")
+                            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
+                        endif()
+                    endif()
+                endforeach()
+
+                # check_num_gpu_info is not defined in this file; call it only if something else provides it
+                if(COMMAND check_num_gpu_info)
+                    check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
+                endif()
+                set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
+            endif()
+        endif()
+
+        if(DETECT_GPU_COUNT GREATER 0)
+            set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) # do native
+        else()
+            # no GPUs found, like on Github CI runners
+            set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
+        endif()
+    endif()
+
+    # Check CUDA version and, if possible, enable multi-threaded compilation
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
+        message(STATUS "Using multi-threaded CUDA compilation.")
+        # Update the function-local copy only; the single PARENT_SCOPE export below then carries both flags.
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0")
+    else()
+        message(STATUS "Can't use multi-threaded CUDA compilation.")
+    endif()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
+    set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
+    set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
+endfunction()
--- a/icicle/cmake/CurvesCommon.cmake
+++ b/icicle/cmake/CurvesCommon.cmake
@@ -0,0 +1,17 @@
+function(check_curve)
+  # Validates CURVE against the supported list and exports -DCURVE_ID / -DFIELD_ID to the caller's scope.
+  # Both ids are the 1-based position of the curve in SUPPORTED_CURVES.
+  set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
+
+  # Zero-based position of the requested curve, or -1 when it is not listed.
+  list(FIND SUPPORTED_CURVES "${CURVE}" CURVE_INDEX)
+
+  if (CURVE_INDEX EQUAL -1)
+    # Unknown or unset CURVE: stop configuring and show the accepted names.
+    message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+  else ()
+    math(EXPR CURVE_NUMBER "${CURVE_INDEX} + 1")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${CURVE_NUMBER} -DFIELD_ID=${CURVE_NUMBER}" PARENT_SCOPE)
+  endif ()
+
+endfunction()
--- a/icicle/cmake/FieldsCommon.cmake
+++ b/icicle/cmake/FieldsCommon.cmake
@@ -0,0 +1,17 @@
+function(check_field)
+  # Validates FIELD against the supported list and exports -DFIELD_ID to the caller's scope.
+  # Ids start at 1001 for the first entry of SUPPORTED_FIELDS and increase by one per entry.
+  set(SUPPORTED_FIELDS babybear;stark252)
+
+  # Zero-based position of the requested field, or -1 when it is not listed.
+  list(FIND SUPPORTED_FIELDS "${FIELD}" FIELD_INDEX)
+
+  if (FIELD_INDEX EQUAL -1)
+    # Unknown or unset FIELD: stop configuring and show the accepted names.
+    message( FATAL_ERROR "The value of FIELD variable: ${FIELD} is not one of the supported fields: ${SUPPORTED_FIELDS}" )
+  else ()
+    math(EXPR FIELD_NUMBER "${FIELD_INDEX} + 1001")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DFIELD_ID=${FIELD_NUMBER}" PARENT_SCOPE)
+  endif ()
+
+endfunction()
--- a/icicle/curves/curve_config.cuh
+++ b/icicle/curves/curve_config.cuh
@@ -1,94 +0,0 @@
-#pragma once
-#ifndef INDEX_H
-#define INDEX_H
-
-#define BN254     1
-#define BLS12_381 2
-#define BLS12_377 3
-#define BW6_761   4
-#define GRUMPKIN  5
-
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#if defined(G2_DEFINED)
-#include "primitives/extension_field.cuh"
-#endif
-
-#if CURVE_ID == BN254
-#include "bn254_params.cuh"
-using namespace bn254;
-#elif CURVE_ID == BLS12_381
-#include "bls12_381_params.cuh"
-using namespace bls12_381;
-#elif CURVE_ID == BLS12_377
-#include "bls12_377_params.cuh"
-using namespace bls12_377;
-#elif CURVE_ID == BW6_761
-#include "bls12_377_params.cuh"
-#include "bw6_761_params.cuh"
-using namespace bw6_761;
-#elif CURVE_ID == GRUMPKIN
-#include "grumpkin_params.cuh"
-using namespace grumpkin;
-#endif
-
-/**
- * @namespace curve_config
- * Namespace with type definitions for short Weierstrass pairing-friendly [elliptic
- * curves](https://hyperelliptic.org/EFD/g1p/auto-shortw.html). Here, concrete types are created in accordance
- * with the `-DCURVE` env variable passed during build.
- */
-namespace curve_config {
-
-#if CURVE_ID == BW6_761
-  typedef bls12_377::fq_config fp_config;
-#endif
-  /**
-   * Scalar field of the curve. Is always a prime field.
-   */
-  typedef Field<fp_config> scalar_t;
-  /**
-   * Base field of G1 curve. Is always a prime field.
-   */
-  typedef Field<fq_config> point_field_t;
-  static constexpr point_field_t generator_x = point_field_t{g1_gen_x};
-  static constexpr point_field_t generator_y = point_field_t{g1_gen_y};
-  static constexpr point_field_t b = point_field_t{weierstrass_b};
-  /**
-   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html)
-   * of G1 curve consisting of three coordinates of type [point_field_t](point_field_t).
-   */
-  typedef Projective<point_field_t, scalar_t, b, generator_x, generator_y> projective_t;
-  /**
-   * Affine representation of G1 curve consisting of two coordinates of type [point_field_t](point_field_t).
-   */
-  typedef Affine<point_field_t> affine_t;
-
-#if defined(G2_DEFINED)
-#if CURVE_ID == BW6_761
-  typedef point_field_t g2_point_field_t;
-  static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{g2_gen_x};
-  static constexpr g2_point_field_t g2_generator_y = g2_point_field_t{g2_gen_y};
-  static constexpr g2_point_field_t g2_b = g2_point_field_t{g2_weierstrass_b};
-#else
-  typedef ExtensionField<fq_config> g2_point_field_t;
-  static constexpr g2_point_field_t g2_generator_x =
-    g2_point_field_t{point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}};
-  static constexpr g2_point_field_t g2_generator_y =
-    g2_point_field_t{point_field_t{g2_gen_y_re}, point_field_t{g2_gen_y_im}};
-  static constexpr g2_point_field_t g2_b =
-    g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
-#endif
-  /**
-   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) of G2 curve.
-   */
-  typedef Projective<g2_point_field_t, scalar_t, g2_b, g2_generator_x, g2_generator_y> g2_projective_t;
-  /**
-   * Affine representation of G1 curve.
-   */
-  typedef Affine<g2_point_field_t> g2_affine_t;
-#endif
-
-} // namespace curve_config
-
-#endif
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -0,0 +1,73 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef BABYBEAR_API_H
+#define BABYBEAR_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "fields/stark_fields/babybear.cuh"
+#include "ntt/ntt.cuh"
+#include "vec_ops/vec_ops.cuh"
+
+extern "C" cudaError_t babybear_extension_ntt_cuda(
+  const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);
+
+extern "C" cudaError_t babybear_mul_cuda(
+  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
+
+extern "C" cudaError_t babybear_add_cuda(
+  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
+
+extern "C" cudaError_t babybear_sub_cuda(
+  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
+
+extern "C" cudaError_t babybear_transpose_matrix_cuda(
+  const babybear::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  babybear::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
+
+extern "C" cudaError_t babybear_scalar_convert_montgomery(
+  babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t babybear_initialize_domain(
+  babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t babybear_ntt_cuda(
+  const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::scalar_t* output);
+
+extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx);
+
+extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size);
+
+extern "C" cudaError_t babybear_extension_scalar_convert_montgomery(
+  babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t babybear_extension_mul_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_add_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_sub_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
+  const babybear::extension_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  babybear::extension_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+#endif
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -0,0 +1,132 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef BLS12_377_API_H
+#define BLS12_377_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "curves/params/bls12_377.cuh"
+#include "ntt/ntt.cuh"
+#include "msm/msm.cuh"
+#include "vec_ops/vec_ops.cuh"
+#include "poseidon/poseidon.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
+  bls12_377::g2_affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bls12_377::g2_affine_t* output_bases);
+
+extern "C" cudaError_t bls12_377_g2_msm_cuda(
+  const bls12_377::scalar_t* scalars, const bls12_377::g2_affine_t* points, int msm_size, msm::MSMConfig& config, bls12_377::g2_projective_t* out);
+
+extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
+  bls12_377::affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bls12_377::affine_t* output_bases);
+
+extern "C" cudaError_t bls12_377_msm_cuda(
+  const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_377::projective_t* out);
+
+extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2);
+
+extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out);
+
+extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size);
+
+extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery(
+  bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery(
+  bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_ecntt_cuda(
+  const bls12_377::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::projective_t* output);
+
+extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2);
+
+extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out);
+
+extern "C" void bls12_377_generate_projective_points(bls12_377::projective_t* points, int size);
+
+extern "C" void bls12_377_generate_affine_points(bls12_377::affine_t* points, int size);
+
+extern "C" cudaError_t bls12_377_affine_convert_montgomery(
+  bls12_377::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_projective_convert_montgomery(
+  bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(
+  int arity,
+  int full_rounds_half,
+  int partial_rounds,
+  const bls12_377::scalar_t* constants,
+  device_context::DeviceContext& ctx,
+  poseidon::PoseidonConstants<bls12_377::scalar_t>* poseidon_constants);
+
+extern "C" cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(
+  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_377::scalar_t>* constants);
+
+extern "C" cudaError_t bls12_377_poseidon_hash_cuda(
+  bls12_377::scalar_t* input,
+  bls12_377::scalar_t* output,
+  int number_of_states,
+  int arity,
+  const poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
+  poseidon::PoseidonConfig& config);
+
+extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree(
+  const bls12_377::scalar_t* leaves,
+  bls12_377::scalar_t* digests,
+  uint32_t height,
+  int arity,
+  poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
+  merkle::TreeBuilderConfig& config);
+
+extern "C" cudaError_t bls12_377_mul_cuda(
+  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
+
+extern "C" cudaError_t bls12_377_add_cuda(
+  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
+
+extern "C" cudaError_t bls12_377_sub_cuda(
+  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
+
+extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
+  const bls12_377::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  bls12_377::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
+
+extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
+  bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_initialize_domain(
+  bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t bls12_377_ntt_cuda(
+  const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::scalar_t* output);
+
+extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -0,0 +1,132 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef BLS12_381_API_H
+#define BLS12_381_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "curves/params/bls12_381.cuh"
+#include "ntt/ntt.cuh"
+#include "msm/msm.cuh"
+#include "vec_ops/vec_ops.cuh"
+#include "poseidon/poseidon.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
+  bls12_381::g2_affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bls12_381::g2_affine_t* output_bases);
+
+extern "C" cudaError_t bls12_381_g2_msm_cuda(
+  const bls12_381::scalar_t* scalars, const bls12_381::g2_affine_t* points, int msm_size, msm::MSMConfig& config, bls12_381::g2_projective_t* out);
+
+extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
+  bls12_381::affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bls12_381::affine_t* output_bases);
+
+extern "C" cudaError_t bls12_381_msm_cuda(
+  const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_381::projective_t* out);
+
+extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2);
+
+extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out);
+
+extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size);
+
+extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery(
+  bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery(
+  bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_ecntt_cuda(
+  const bls12_381::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::projective_t* output);
+
+extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2);
+
+extern "C" void bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out);
+
+extern "C" void bls12_381_generate_projective_points(bls12_381::projective_t* points, int size);
+
+extern "C" void bls12_381_generate_affine_points(bls12_381::affine_t* points, int size);
+
+extern "C" cudaError_t bls12_381_affine_convert_montgomery(
+  bls12_381::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_projective_convert_montgomery(
+  bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(
+  int arity,
+  int full_rounds_half,
+  int partial_rounds,
+  const bls12_381::scalar_t* constants,
+  device_context::DeviceContext& ctx,
+  poseidon::PoseidonConstants<bls12_381::scalar_t>* poseidon_constants);
+
+extern "C" cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(
+  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_381::scalar_t>* constants);
+
+extern "C" cudaError_t bls12_381_poseidon_hash_cuda(
+  bls12_381::scalar_t* input,
+  bls12_381::scalar_t* output,
+  int number_of_states,
+  int arity,
+  const poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
+  poseidon::PoseidonConfig& config);
+
+extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree(
+  const bls12_381::scalar_t* leaves,
+  bls12_381::scalar_t* digests,
+  uint32_t height,
+  int arity,
+  poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
+  merkle::TreeBuilderConfig& config);
+
+extern "C" cudaError_t bls12_381_mul_cuda(
+  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
+
+extern "C" cudaError_t bls12_381_add_cuda(
+  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
+
+extern "C" cudaError_t bls12_381_sub_cuda(
+  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
+
+extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
+  const bls12_381::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  bls12_381::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
+
+extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
+  bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_initialize_domain(
+  bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t bls12_381_ntt_cuda(
+  const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::scalar_t* output);
+
+extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -0,0 +1,132 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef BN254_API_H
+#define BN254_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "curves/params/bn254.cuh"
+#include "ntt/ntt.cuh"
+#include "msm/msm.cuh"
+#include "vec_ops/vec_ops.cuh"
+#include "poseidon/poseidon.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
+  bn254::g2_affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bn254::g2_affine_t* output_bases);
+
+extern "C" cudaError_t bn254_g2_msm_cuda(
+  const bn254::scalar_t* scalars, const bn254::g2_affine_t* points, int msm_size, msm::MSMConfig& config, bn254::g2_projective_t* out);
+
+extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
+  bn254::affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bn254::affine_t* output_bases);
+
+extern "C" cudaError_t bn254_msm_cuda(
+  const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, msm::MSMConfig& config, bn254::projective_t* out);
+
+extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2);
+
+extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out);
+
+extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size);
+
+extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bn254_g2_affine_convert_montgomery(
+  bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_g2_projective_convert_montgomery(
+  bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_ecntt_cuda(
+  const bn254::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::projective_t* output);
+
+extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2);
+
+extern "C" void bn254_to_affine(bn254::projective_t* point, bn254::affine_t* point_out);
+
+extern "C" void bn254_generate_projective_points(bn254::projective_t* points, int size);
+
+extern "C" void bn254_generate_affine_points(bn254::affine_t* points, int size);
+
+extern "C" cudaError_t bn254_affine_convert_montgomery(
+  bn254::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_projective_convert_montgomery(
+  bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
+  int arity,
+  int full_rounds_half,
+  int partial_rounds,
+  const bn254::scalar_t* constants,
+  device_context::DeviceContext& ctx,
+  poseidon::PoseidonConstants<bn254::scalar_t>* poseidon_constants);
+
+extern "C" cudaError_t bn254_init_optimized_poseidon_constants_cuda(
+  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bn254::scalar_t>* constants);
+
+extern "C" cudaError_t bn254_poseidon_hash_cuda(
+  bn254::scalar_t* input,
+  bn254::scalar_t* output,
+  int number_of_states,
+  int arity,
+  const poseidon::PoseidonConstants<bn254::scalar_t>& constants,
+  poseidon::PoseidonConfig& config);
+
+extern "C" cudaError_t bn254_build_poseidon_merkle_tree(
+  const bn254::scalar_t* leaves,
+  bn254::scalar_t* digests,
+  uint32_t height,
+  int arity,
+  poseidon::PoseidonConstants<bn254::scalar_t>& constants,
+  merkle::TreeBuilderConfig& config);
+
+extern "C" cudaError_t bn254_mul_cuda(
+  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
+
+extern "C" cudaError_t bn254_add_cuda(
+  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
+
+extern "C" cudaError_t bn254_sub_cuda(
+  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
+
+extern "C" cudaError_t bn254_transpose_matrix_cuda(
+  const bn254::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  bn254::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
+
+extern "C" cudaError_t bn254_scalar_convert_montgomery(
+  bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_initialize_domain(
+  bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t bn254_ntt_cuda(
+  const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::scalar_t* output);
+
+extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -0,0 +1,132 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef BW6_761_API_H
+#define BW6_761_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "curves/params/bw6_761.cuh"
+#include "ntt/ntt.cuh"
+#include "msm/msm.cuh"
+#include "vec_ops/vec_ops.cuh"
+#include "poseidon/poseidon.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
+  bw6_761::g2_affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bw6_761::g2_affine_t* output_bases);
+
+extern "C" cudaError_t bw6_761_g2_msm_cuda(
+  const bw6_761::scalar_t* scalars, const bw6_761::g2_affine_t* points, int msm_size, msm::MSMConfig& config, bw6_761::g2_projective_t* out);
+
+extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
+  bw6_761::affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  bw6_761::affine_t* output_bases);
+
+extern "C" cudaError_t bw6_761_msm_cuda(
+  const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, msm::MSMConfig& config, bw6_761::projective_t* out);
+
+extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2);
+
+extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out);
+
+extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size);
+
+extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery(
+  bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery(
+  bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_ecntt_cuda(
+  const bw6_761::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::projective_t* output);
+
+extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2);
+
+extern "C" void bw6_761_to_affine(bw6_761::projective_t* point, bw6_761::affine_t* point_out);
+
+extern "C" void bw6_761_generate_projective_points(bw6_761::projective_t* points, int size);
+
+extern "C" void bw6_761_generate_affine_points(bw6_761::affine_t* points, int size);
+
+extern "C" cudaError_t bw6_761_affine_convert_montgomery(
+  bw6_761::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_projective_convert_montgomery(
+  bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(
+  int arity,
+  int full_rounds_half,
+  int partial_rounds,
+  const bw6_761::scalar_t* constants,
+  device_context::DeviceContext& ctx,
+  poseidon::PoseidonConstants<bw6_761::scalar_t>* poseidon_constants);
+
+extern "C" cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(
+  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bw6_761::scalar_t>* constants);
+
+extern "C" cudaError_t bw6_761_poseidon_hash_cuda(
+  bw6_761::scalar_t* input,
+  bw6_761::scalar_t* output,
+  int number_of_states,
+  int arity,
+  const poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
+  poseidon::PoseidonConfig& config);
+
+extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree(
+  const bw6_761::scalar_t* leaves,
+  bw6_761::scalar_t* digests,
+  uint32_t height,
+  int arity,
+  poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
+  merkle::TreeBuilderConfig& config);
+
+extern "C" cudaError_t bw6_761_mul_cuda(
+  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
+
+extern "C" cudaError_t bw6_761_add_cuda(
+  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
+
+extern "C" cudaError_t bw6_761_sub_cuda(
+  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
+
+extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
+  const bw6_761::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  bw6_761::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
+
+extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
+  bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_initialize_domain(
+  bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t bw6_761_ntt_cuda(
+  const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::scalar_t* output);
+
+extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -0,0 +1,94 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef GRUMPKIN_API_H
+#define GRUMPKIN_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "curves/params/grumpkin.cuh"
+#include "msm/msm.cuh"
+#include "vec_ops/vec_ops.cuh"
+#include "poseidon/poseidon.cuh"
+#include "poseidon/tree/merkle.cuh"
+
+extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
+  grumpkin::affine_t* bases,
+  int bases_size,
+  int precompute_factor,
+  int _c,
+  bool are_bases_on_device,
+  device_context::DeviceContext& ctx,
+  grumpkin::affine_t* output_bases);
+
+extern "C" cudaError_t grumpkin_msm_cuda(
+  const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, msm::MSMConfig& config, grumpkin::projective_t* out);
+
+extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2);
+
+extern "C" void grumpkin_to_affine(grumpkin::projective_t* point, grumpkin::affine_t* point_out);
+
+extern "C" void grumpkin_generate_projective_points(grumpkin::projective_t* points, int size);
+
+extern "C" void grumpkin_generate_affine_points(grumpkin::affine_t* points, int size);
+
+extern "C" cudaError_t grumpkin_affine_convert_montgomery(
+  grumpkin::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t grumpkin_projective_convert_montgomery(
+  grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(
+  int arity,
+  int full_rounds_half,
+  int partial_rounds,
+  const grumpkin::scalar_t* constants,
+  device_context::DeviceContext& ctx,
+  poseidon::PoseidonConstants<grumpkin::scalar_t>* poseidon_constants);
+
+extern "C" cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(
+  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<grumpkin::scalar_t>* constants);
+
+extern "C" cudaError_t grumpkin_poseidon_hash_cuda(
+  grumpkin::scalar_t* input,
+  grumpkin::scalar_t* output,
+  int number_of_states,
+  int arity,
+  const poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
+  poseidon::PoseidonConfig& config);
+
+extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree(
+  const grumpkin::scalar_t* leaves,
+  grumpkin::scalar_t* digests,
+  uint32_t height,
+  int arity,
+  poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
+  merkle::TreeBuilderConfig& config);
+
+extern "C" cudaError_t grumpkin_mul_cuda(
+  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
+
+extern "C" cudaError_t grumpkin_add_cuda(
+  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
+
+extern "C" cudaError_t grumpkin_sub_cuda(
+  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
+
+extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
+  const grumpkin::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  grumpkin::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
+
+extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
+  grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/hash.h
+++ b/icicle/include/api/hash.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#ifndef HASH_API_H
+#define HASH_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "hash/keccak/keccak.cuh"
+
+extern "C" cudaError_t
+  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
+
+extern "C" cudaError_t
+  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
+
+#endif
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -0,0 +1,47 @@
+// WARNING: This file is auto-generated by a script.
+// Any changes made to this file may be overwritten.
+// Please modify the code generation script instead.
+// Path to the code generation script: scripts/gen_c_api.py
+
+#pragma once
+#ifndef STARK252_API_H
+#define STARK252_API_H
+
+#include <cuda_runtime.h>
+#include "gpu-utils/device_context.cuh"
+#include "fields/stark_fields/stark252.cuh"
+#include "ntt/ntt.cuh"
+#include "vec_ops/vec_ops.cuh"
+
+extern "C" cudaError_t stark252_mul_cuda(
+  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
+
+extern "C" cudaError_t stark252_add_cuda(
+  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
+
+extern "C" cudaError_t stark252_sub_cuda(
+  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
+
+extern "C" cudaError_t stark252_transpose_matrix_cuda(
+  const stark252::scalar_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  stark252::scalar_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
+
+extern "C" cudaError_t stark252_scalar_convert_montgomery(
+  stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t stark252_initialize_domain(
+  stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t stark252_ntt_cuda(
+  const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
+
+extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
+
+#endif
--- a/icicle/include/api/templates/curves/curve.h
+++ b/icicle/include/api/templates/curves/curve.h
@@ -0,0 +1,13 @@
+extern "C" bool ${CURVE}_eq(${CURVE}::projective_t* point1, ${CURVE}::projective_t* point2); // presumably point equality - TODO confirm
+// Presumably writes the affine form of point to point_out; ${CURVE} looks like a generator placeholder - TODO confirm.
+extern "C" void ${CURVE}_to_affine(${CURVE}::projective_t* point, ${CURVE}::affine_t* point_out);
+// Presumably fills points with size generated projective points - TODO confirm; returns void, so no status is reported.
+extern "C" void ${CURVE}_generate_projective_points(${CURVE}::projective_t* points, int size);
+// Presumably fills points with size generated affine points - TODO confirm; returns void, so no status is reported.
+extern "C" void ${CURVE}_generate_affine_points(${CURVE}::affine_t* points, int size);
+// Presumably converts n affine points at d_inout in place to (is_into) or from Montgomery form - TODO confirm.
+extern "C" cudaError_t ${CURVE}_affine_convert_montgomery(
+  ${CURVE}::affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+// Presumably converts n projective points at d_inout in place to (is_into) or from Montgomery form - TODO confirm.
+extern "C" cudaError_t ${CURVE}_projective_convert_montgomery(
+  ${CURVE}::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
IdoAtlas	42cffb1c88	vec ops compiles	2024-05-12 14:01:17 +03:00
IdoAtlas	d3274a9eaa	poseidon compiles	2024-05-12 13:43:47 +03:00
IdoAtlas	d31a7019fe	polynomial compiles	2024-05-12 13:28:11 +03:00
IdoAtlas	84a0d3c348	ntt compiled	2024-05-12 12:55:07 +03:00
IdoAtlas	eb87970325	msm compiled	2024-05-09 11:31:22 +03:00
IdoAtlas	a9081aabbf	build hash	2024-05-09 10:59:44 +03:00
IdoAtlas	b564c6670d	curve build	2024-05-09 10:45:01 +03:00
Otsar	1f9f3f13ea	cuda cpu empty mock, field compiles	2024-05-09 10:02:59 +03:00
Stas	41294b12e0	Stas/example poly (#434 ) ## Describe the changes Added examples for Poly API --------- Co-authored-by: Yuval Shekel <yshekel@gmail.com>	2024-05-07 11:52:13 +03:00
Jeremy Felder	6134cfe177	[DOCS]: Tidy up docs (#502 ) ## Describe the changes This PR tidies up docs and updates golang build instructions	2024-05-06 15:35:19 +03:00
VitaliiH	34f0212c0d	rust classic benches with Criterion for ecntt/msm/ntt (#499 ) Rust idiomatic benches for EC NTT, NTT, MSM	2024-05-05 10:28:41 +02:00
release-bot	f6758f3447	Bump rust crates' version icicle-babybear@2.1.0 icicle-bls12-377@2.1.0 icicle-bls12-381@2.1.0 icicle-bn254@2.1.0 icicle-bw6-761@2.1.0 icicle-core@2.1.0 icicle-cuda-runtime@2.1.0 icicle-grumpkin@2.1.0 icicle-hash@2.1.0 icicle-stark252@2.1.0 Generated by cargo-workspaces	2024-05-01 20:11:42 +00:00
nonam3e	e2ad621f97	Nonam3e/golang/keccak (#496 ) ## Describe the changes This PR adds keccak bindings + passes cfg as reference in keccak cuda functions	2024-05-01 14:08:33 +03:00
PatStiles	bdc3da98d6	FEAT(stark252 field): Adds Stark252 curve (#494 ) ## Describe the changes Adds support for the stark252 base field.	2024-05-01 14:08:05 +03:00
yshekel	36e288c1fa	fix: bug regarding MixedRadix coset (I)NTT for NM/MN ordering (#497 ) The bug is in how twiddles array is indexed when multiplied by a mixed (M) vector to implement (I)NTT on cosets. The fix is to use the DIF-digit-reverse to compute the index of the element in the natural (N) vector that moved to index 'i' in the M vector. This is emulating a DIT-digit-reverse (which is mixing like a DIF-compute) reorder of the twiddles array and element-wise multiplication without reordering the twiddles memory.	2024-04-25 18:09:27 +03:00
nonam3e	f8d15e2613	update imports in golang bindings (#498 ) ## Describe the changes This PR updates imports in golang bindings to the v2 version	2024-04-25 03:46:14 +07:00
release-bot	14b39b57cc	Bump rust crates' version icicle-babybear@2.0.1 icicle-bls12-377@2.0.1 icicle-bls12-381@2.0.1 icicle-bn254@2.0.1 icicle-bw6-761@2.0.1 icicle-core@2.0.1 icicle-cuda-runtime@2.0.1 icicle-grumpkin@2.0.1 icicle-hash@2.0.1 Generated by cargo-workspaces	2024-04-24 07:13:05 +00:00
Jeremy Felder	999167afe1	[PATCH]: Update module with v2 versioning (#495 ) ## Describe the changes This PR fixes the issue of v2 ICICLE not being discovered by Go's packaging service by adding the required "v2" to the module path: https://go.dev/doc/modules/release-workflow#breaking	2024-04-24 10:09:45 +03:00
release-bot	ff374fcac7	Bump rust crates' version icicle-babybear@2.0.0 icicle-bls12-377@2.0.0 icicle-bls12-381@2.0.0 icicle-bn254@2.0.0 icicle-bw6-761@2.0.0 icicle-core@2.0.0 icicle-cuda-runtime@2.0.0 icicle-grumpkin@2.0.0 icicle-hash@2.0.0 Generated by cargo-workspaces	2024-04-23 02:30:18 +00:00
ChickenLover	7265d18d48	ICICLE V2 Release (#492 ) This PR introduces major updates for ICICLE Core, Rust and Golang bindings --------- Co-authored-by: Yuval Shekel <yshekel@gmail.com> Co-authored-by: DmytroTym <dmytrotym1@gmail.com> Co-authored-by: Otsar <122266060+Otsar-Raikou@users.noreply.github.com> Co-authored-by: VitaliiH <vhnatyk@gmail.com> Co-authored-by: release-bot <release-bot@ingonyama.com> Co-authored-by: Stas <spolonsky@icloud.com> Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com> Co-authored-by: ImmanuelSegol <3ditds@gmail.com> Co-authored-by: JimmyHongjichuan <45908291+JimmyHongjichuan@users.noreply.github.com> Co-authored-by: pierre <pierreuu@gmail.com> Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Co-authored-by: nonam3e <timur@ingonyama.com> Co-authored-by: Vlad <88586482+vladfdp@users.noreply.github.com> Co-authored-by: LeonHibnik <leon@ingonyama.com> Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com> Co-authored-by: vladfdp <vlad.heintz@gmail.com>	2024-04-23 05:26:40 +03:00
release-bot	a1dc0539ce	Bump rust crates' version icicle-bls12-377@1.10.1 icicle-bls12-381@1.10.1 icicle-bn254@1.10.1 icicle-bw6-761@1.10.1 icicle-core@1.10.1 icicle-cuda-runtime@1.10.1 icicle-grumpkin@1.10.1 Generated by cargo-workspaces	2024-04-11 07:56:32 +00:00
Jeremy Felder	cda806ff0c	[PATCH]: Fix underflow and div by 0 (#471 ) ## Describe the changes This PR fixes an underflow in `get_optimal_c` when bitsize is less than 16 and adds `max(1,NUM_THREADS)` to prevent div by 0 when calculating NUM_THREADS	2024-04-11 10:45:34 +03:00
release-bot	8498a962f9	Bump rust crates' version icicle-bls12-377@1.10.0 icicle-bls12-381@1.10.0 icicle-bn254@1.10.0 icicle-bw6-761@1.10.0 icicle-core@1.10.0 icicle-cuda-runtime@1.10.0 icicle-grumpkin@1.10.0 Generated by cargo-workspaces	2024-04-09 10:02:34 +00:00
Leon Hibnik	a7b0dc40c1	[FEAT] ReleaseDomain API (#465 ) ## Describe the changes This PR adds a NTT ReleaseDomain API in Golang and Rust ## Linked Issues Resolves # --------- Co-authored-by: Yuval Shekel <yshekel@gmail.com>	2024-04-09 12:58:19 +03:00
Vlad	4a35eece51	transpose kernel in vec_ops and rust binding (#462 ) ## Describe the changes This PR adds an extern C link to the transpose kernel, now in vec_ops.cu. Also Rust binding, and I updated the test check_ntt_batch to use the new transpose function. The test passes. ## Linked Issues Resolves # --------- Co-authored-by: LeonHibnik <leon@ingonyama.com>	2024-04-09 08:47:33 +03:00
VitaliiH	4c9b3c00a5	Devmode to Reduce compilation time (including G2 and ECNTT) (#395 ) devmode to reduce compilation time	2024-04-09 06:09:04 +02:00
Jeremy Felder	c6719167ce	[FEAT]: golang device slice ranges (#463 ) ## Describe the changes This PR adds the capability to slice a DeviceSlice, allowing portions of data that are already on the device to be reused. Additionally, this PR removes the need for a HostSlice underlying type to implement a Size function and uses unsafe.Sizeof instead. This together with #407 will allow direct usage of gnark-crypto types with HostSlice without the need for converting to ICICLE types --------- Co-authored-by: nonam3e <timur@ingonyama.com>	2024-04-08 19:42:03 +03:00
Leon Hibnik	cd3769d6b7	Fix Golang TestNttDeviceAsync (#461 ) ## Describe the changes This PR fixes TestNttDeviceAsync by adding a missing call to initDomain ## Linked Issues Resolves #	2024-04-08 17:47:10 +03:00
DmytroTym	b93b1d0aaf	NTT inplace in Rust (#453 ) ## Describe the changes Due to Rust's ownership rules, we can't run NTT inplace using the [`ntt`](https://github.com/ingonyama-zk/icicle/blob/v1.9.1/wrappers/rust/icicle-core/src/ntt/mod.rs#L139) function. Which is why we saw a need to add a separate function a couple of times. Incidentally an issue with radix-2 NTT was found when ran inplace, `__syncthreads()` was used in reverse order kernel as if it was a global barrier for all blocks and not block-local one. Thus data race happened that is fixed by this PR.	2024-04-08 10:04:04 +03:00
Leon Hibnik	6a96eef8dc	add golang multigpu to sidebar (#449 ) This PR adds multi GPU golang documentation to dev sidebar	2024-04-08 09:20:29 +03:00
JimmyHongjichuan	95ab6de059	fix: use the log2 in lib std explicitly to prevent makefile from link… (#459 ) …ing other log2 func ## Describe the changes This PR adds "std" as prefix on log2 function of icicle/appUtils/msm/msm.cu to explicitly use std::log2 for MSM calculatation. ## Linked Issues https://github.com/ingonyama-zk/icicle/issues/458 Resolves # Co-authored-by: pierre <pierreuu@gmail.com>	2024-04-07 07:58:53 +03:00
Yuval Shekel	9c9311bee0	golang multi-device MSM test temporarily disabled due to issues related to golang tests env	2024-04-04 23:23:18 +03:00
Yuval Shekel	406020bda6	fix: NTT release domain linkage	2024-04-04 23:23:18 +03:00
release-bot	25ac705c3b	Bump rust crates' version icicle-bls12-377@1.9.1 icicle-bls12-381@1.9.1 icicle-bn254@1.9.1 icicle-bw6-761@1.9.1 icicle-core@1.9.1 icicle-cuda-runtime@1.9.1 icicle-grumpkin@1.9.1 Generated by cargo-workspaces	2024-03-27 19:00:07 +00:00
VitaliiH	ef757e8210	hotfix for large ecntt (#448 ) hotfix for large ECNTTs	2024-03-27 18:31:50 +01:00
Otsar	2c1431d904	Update Hall of fame in 'README.md' (#445 ) Added v1.8's contributors to hall of fame	2024-03-27 16:57:41 +02:00
ImmanuelSegol	77ebc4848e	Docs 1.8 (#436 )	2024-03-25 08:54:17 -04:00
Yuval Shekel	919ff42f49	fix: NTT input is const	2024-03-24 16:26:10 +02:00
release-bot	a1ff989740	Bump rust crates' version icicle-bls12-377@1.9.0 icicle-bls12-381@1.9.0 icicle-bn254@1.9.0 icicle-bw6-761@1.9.0 icicle-core@1.9.0 icicle-cuda-runtime@1.9.0 icicle-grumpkin@1.9.0 Generated by cargo-workspaces	2024-03-21 07:11:47 +00:00
Otsar	1f2144a57c	Removed "machines using ICICLE" static badge (#442 )	2024-03-21 09:04:19 +02:00
Jeremy Felder	db4c07dcaf	Golang bindings for ECNTT (#433 )	2024-03-21 09:04:00 +02:00
ChickenLover	d4f39efea3	Add Keccak hash function (#435 ) This PR adds support for Keccak-256 and Keccak-512. It only adds them in c++. There is no way of adding rust or golang wrappers rn as it requires having an `icicle-common` create / mod	2024-03-20 22:30:19 +02:00
Yuval Shekel	7293058246	fix: (golang) MSM multi device test reset to original device after test is done	2024-03-20 16:27:11 +02:00
Yuval Shekel	03136f1074	fix: (golang) add missing NttAlgorithm field in NTTConfig	2024-03-20 16:27:11 +02:00
Yuval Shekel	3ef0d0c66e	MSM scalars and points params are const - This is required to be able to compute MSM on polynomial coefficients that are accessible by const only.	2024-03-20 16:27:11 +02:00
Stas	0dff1f9302	Use multi-threaded CUDA compilation to spped up compilation (#439 ) ## Describe the changes Speed up CUDA c++ compile time using multi-threaded compilation (--split-compile flag). The tests on 8 core machine show ~2x acceleration. ## Linked Issues Compiling c++ takes long time	2024-03-18 16:40:30 -04:00
ChickenLover	0d806d96ca	tidy (#437 )	2024-03-19 00:59:10 +07:00