Compare commits


53 Commits
V2 ... v2.5.0

Author SHA1 Message Date
release-bot
3d01c09c82 Bump rust crates' version
icicle-babybear@2.5.0
icicle-bls12-377@2.5.0
icicle-bls12-381@2.5.0
icicle-bn254@2.5.0
icicle-bw6-761@2.5.0
icicle-core@2.5.0
icicle-cuda-runtime@2.5.0
icicle-grumpkin@2.5.0
icicle-hash@2.5.0
icicle-stark252@2.5.0

Generated by cargo-workspaces
2024-06-17 13:17:24 +00:00
HadarIngonyama
8936d9c800 MSM - supporting all window sizes (#534)
This PR enables using MSM with any value of c.

Note: the default c isn't necessarily optimal; the user is expected to
choose the c and precomputation factor that give the best results for
the relevant case.

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-06-17 15:57:24 +03:00
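The c/precomputation tradeoff mentioned above can be sketched with a toy cost model for Pippenger's bucket method. This is an illustrative assumption for intuition only, not ICICLE's API or its actual heuristic:

```python
# Toy cost model (not the ICICLE API): the window size c in Pippenger's
# bucket method trades window count against bucket count per window.
def pippenger_costs(scalar_bits: int, c: int, num_points: int):
    """Rough operation counts for a Pippenger-style MSM with window size c."""
    num_windows = -(-scalar_bits // c)    # ceil(scalar_bits / c)
    buckets_per_window = (1 << c) - 1     # nonzero bucket values per window
    # ~1 bucket addition per point per window, plus a bucket-reduction pass
    additions = num_points * num_windows + buckets_per_window * num_windows
    return num_windows, buckets_per_window, additions

# Smaller c -> more windows, fewer buckets; larger c -> the opposite,
# which is why no single default c is optimal for every input size.
small_c = pippenger_costs(254, 4, 1 << 16)
large_c = pippenger_costs(254, 16, 1 << 16)
```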
Jeremy Felder
af9ec76506 Fix link and correct path for running test deploy workflow (#542)
## Describe the changes

Fixes a link issue in docs preventing deployment
2024-06-17 15:44:15 +03:00
Otsar
cdd99d2a46 recreated images for poseidon.md (#541)
Fixed 3 images that were shown in low quality. I have recreated the 3
images; please review to confirm I have not made a mistake.
2024-06-17 12:16:26 +03:00
Jeremy Felder
3e551762c0 Updated alt text for images and fixed broken link 2024-06-16 18:35:42 +03:00
Otsar
37c22e81e7 Update poseidon.md
Fixed: added arrows
2024-06-16 15:01:12 +03:00
Otsar
69e73ffa3e Update poseidon.md
Fixed image quality
2024-06-16 11:42:46 +03:00
cangqiaoyuzhuo
512e1ca372 chore: remove repeat word (#540)
## Describe the changes

Remove a repeated word

## Linked Issues

Resolves #

Signed-off-by: cangqiaoyuzhuo <850072022@qq.com>
2024-06-13 11:53:22 +03:00
VitaliiH
e19a869691 accumulate stwo (#535)
Adds in-place vector addition, exposed in the API as accumulate
2024-06-10 12:24:58 +02:00
yshekel
9c55d888ae workflow curve fix (#536) 2024-06-09 11:18:23 +03:00
release-bot
18f51de56c Bump rust crates' version
icicle-babybear@2.4.0
icicle-bls12-377@2.4.0
icicle-bls12-381@2.4.0
icicle-bn254@2.4.0
icicle-bw6-761@2.4.0
icicle-core@2.4.0
icicle-cuda-runtime@2.4.0
icicle-grumpkin@2.4.0
icicle-hash@2.4.0
icicle-stark252@2.4.0

Generated by cargo-workspaces
2024-06-06 14:42:36 +00:00
yshekel
33b1f3c794 perf: projective scalar multiplication use dbl() rather than + (#530) 2024-06-05 20:35:21 +03:00
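The perf fix above prefers a dedicated doubling formula over generic addition inside scalar multiplication. A minimal double-and-add sketch (integers stand in for projective points; `dbl` and `add` are illustrative stand-ins, not ICICLE's functions):

```python
# Illustrative sketch: double-and-add scalar multiplication, where the
# doubling step calls a dedicated dbl() (cheaper for projective points
# than the generic addition formula). Integers stand in for group elements.
def dbl(p):
    # stand-in for the specialized point-doubling formula
    return p + p

def add(p, q):
    # stand-in for the generic (more expensive) point addition
    return p + q

def scalar_mul(k: int, p):
    result, addend = 0, p
    while k:
        if k & 1:
            result = add(result, addend)
        addend = dbl(addend)  # was effectively add(addend, addend) before
        k >>= 1
    return result
```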
Karthik Inbasekar
3a276ef23c added example cpp: example_commit_with_device_memory_view() (#532)
## Describe the changes

This PR adds an example of a simple commitment that makes use of
polynomial views. Output is attached below.

```
Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2 ]_1 = [2 (f1^2+ f_2^2)]_1
Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1
Setup: Generating mock SRS
Setup: SRS of length 1025 generated and loaded to device. Took: 19557 milliseconds
Setup: Generating polys (on device) f1,f2 of log degree 10
Setup: Gen poly done. Took: 7 milliseconds
Computing constraints..start 
Computing constraints..done. Took: 0 milliseconds
Computing Commitments with poly view
Commitments done. Took: 29 milliseconds
commitment [(f1+f2)^2 + (f1-f2)^2]_1:
[x: 0x1e35d81da10e5026dacdd907d6ed0dde673de449ff8c0137ec6acbfd6b1dfe1b, y: 0x21fc051415af35a781f84ebcf999313d489ae38ebefa561c9de2fb0b11091502]
commitment [2 (f_1^2+f_2^2)]_1:
[x: 0x1e35d81da10e5026dacdd907d6ed0dde673de449ff8c0137ec6acbfd6b1dfe1b, y: 0x21fc051415af35a781f84ebcf999313d489ae38ebefa561c9de2fb0b11091502]
commitment [(f1+f2)^2 - (f1-f2)^2]_1:
[x: 0x21e9dc012aef8d95107fbfe63f455d4345b9b21e37bcb0a49043b1066e211ffa, y: 0x2d6a3b2f1be1042a17c58ff595134b9cceb71d1af4f1c67a5696859cd4bafae3]
commitment [4 f_1*f_2]_1:
[x: 0x21e9dc012aef8d95107fbfe63f455d4345b9b21e37bcb0a49043b1066e211ffa, y: 0x2d6a3b2f1be1042a17c58ff595134b9cceb71d1af4f1c67a5696859cd4bafae3]
```

## Linked Issues

Resolves #
2024-06-05 18:25:12 +03:00
nonam3e
8e62bde16d bit reverse (#528)
This PR adds bit reverse operation support to icicle
2024-06-02 16:37:58 +07:00
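The bit-reverse operation added above permutes a vector so that element i moves to the index whose binary digits are i's reversed; it is the standard reordering between natural and bit-reversed NTT layouts. A minimal sketch of the operation (not ICICLE's CUDA implementation):

```python
def bit_reverse(x: int, nbits: int) -> int:
    """Reverse the lowest nbits bits of x (used to reorder NTT inputs/outputs)."""
    r = 0
    for _ in range(nbits):
        r = (r << 1) | (x & 1)
        x >>= 1
    return r

def bit_reverse_permute(v):
    """Return v reordered so index i holds v[bit_reverse(i)]."""
    n = len(v)                  # assumed to be a power of two
    nbits = n.bit_length() - 1
    return [v[bit_reverse(i, nbits)] for i in range(n)]
```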
Jeremy Felder
417ca77f61 precompute bug fix (#529)
This PR fixes 2 things:

1. Removes the assertion regarding the precompute factor needing to be a
power of 2. There is no such requirement and it works just fine for
other values too.
2. Fixes the average bucket size for the large buckets threshold - it
depends on the precompute factor.
2024-05-29 13:59:48 +03:00
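Point 2 above can be sketched with a simple counting model: with precompute factor f, each physical window receives roughly f times as many contributions, so the average bucket size scales with f. This is an assumed illustrative model, not ICICLE's exact large-bucket heuristic:

```python
def avg_bucket_size(num_points: int, c: int, precompute_factor: int) -> float:
    """Toy model: average contributions per nonzero bucket in one window of a
    Pippenger MSM with precomputation. Each physical window receives about
    num_points * precompute_factor contributions, spread over 2^c - 1 buckets.
    (Illustrative assumption, not ICICLE's actual formula.)"""
    return num_points * precompute_factor / ((1 << c) - 1)
```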
hadaringonyama
8911a32135 precompute bug fix 2024-05-28 12:48:48 +03:00
release-bot
c6f6e61d60 Bump rust crates' version
icicle-babybear@2.3.1
icicle-bls12-377@2.3.1
icicle-bls12-381@2.3.1
icicle-bn254@2.3.1
icicle-bw6-761@2.3.1
icicle-core@2.3.1
icicle-cuda-runtime@2.3.1
icicle-grumpkin@2.3.1
icicle-hash@2.3.1
icicle-stark252@2.3.1

Generated by cargo-workspaces
2024-05-20 13:43:32 +00:00
yshekel
4e3aa63d2f fix: ntt mixed-radix bug for large ntts (>4G elements) (#523)
In some cases, 32-bit values would wrap around and cause invalid accesses
to the wrong elements and memory addresses.
2024-05-20 16:42:44 +03:00
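The wraparound described above can be reproduced with fixed-width arithmetic: for NTTs larger than 2^32 elements, an offset computed in 32 bits silently truncates. A small sketch emulating this (the function name is hypothetical, for illustration only):

```python
# Illustrative sketch: how a 32-bit index computation wraps for NTTs with
# more than 4G (2^32) elements, while a 64-bit computation does not.
def element_offset(block: int, block_size: int, bits: int = 32) -> int:
    """Compute block * block_size truncated to `bits` bits, emulating
    fixed-width integer arithmetic."""
    return (block * block_size) & ((1 << bits) - 1)

n = 1 << 33                             # an NTT larger than 4G elements
bad = element_offset(3, n, bits=32)     # wraps to 0 -> wrong address
good = element_offset(3, n, bits=64)    # 3 * 2^33, as intended
```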
Leon Hibnik
db298aefc1 [HOTFIX] rust msm benchmarks (#521)
## Describe the changes

removes an unused host-to-device copy, adds a minimum size limit for
running MSM benchmarks
2024-05-20 13:51:53 +03:00
yshekel
19a9b76d64 fix: cmake set_gpu_env() and windows build (#520) 2024-05-20 13:05:45 +03:00
Jeremy Felder
1e343f17a3 Allow overriding compiler's chosen GPU arch via cmake (#518)
## Describe the changes

This PR modifies icicle/cmake/Common.cmake to set
CMAKE_CUDA_ARCHITECTURES to ${CUDA_ARCH} if the user defines the arch,
or to native if the cmake version is greater than or equal to 3.24.0.
This change has been successfully tested with cmake 3.22.0 and 3.25.2.

## Linked Issues

Resolves #167.
2024-05-19 16:03:15 +03:00
liuhao230
cfea6ebb3b Merge branch 'ingonyama-zk:main' into main 2024-05-17 14:24:02 +08:00
release-bot
76a82bf88e Bump rust crates' version
icicle-babybear@2.3.0
icicle-bls12-377@2.3.0
icicle-bls12-381@2.3.0
icicle-bn254@2.3.0
icicle-bw6-761@2.3.0
icicle-core@2.3.0
icicle-cuda-runtime@2.3.0
icicle-grumpkin@2.3.0
icicle-hash@2.3.0
icicle-stark252@2.3.0

Generated by cargo-workspaces
2024-05-17 04:42:17 +00:00
Vlad
b8310d577e Feat/vlad/poseidon go binding (#513) 2024-05-17 07:20:15 +03:00
liu
49c7fa4b28 fix: add the PARENT_SCOPE
Signed-off-by: liu <liuhao2206@buaa.edu.cn>
2024-05-17 10:45:09 +08:00
Stas
02059fcfaa Stas/best-practice-ntt (#517)
## Describe the changes

Icicle examples:  Concurrent Data Transfer and NTT Computation

This PR introduces a Best Practice series of examples in c++.
Specifically, the example shows how to concurrently transfer data
to/from device and execute NTT

## Linked Issues

Resolves #
2024-05-16 23:51:49 +03:00
nonam3e
4496520a10 golang examples init (#516)
## Describe the changes

This PR adds golang examples

---------

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-05-16 19:40:13 +03:00
liu
88a6966a4b Allow overriding compiler's chosen GPU arch via cmake 2024-05-15 22:40:51 +08:00
yshekel
9c1afe8a44 Polynomial API views replaced by evaluation on rou domain (#514)
- removed the poly API for accessing a view of evaluations. This was a problematic API since it cannot handle small domains, and for large domains it requires the polynomial to use more memory than it needs to.
- added an evaluate_on_rou_domain() API instead that supports any power-of-two domain size.
- the new API can compute to HOST or DEVICE memory
- Rust wrapper for evaluate_on_rou_domain()
- updated documentation: overview and Rust wrappers
- faster division by a vanishing polynomial for the common case where the numerator has degree 2N and the vanishing polynomial has degree N.
- allow division a/b where deg(a) < deg(b) instead of throwing an error.
2024-05-15 14:06:23 +03:00
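The fast vanishing-polynomial division mentioned above follows from a simple identity: writing a(x) = high(x)·x^N + low(x) and using x^N = (x^N − 1) + 1 gives a = (x^N − 1)·high + (low + high), so the quotient is `high` and the remainder is `low + high`, with no general long division. A coefficient-level sketch (not ICICLE's CUDA code):

```python
def divide_by_vanishing(coeffs, n):
    """Divide a(x) (little-endian coeffs, deg(a) < 2n) by v(x) = x^n - 1.
    Returns (quotient, remainder) as little-endian coefficient lists."""
    low = (coeffs[:n] + [0] * n)[:n]
    high = (coeffs[n:] + [0] * n)[:n]
    # a = high * x^n + low, and x^n = v(x) + 1, so a = v * high + (low + high)
    return high, [l + h for l, h in zip(low, high)]
```

For example, dividing x^3 + 2x^2 + 3x + 4 by x^2 − 1 gives quotient x + 2 and remainder 4x + 6.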
Jeremy Felder
972b924bc0 Update CI to run on some non-code changes (#515)
## Describe the changes

This PR:
- Updates the CI to run on CI workflow file changes
- Updates examples CI to run on examples file changes
2024-05-15 13:17:13 +03:00
sukrucildirr
230a1da512 Fix broken link (#512)
## Describe the changes

There was a broken link on the word ZKContainer.

## Linked Issues

Resolves #
2024-05-14 08:36:39 +07:00
release-bot
940b283c47 Bump rust crates' version
icicle-babybear@2.2.0
icicle-bls12-377@2.2.0
icicle-bls12-381@2.2.0
icicle-bn254@2.2.0
icicle-bw6-761@2.2.0
icicle-core@2.2.0
icicle-cuda-runtime@2.2.0
icicle-grumpkin@2.2.0
icicle-hash@2.2.0
icicle-stark252@2.2.0

Generated by cargo-workspaces
2024-05-09 12:27:17 +00:00
Leon Hibnik
e0412183fd syntax highlight (#511)
## Describe the changes
adds syntax highlighting to `rust` and `go`
2024-05-09 15:23:20 +03:00
ChickenLover
9da52bc09f Feat/roman/poseidon2 (#510)
# This PR

1. Adds C++ API
2. Renames a lot of API functions
3. Adds inplace poseidon2
4. Makes input const in all poseidon functions
5. Adds benchmark for poseidon2
2024-05-09 19:19:55 +07:00
VitaliiH
49079d0d2a rust ecntt hotfix (#509)
## Describe the changes

This PR fixes Rust ECNTT benches and tests


---------

Co-authored-by: VitaliiH <Vitaliy@ingo>
2024-05-09 11:21:21 +03:00
ChickenLover
094683d291 Feat/roman/poseidon2 (#507)
This PR adds support for the poseidon2 permutation function as described
in https://eprint.iacr.org/2023/323.pdf

Reference implementations used (and compared against):
https://github.com/HorizenLabs/poseidon2/tree/main
https://github.com/Plonky3/Plonky3/tree/main

Tasks:

- [x] Remove commented code and prints
- [ ] Add doc-comments to functions and structs
- [x] Fix possible issue with Plonky3 imports
- [x] Update NTT/Plonky3 test
- [x] Add Plonky3-bn254 test (impossible)
2024-05-09 15:13:43 +07:00
nonam3e
c30e333819 keccak docs (#508)
This PR adds keccak docs

---------

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-05-08 23:18:59 +03:00
yshekel
2905d2a469 fix: bug regarding polynomial evaluations view in CUDA backend (#506)
fixing:
(1) polynomials and tests not being built for the grumpkin curve (no NTT)
(2) polynomial API C++ example compilation and (once compilation is
fixed) memory corruption
(3) a bug in the poly CUDA backend regarding transformation to
evaluations in some cases
2024-05-08 21:02:18 +03:00
Jeremy Felder
732ee51552 [CI]: Update Cpp CI to include build args (#503)
## Describe the changes

This PR adds build args to the Cpp CI and adds grumpkin curve and
stark252 field
2024-05-08 14:35:02 +03:00
Jeremy Felder
14997566ff [FIX]: Fix releasing device set on host thread during multigpu call (#501)
## Describe the changes

This PR fixes an issue when `RunOnDevice` is called for multi-gpu while
other goroutines calling device operations are run outside of
`RunOnDevice`. The issue comes from setting a device other than the
default device (device 0) on a host thread within `RunOnDevice` and not
unsetting that host threads device when `RunOnDevice` finishes.

When `RunOnDevice` locks a host thread to ensure that all other calls in
the goroutine run on the same device, it never unsets that thread's
device. Once the thread is unlocked, other goroutines can get scheduled
to it, but the thread still has the device set to whatever it was while
locked, so it's possible that the following sequence happens:

1. NTT domain is initialized on thread 2 via a goroutine on device 0
2. MSM multiGPU test runs and is locked on thread 3 setting its device
to 1
3. Other tests run concurrently on threads other than 3 (since it is
locked)
4. MSM multiGPU test finishes and releases thread 3 back to the pool, but
its device is still 1
5. NTT test runs and is assigned to thread 3 --> this will fail because
the thread's device was never reset

We really only want to set a thread's device while the thread is locked.
Once we unlock a thread, its device should return to whatever it was set
to originally. In theory, it should always be 0 if `SetDevice` is never
used outside of `RunOnDevice` - which it shouldn't be in most
situations.
2024-05-08 14:07:29 +03:00
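The stale-device bug above reduces to per-thread state that is set but never restored. A minimal simulation of the before/after behavior (the class and function names are hypothetical stand-ins for Go's locked OS threads and `RunOnDevice`, not the actual binding code):

```python
# Illustrative sketch of the bug: per-thread device state that is set while
# a goroutine holds the thread, but never reset when the thread is released.
class Thread:
    def __init__(self):
        self.device = 0          # every host thread starts on device 0

def run_on_device_buggy(thread, device, work):
    thread.device = device
    work()
    # bug: thread.device stays as `device` after the thread is released,
    # so the next goroutine scheduled onto it inherits the wrong device

def run_on_device_fixed(thread, device, work):
    previous = thread.device
    thread.device = device
    try:
        work()
    finally:
        thread.device = previous  # fix: restore the device on release

t_buggy = Thread()
run_on_device_buggy(t_buggy, 1, lambda: None)   # t_buggy.device is now stale (1)
t_fixed = Thread()
run_on_device_fixed(t_fixed, 1, lambda: None)   # t_fixed.device is back to 0
```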
Otsar
a56435d2e8 Updated hall of fame (#505)
## Describe the changes

Adds Patrick to Hall of fame
2024-05-07 14:41:38 +03:00
Stas
41294b12e0 Stas/example poly (#434)
## Describe the changes

Added examples for Poly API

---------

Co-authored-by: Yuval Shekel <yshekel@gmail.com>
2024-05-07 11:52:13 +03:00
Jeremy Felder
6134cfe177 [DOCS]: Tidy up docs (#502)
## Describe the changes

This PR tidies up docs and updates golang build instructions
2024-05-06 15:35:19 +03:00
VitaliiH
34f0212c0d rust classic benches with Criterion for ecntt/msm/ntt (#499)
Rust idiomatic benches for EC NTT, NTT, MSM
2024-05-05 10:28:41 +02:00
release-bot
f6758f3447 Bump rust crates' version
icicle-babybear@2.1.0
icicle-bls12-377@2.1.0
icicle-bls12-381@2.1.0
icicle-bn254@2.1.0
icicle-bw6-761@2.1.0
icicle-core@2.1.0
icicle-cuda-runtime@2.1.0
icicle-grumpkin@2.1.0
icicle-hash@2.1.0
icicle-stark252@2.1.0

Generated by cargo-workspaces
2024-05-01 20:11:42 +00:00
nonam3e
e2ad621f97 Nonam3e/golang/keccak (#496)
## Describe the changes

This PR adds keccak bindings + passes cfg as reference in keccak cuda functions
2024-05-01 14:08:33 +03:00
PatStiles
bdc3da98d6 FEAT(stark252 field): Adds Stark252 curve (#494)
## Describe the changes

Adds support for the stark252 base field.
2024-05-01 14:08:05 +03:00
yshekel
36e288c1fa fix: bug regarding MixedRadix coset (I)NTT for NM/MN ordering (#497)
The bug is in how the twiddles array is indexed when multiplied by a
mixed-order (M) vector to implement (I)NTT on cosets.
The fix is to use the DIF digit-reverse to compute the index of the
element in the natural-order (N) vector that moved to index 'i' in the M
vector. This emulates a DIT digit-reverse (which mixes like a DIF
compute) reorder of the twiddles array and element-wise multiplication,
without reordering the twiddles in memory.
2024-04-25 18:09:27 +03:00
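Digit-reverse generalizes bit-reverse to mixed radices: an index is decomposed into digits under a sequence of radices, and the digits are reassembled in reverse order. A minimal sketch of the indexing primitive (not the CUDA kernel's implementation):

```python
def digit_reverse(x: int, radices) -> int:
    """Mixed-radix digit reversal: decompose x into digits under `radices`
    (least-significant radix first), then reassemble with the digit order
    reversed. With radices all equal to 2 this is plain bit reversal."""
    digits = []
    for r in radices:            # extract digits, least significant first
        digits.append(x % r)
        x //= r
    y = 0
    for r, d in zip(radices, digits):   # reassemble with digits reversed
        y = y * r + d
    return y
```

Reversing with the radix list reversed undoes the permutation, which is the sense in which a DIF-style reverse can emulate a DIT-style one.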
nonam3e
f8d15e2613 update imports in golang bindings (#498)
## Describe the changes

This PR updates imports in golang bindings to the v2 version
2024-04-25 03:46:14 +07:00
release-bot
14b39b57cc Bump rust crates' version
icicle-babybear@2.0.1
icicle-bls12-377@2.0.1
icicle-bls12-381@2.0.1
icicle-bn254@2.0.1
icicle-bw6-761@2.0.1
icicle-core@2.0.1
icicle-cuda-runtime@2.0.1
icicle-grumpkin@2.0.1
icicle-hash@2.0.1

Generated by cargo-workspaces
2024-04-24 07:13:05 +00:00
Jeremy Felder
999167afe1 [PATCH]: Update module with v2 versioning (#495)
## Describe the changes

This PR fixes the issue of v2 ICICLE not being discovered by Go's
packaging service by adding the required "v2" to the module path:
https://go.dev/doc/modules/release-workflow#breaking
2024-04-24 10:09:45 +03:00
release-bot
ff374fcac7 Bump rust crates' version
icicle-babybear@2.0.0
icicle-bls12-377@2.0.0
icicle-bls12-381@2.0.0
icicle-bn254@2.0.0
icicle-bw6-761@2.0.0
icicle-core@2.0.0
icicle-cuda-runtime@2.0.0
icicle-grumpkin@2.0.0
icicle-hash@2.0.0

Generated by cargo-workspaces
2024-04-23 02:30:18 +00:00
ChickenLover
7265d18d48 ICICLE V2 Release (#492)
This PR introduces major updates for ICICLE Core, Rust and Golang
bindings

---------

Co-authored-by: Yuval Shekel <yshekel@gmail.com>
Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
Co-authored-by: Otsar <122266060+Otsar-Raikou@users.noreply.github.com>
Co-authored-by: VitaliiH <vhnatyk@gmail.com>
Co-authored-by: release-bot <release-bot@ingonyama.com>
Co-authored-by: Stas <spolonsky@icloud.com>
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
Co-authored-by: JimmyHongjichuan <45908291+JimmyHongjichuan@users.noreply.github.com>
Co-authored-by: pierre <pierreuu@gmail.com>
Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
Co-authored-by: nonam3e <timur@ingonyama.com>
Co-authored-by: Vlad <88586482+vladfdp@users.noreply.github.com>
Co-authored-by: LeonHibnik <leon@ingonyama.com>
Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com>
Co-authored-by: vladfdp <vlad.heintz@gmail.com>
2024-04-23 05:26:40 +03:00
369 changed files with 13355 additions and 1728 deletions


@@ -3,8 +3,11 @@ golang:
- wrappers/golang/**/*.h
- wrappers/golang/**/*.tmpl
- go.mod
- .github/workflows/golang.yml
rust:
- wrappers/rust/**/*
- '!wrappers/rust/README.md'
- .github/workflows/rust.yml
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh
@@ -12,4 +15,11 @@ cpp:
- icicle/**/*.hpp
- icicle/**/*.c
- icicle/**/*.h
- icicle/CMakeLists.txt
- icicle/CMakeLists.txt
- .github/workflows/cpp_cuda.yml
- icicle/cmake/Common.cmake
- icicle/cmake/CurvesCommon.cmake
- icicle/cmake/FieldsCommon.cmake
examples:
- examples/**/*
- .github/workflows/examples.yml


@@ -12,6 +12,9 @@ on:
cpp_cuda:
description: "Flag for if C++/CUDA files changed"
value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
examples:
description: "Flag for if example files changed"
value: ${{ jobs.check-changed-files.outputs.examples }}
jobs:
check-changed-files:
@@ -21,6 +24,7 @@ jobs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
examples: ${{ steps.changed_files.outputs.examples }}
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -37,3 +41,4 @@ jobs:
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
echo "examples=${{ steps.changed-files-yaml.outputs.examples_any_modified }}" >> "$GITHUB_OUTPUT"


@@ -35,7 +35,18 @@ jobs:
needs: [check-changed-files, check-format]
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
curve:
- name: bn254
build_args: -DG2=ON -DECNTT=ON
- name: bls12_381
build_args: -DG2=ON -DECNTT=ON
- name: bls12_377
build_args: -DG2=ON -DECNTT=ON
- name: bw6_761
build_args: -DG2=ON -DECNTT=ON
- name: grumpkin
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -44,7 +55,7 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -S . -B build
cmake --build build -j
- name: Run C++ curve Tests
working-directory: ./icicle/build/tests
@@ -57,7 +68,11 @@ jobs:
needs: [check-changed-files, check-format]
strategy:
matrix:
field: [babybear]
field:
- name: babybear
build_args: -DEXT_FIELD=ON
- name: stark252
build_args: -DEXT_FIELD=OFF
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -66,7 +81,7 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field.name }} ${{ matrix.field.build_args }} -S . -B build
cmake --build build -j
- name: Run C++ field Tests
working-directory: ./icicle/build/tests


@@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v4
- name: c++ examples
working-directory: ./examples/c++
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
if: needs.check-changed-files.outputs.cpp_cuda == 'true' || needs.check-changed-files.outputs.examples == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +47,7 @@ jobs:
done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true'
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.examples == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do


@@ -34,7 +34,7 @@ jobs:
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
build-curves-linux:
name: Build curves on Linux
name: Build and test curves on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -60,19 +60,18 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single curve with the curve's specified build args
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }}
- name: Test
working-directory: ./wrappers/golang/curves
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
retention-days: 1
run: |
CURVE=$(echo ${{ matrix.curve.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$CURVE/tests -count=1 -failfast -p 2 -timeout 60m -v
build-fields-linux:
name: Build fields on Linux
name: Build and test fields on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -90,20 +89,25 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single field with the fields specified build args
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }}
- name: Test
working-directory: ./wrappers/golang/fields
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
retention-days: 1
test-linux:
name: Test on Linux
run: |
FIELD=$(echo ${{ matrix.field.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$FIELD/tests -count=1 -failfast -p 2 -timeout 60m -v
build-hashes-linux:
name: Build and test hashes on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-curves-linux, build-fields-linux]
needs: [check-changed-files, check-format]
strategy:
matrix:
hash:
- name: keccak
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -111,20 +115,18 @@ jobs:
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Download ICICLE lib artifacts
uses: actions/download-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
path: ./icicle/build/lib
merge-multiple: true
- name: Run Tests
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# -count ensures the test results are not cached
# -p controls the number of programs that can be run in parallel
# builds a single hash algorithm with the hash's specified build args
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }}
- name: Test
working-directory: ./wrappers/golang/hash
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
HASH=$(echo ${{ matrix.hash.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./... -count=1 -failfast -p 2 -timeout 60m
go test ./$HASH/tests -count=1 -failfast -p 2 -timeout 60m -v
# TODO: bw6 on windows requires more memory than the standard runner has
# Add a large runner and then enable this job


@@ -60,10 +60,10 @@ jobs:
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Running tests from the root workspace will run all workspace members' tests by default
# We need to limit the number of threads to avoid running out of memory on weaker machines
# ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed sperately
# ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
run: |
cargo test --workspace --exclude icicle-babybear --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --release --verbose --features=g2 -- --test-threads=2
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2
- name: Run baby bear tests
working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
@@ -72,26 +72,34 @@ jobs:
cargo test --release --verbose -- --ignored
cargo test --release --verbose
build-windows:
name: Build on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Download and Install Cuda
- name: Run stark252 tests
working-directory: ./wrappers/rust/icicle-fields/icicle-stark252
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
run: |
cargo test --release --verbose -- --ignored
cargo test --release --verbose
# build-windows:
# name: Build on Windows
# runs-on: windows-2022
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v4
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# id: cuda-toolkit
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
# - name: Build targets
# working-directory: ./wrappers/rust
# if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# env:
# CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
# # Building from the root workspace will build all members of the workspace by default
# run: cargo build --release --verbose


@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- 'docs/*'
- 'docs/**'
jobs:
test-deploy:

.gitignore vendored

@@ -8,6 +8,7 @@
*.so
*.nsys-rep
*.ncu-rep
*.sage.py
**/target
**/.vscode
**/.*lock*csv#
@@ -17,5 +18,3 @@
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build*
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt


@@ -119,6 +119,7 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
- [PatStiles](https://github.com/PatStiles), for adding Stark252 field
## Help & Support


@@ -2,34 +2,54 @@
ICICLE Core is a library written in C++/CUDA. All the ICICLE primitives are implemented within ICICLE Core.
The Core is split into logical modules that can be compiled into static libraries using different [strategies](#compilation-strategies). You can then [link](#linking) these libraries with your C++ project or write your own [bindings](#writing-new-bindings-for-icicle) for other programming languages. If you want to use ICICLE with existing bindings please refer to [Rust](/icicle/rust-bindings) / [Golang](/icicle/golang-bindings).
The Core is split into logical modules that can be compiled into static libraries using different [strategies](#compilation-strategies). You can then [link](#linking) these libraries with your C++ project or write your own [bindings](#writing-new-bindings-for-icicle) for other programming languages. If you want to use ICICLE with existing bindings please refer to the [Rust](/icicle/rust-bindings) or [Golang](/icicle/golang-bindings) bindings documentation.
## Supported curves, fields and operations
### Supported curves and operations
| Operation\Curve | [bn254](https://neuromancer.sk/std/bn/bn254) | [bls12-377](https://neuromancer.sk/std/bls/BLS12-377) | [bls12-381](https://neuromancer.sk/std/bls/BLS12-381) | [bw6-761](https://eprint.iacr.org/2020/351) | grumpkin |
| --- | :---: | :---: | :---: | :---: | :---: |
| [MSM][MSM_DOCS] | ✅ | ✅ | ✅ | ✅ | ✅ |
| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
| [NTT][NTT_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
| [VecOps][VECOPS_CODE] | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Polynomials][POLY_DOCS] | ✅ | ✅ | ✅ | ✅ | ❌ |
| [Poseidon](primitives/poseidon) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [Merkle Tree](primitives/poseidon#the-tree-builder) | ✅ | ✅ | ✅ | ✅ | ✅ |
### Supported fields and operations
| Operation\Field | [babybear](https://eprint.iacr.org/2023/824.pdf) | [Stark252](https://docs.starknet.io/documentation/architecture_and_concepts/Cryptography/p-value/) |
| --- | :---: | :---: |
| [VecOps][VECOPS_CODE] | ✅ | ✅ |
| [Polynomials][POLY_DOCS] | ✅ | ✅ |
| [NTT][NTT_DOCS] | ✅ | ✅ |
| Extension Field | ✅ | ❌ |
### Supported hashes
| Hash | Sizes |
| --- | :---: |
| Keccak | 256, 512 |
## Compilation strategies
Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields - you just compile ICICLE into multiple static libraries. It's that simple. Currently, the following choices are supported:
Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields, you compile ICICLE once per curve or field that is needed. It's that simple. Currently, the following choices are supported:
- [Field mode](#compiling-for-a-field) - used for STARK fields like BabyBear / Mersenne / Goldilocks. Includes field arithmetic, NTT, Poseidon, Extension fields and other primitives.
- [Curve mode](#compiling-for-a-curve) - used for SNARK curves like BN254/ BLS curves / Grumpkin / etc. Curve mode is built upon field mode, so it includes everything that field does. It also includes curve operations / MSM / ECNTT / G2 and other curve-related primitives.
- [Field mode][COMPILE_FIELD_MODE] - used for STARK fields like BabyBear / Mersenne / Goldilocks. Includes field arithmetic, NTT, Poseidon, Extension fields and other primitives.
- [Curve mode][COMPILE_CURVE_MODE] - used for SNARK curves like BN254 / BLS curves / Grumpkin / etc. Curve mode is built upon field mode, so it includes everything that field does. It also includes curve operations / MSM / ECNTT / G2 and other curve-related primitives.
:::info
If you only want to use a curve's scalar or base field, you still need to use curve mode. You can disable MSM with [options](#compilation-options).
:::
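Compiling "once per curve or field" just means one independent configure-and-build pass per target. A sketch of building two curves side by side (the `-DCURVE` flag and build-directory names here are illustrative, following the same pattern as the `-DFIELD` flag shown below):

```sh
cd icicle
# one configure + build per curve; each pass produces its own static library
cmake -DCURVE=bn254 -S . -B build_bn254
cmake --build build_bn254 -j
cmake -DCURVE=bls12_381 -S . -B build_bls12_381
cmake --build build_bls12_381 -j
```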
### Compiling for a field
ICICLE supports the following STARK fields:
- [BabyBear](https://eprint.iacr.org/2023/824.pdf)
Field mode includes:
- [Field arithmetic](https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/fields/field.cuh) - field multiplication, addition, subtraction
- [NTT](icicle/primitives/ntt) - FFT / iFFT
- [Poseidon Hash](icicle/primitives/poseidon)
- [Vector operations](https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/vec_ops/vec_ops.cuh)
- [Polynomial](#) - structs and methods to work with polynomials
You can compile ICICLE for a field using this command:
```sh
cd icicle
cmake -DFIELD=<FIELD> -S . -B build
cmake --build build -j
```
ICICLE supports the following `<FIELD>` fields:
- `babybear`
This command will output `libingo_field_<FIELD>.a` into `build/lib`.
### Compiling for a curve
ICICLE supports the following SNARK curves:
- [BN254](https://neuromancer.sk/std/bn/bn254)
- [BLS12-377](https://neuromancer.sk/std/bls/BLS12-377)
- [BLS12-381](https://neuromancer.sk/std/bls/BLS12-381)
- [BW6-761](https://eprint.iacr.org/2020/351)
- Grumpkin
Curve mode includes everything you can find in field mode with the addition of:
- [MSM](icicle/primitives/msm) - MSM / Batched MSM
- [ECNTT](#)
:::note
Field-related primitives will be compiled for the scalar field of the curve.
:::
There exist multiple options that allow you to customize your build or enable additional features.
#### EXT_FIELD
Used only in [field mode][COMPILE_FIELD_MODE] to add an Extension field. Adds all supported field operations for the extension field.
Default: `OFF`
Usage: `-DEXT_FIELD=ON`
#### G2
Used only in [curve mode][COMPILE_CURVE_MODE] to add G2 definitions. Also adds G2 MSM.
Default: `OFF`
Usage: `-DG2=ON`
#### ECNTT
Used only in [curve mode][COMPILE_CURVE_MODE] to add ECNTT function.
Default: `OFF`
Usage: `-DECNTT=ON`
#### MSM
Used only in [curve mode][COMPILE_CURVE_MODE] to add MSM function. As MSM takes a lot of time to build, you can disable it with this option to reduce compilation time.
Default: `ON`
To link ICICLE with your project you first need to compile ICICLE with options of your choice.
Refer to our [c++ examples](https://github.com/ingonyama-zk/icicle/tree/main/examples/c%2B%2B) for more info. Take a look at this [CMakeLists.txt](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/CMakeLists.txt#L22)
## Writing new bindings for ICICLE
Since ICICLE Core is written in CUDA / C++, it's really simple to generate static libraries. These static libraries can be installed on any system and called by higher-level languages such as Golang.
Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules, so your static library may include only the specific features you need.
Let's review the [Golang bindings][GOLANG_BINDINGS] since it's a fairly verbose example (compared to Rust, which hides it well) of using static libraries. Golang has a tool named `cgo` which can be used to link static libraries. Here's a basic example of how you can use cgo to link these libraries:
```go
/*
// the cgo directives below are illustrative; point them at the ICICLE
// headers and static libraries you built
#cgo CFLAGS: -I./include/
#cgo LDFLAGS: -L./lib -lingo_curve_bn254
#include "projective.h"
*/
import "C"

func main() {
	// call a function exported by the static library, e.g.
	// C.projective_from_affine_bn254(...)
}
```
The comments on the first line tell `CGO` which libraries to import as well as which header files to include. You can then call methods which are part of the static library and defined in the header file, `C.projective_from_affine_bn254` is an example.
If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
<!-- Begin Links -->
[GOLANG_BINDINGS]: golang-bindings.md
[COMPILE_CURVE_MODE]: #compiling-for-a-curve
[COMPILE_FIELD_MODE]: #compiling-for-a-field
[NTT_DOCS]: primitives/ntt
[MSM_DOCS]: primitives/msm
[POLY_DOCS]: polynomials/overview
[VECOPS_CODE]: https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/vec_ops/vec_ops.cuh
<!-- End Links -->

# Golang bindings
Golang bindings allow you to use ICICLE as a golang library.
The source code for all Golang packages can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
The Golang bindings are comprised of multiple packages.
[`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
Each supported curve, field, and hash has its own package which you can find in the respective directories [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang). If your project uses BN254 you only need to import that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
## Using ICICLE Golang bindings in your project
For a specific commit:

```sh
go get github.com/ingonyama-zk/icicle@<commit_id>
```
To build the shared libraries you can run [this](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/build.sh) script:
```sh
./build.sh [-curve=<curve>] [-field=<field>] [-hash=<hash>] [-cuda_version=<version>] [-g2] [-ecntt] [-devmode]
curve - The name of the curve to build or "all" to build all supported curves
field - The name of the field to build or "all" to build all supported fields
hash - The name of the hash to build or "all" to build all supported hashes
-g2 - Optional - build with G2 enabled
-ecntt - Optional - build with ECNTT enabled
-devmode - Optional - build in devmode
-help - Optional - Displays usage information
```
:::note
If more than one curve, field, or hash is supplied, only the last one supplied will be built
:::
To build ICICLE libraries for all supported curves with G2 and ECNTT enabled:
```bash
./build.sh -curve=all -g2 -ecntt
```
If you wish to build for a specific curve, for example bn254, without G2 or ECNTT enabled:
```go
import (
	"github.com/stretchr/testify/assert"
	"testing"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)
...
```
To run all tests, for all curves:
```bash
go test ./... -count=1
```
If you wish to run tests for a specific curve:
```bash
go test ./tests -count=1 -failfast -timeout 60m -p 2 -v
```
Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
## Supported curves, fields and operations
### Supported curves and operations
| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
| --- | :---: | :---: | :---: | :---: | :---: |
| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
### Supported fields and operations
| Operation\Field | babybear |
| --- | :---: |
| VecOps | ✅ |
| Polynomials | ✅ |
| NTT | ✅ |
| Extension Field | ✅ |

# ECNTT
## ECNTT Method
The `ECNtt[T any]()` function performs the Elliptic Curve Number Theoretic Transform (EC-NTT) on the input points slice, using the provided dir (direction) and cfg (configuration), and stores the results in the results slice.
func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
```
### Parameters
- **`points`**: A slice of elliptic curve points (in projective coordinates) that will be transformed. The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
- **`dir`**: The direction of the EC-NTT transform, either `core.KForward` or `core.KInverse`.
- **`cfg`**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **`results`**: A slice that will store the transformed elliptic curve points (in projective coordinates). The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
### Return Value
- **`CudaError`**: A `core.IcicleError` value, which will be `core.IcicleErrorCode(0)` if the EC-NTT operation was successful, or an error if something went wrong.
```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)

func main() {
	// point generation and configuration setup are elided in this excerpt;
	// e is the value returned by ECNtt(points, dir, &cfg, results)
	var e core.IcicleError
	if e.CudaErrorCode != cr.CudaSuccess {
		panic("ECNTT operation failed")
	}
}
```

# Keccak
## Keccak Example
```go
package main

import (
	"encoding/hex"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/hash/keccak"
)

func createHostSliceFromHexString(hexString string) core.HostSlice[uint8] {
	byteArray, err := hex.DecodeString(hexString)
	if err != nil {
		panic("Not a hex string")
	}
	return core.HostSliceFromElements([]uint8(byteArray))
}

func main() {
	input := createHostSliceFromHexString("1725b6")
	outHost256 := make(core.HostSlice[uint8], 32)
	cfg := keccak.GetDefaultKeccakConfig()

	e := keccak.Keccak256(input, int32(input.Len()), 1, outHost256, &cfg)
	if e.CudaErrorCode != cr.CudaSuccess {
		panic("Keccak256 hashing failed")
	}

	outHost512 := make(core.HostSlice[uint8], 64)
	e = keccak.Keccak512(input, int32(input.Len()), 1, outHost512, &cfg)
	if e.CudaErrorCode != cr.CudaSuccess {
		panic("Keccak512 hashing failed")
	}

	numberOfBlocks := 3
	outHostBatch256 := make(core.HostSlice[uint8], 32*numberOfBlocks)
	e = keccak.Keccak256(input, int32(input.Len()/numberOfBlocks), int32(numberOfBlocks), outHostBatch256, &cfg)
	if e.CudaErrorCode != cr.CudaSuccess {
		panic("Keccak256 batch hashing failed")
	}
}
```
## Keccak Methods
```go
func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
```
### Parameters
- **`input`**: A slice containing the input data for the Keccak256 hash function. It can reside in either host memory or device memory.
- **`inputBlockSize`**: An integer specifying the size of the input data for a single hash.
- **`numberOfBlocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
### Return Value
- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## KeccakConfig
The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```go
type KeccakConfig struct {
	Ctx                cr.DeviceContext
	areInputsOnDevice  bool
	areOutputsOnDevice bool
	IsAsync            bool
}
```
### Fields
- **`Ctx`**: Device context containing details like device id and stream.
- **`areInputsOnDevice`**: Indicates if input data is located on the device.
- **`areOutputsOnDevice`**: Indicates if output hash is stored on the device.
- **`IsAsync`**: If true, runs the Keccak256/Keccak512 operation asynchronously.
### Default Configuration
Use `GetDefaultKeccakConfig` to obtain a default configuration, which can then be customized as needed.
```go
func GetDefaultKeccakConfig() KeccakConfig
```

To understand the theory behind the MSM precomputation technique, refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
## Core package
### MSM PrecomputeBases
`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
#### Description
```go
package main

import (
	"log"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// the PrecomputeBases call and its setup are elided in this excerpt
}
```

The `G2PrecomputeBases` example has the same shape, using the `g2` package:

```go
package main

import (
	"log"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)

func main() {
	// the G2PrecomputeBases call and its setup are elided in this excerpt
}
```

# MSM
## MSM Example
```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	// Obtain the default MSM configuration.
	cfg := core.GetDefaultMSMConfig()

	// Define the size of the problem, here 2^18.
	size := 1 << 18
	// ... intermediate lines of the example are elided in this excerpt ...

	cfg.IsAsync = true

	// Perform the MSM operation.
	e = bn254_msm.Msm(scalars, points, &cfg, out)

	if e != cr.CudaSuccess {
		panic(e)
	}
}
```

Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be customized as needed.

```go
func GetDefaultMSMConfig() MSMConfig
```
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.Ctx.IsBigTriangle = false` will activate Bucket accumulation.
To activate G2 support, first you must make sure you are building the static libraries with the G2 feature enabled, as described in the [Golang building instructions](../golang-bindings.md#using-icicle-golang-bindings-in-your-project).
Now you may import the `g2` package of the specified curve.
```go
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)
```
This package includes `G2Projective` and `G2Affine` points as well as a `G2Msm` method.

```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)

func main() {
	cfg := core.GetDefaultMSMConfig()
	size := 1 << 12
	batchSize := 3
	totalSize := size * batchSize
	// ... the rest of the example is elided in this excerpt ...
}
```

To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of documentation.
Here we will cover the core multi GPU apis and an [example](#a-multi-gpu-example)
## A Multi GPU example
In this example we will display how you can:
2. For every GPU launch a thread and set an active device per thread.
3. Execute an MSM on each GPU
```go
package main

import (
	"fmt"
	"sync"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// ... the rest of the example is elided in this excerpt ...
}
```

To streamline device management we offer, as part of the `cuda_runtime` package, methods for dealing with devices.

### RunOnDevice
Runs a given function on a specific GPU device, ensuring that all CUDA calls within the function are executed on the selected device.
In Go, most concurrency can be done via Goroutines. However, there is no guarantee that a goroutine stays on a specific host thread.
`RunOnDevice` was designed to solve this caveat and ensure that the goroutine will stay on a specific host thread.
`RunOnDevice` locks a goroutine into a specific host thread, sets a current GPU device, runs a provided function, and unlocks the goroutine from the host thread after the provided function finishes.
While the goroutine is locked to the host thread, the Go runtime will not assign other goroutines to that host thread.
**Parameters:**
**Behavior:**
- The function `funcToRun` is executed in a new goroutine that is locked to a specific OS thread to ensure that all CUDA calls within the function target the specified device.
:::note
Any goroutines launched within `funcToRun` are not automatically bound to the same GPU device. If necessary, `RunOnDevice` should be called again within such goroutines with the same `deviceId`.
:::
**Example:**

```go
RunOnDevice(0, func(args ...any) {
	// any CUDA calls made in this function will target device 0
})
```

### SetDevice
Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
:::warning
This function should not be used directly in conjunction with goroutines. If you want to run multi-gpu scenarios with goroutines you should use [RunOnDevice](#runondevice)
:::
**Parameters:**
- **`device int`**: The ID of the device to set as the current device.
Retrieves the device associated with a given pointer.
- **`int`**: The device ID associated with the memory pointed to by `ptr`.
This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.

# NTT
## NTT Example
```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"

	"github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)

func init() {
	cfg := bn254.GetDefaultNttConfig()
	initDomain(18, cfg)
}

func initDomain[T any](largestTestSize int, cfg core.NTTConfig[T]) core.IcicleError {
	rouMont, _ := fft.Generator(uint64(1 << largestTestSize))
	rou := rouMont.Bits()
	rouIcicle := bn254.ScalarField{}
	rouIcicle.FromLimbs(rou[:])
	e := bn254.InitDomain(rouIcicle, cfg.Ctx, false)
	return e
}

func main() {
	// Obtain the default NTT configuration with a predefined coset generator.
	cfg := bn254.GetDefaultNttConfig()

	// Define the size of the input scalars.
	size := 1 << 18

	// Generate scalars for the NTT operation.
	scalars := bn254.GenerateScalars(size)

	// Set the direction of the NTT (forward or inverse).
	dir := core.KForward

	// Allocate memory for the results of the NTT operation.
	results := make(core.HostSlice[bn254.ScalarField], size)

	// Perform the NTT operation.
	err := bn254.Ntt(scalars, dir, &cfg, results)
	if err.CudaErrorCode != cr.CudaSuccess {
		panic("NTT operation failed")
	}
}
```
```go
func example() {
	cfg := GetDefaultNttConfig()
	err := ReleaseDomain(cfg.Ctx)
	if err != nil {
		// Handle the error
	}
}
```

# Vector Operations
## Overview
Icicle exposes a number of vector operations which a user can use:
* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix.
## VecOps API Documentation
### Example
#### Vector addition
```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// ... the rest of the example is elided in this excerpt ...
}
```

```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// ... the rest of the example is elided in this excerpt ...
}
```

```go
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// ... the rest of the example is elided in this excerpt ...
}
```

```go
if err.IcicleErrorCode != core.IcicleErrorCode(0) {
	// ...
}
```
In this example, the `TransposeMatrix` function is used to transpose a 5x4 matrix stored in a 1D slice. The input and output slices are stored on the host (CPU), and the operation is executed synchronously.

#### ICICLE Golang
The Golang bindings require compiling ICICLE Core first. We supply a [build script](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/build.sh) to help build what you need.
Script usage:
```sh
./build.sh [-curve=<curve>] [-field=<field>] [-hash=<hash>] [-cuda_version=<version>] [-g2] [-ecntt] [-devmode]
curve - The name of the curve to build or "all" to build all supported curves
field - The name of the field to build or "all" to build all supported fields
hash - The name of the hash to build or "all" to build all supported hashes
-g2 - Optional - build with G2 enabled
-ecntt - Optional - build with ECNTT enabled
-devmode - Optional - build in devmode
```
:::note
If more than one curve, field, or hash is supplied, only the last one supplied will be built
:::
Once the library has been built, you can use and test the Golang bindings.
To test a specific curve, field or hash, change to its directory and then run:
```sh
go test ./tests -count=1 -failfast -timeout 60m -p 2 -v
```
You will be able to see each test that runs, how long it takes, and whether it passed or failed.
### Running ICICLE examples
Read through the compile.sh and CMakeLists.txt to understand how to link your own code.
:::
#### Running with Docker
In each example directory, ZK-container files are located in a subdirectory `.devcontainer`.
Inside the container you can run the same commands:

```sh
./run.sh
```
You can now experiment with our other examples, perhaps try to run a rust or golang example next.

:::info
If you are looking for the Multi GPU API documentation refer [here](./rust-bindings/multi-gpu.md) for Rust and [here](./golang-bindings/multi-gpu.md) for Golang.
:::
One common challenge with Zero-Knowledge computation is managing the large input sizes involved.
Multi-GPU programming involves developing software to operate across multiple GPU devices. Let's first explore different approaches to Multi-GPU programming, then we will cover how ICICLE allows you to easily develop your ZK computations to run across many GPUs.
## Approaches to Multi GPU programming
There are many [different strategies](https://github.com/NVIDIA/multi-gpu-programming-models) available for implementing multi GPU; however, they can be split into two categories.
### GPU Server approach
This approach usually involves a single or multiple CPUs opening threads to read / write from multiple GPUs. You can think about it as a scaled up HOST - Device model.
@@ -23,8 +22,7 @@ This approach usually involves a single or multiple CPUs opening threads to read
This approach won't let us tackle larger computation sizes but it will allow us to compute multiple computations which we wouldn't be able to load onto a single GPU.
For example, let's say that you had to compute two MSMs of size 2^26 on a 16GB VRAM GPU; you would normally have to perform them asynchronously. However, if you double the number of GPUs in your system, you can now run them in parallel.
### Inter GPU approach
This approach involves a more sophisticated form of multi GPU computation.
This approach requires redesigning the algorithm at the software level to be compatible with splitting amongst devices. In some cases, to lower latency to a minimum, special inter GPU connections would be installed on a server to allow direct communication between multiple GPUs.
## Writing ICICLE Code for Multi GPUs
The approach we have taken for the moment is a GPU Server approach; we assume you have a machine with multiple GPUs and you wish to run some computation on each GPU.
To dive deeper and learn about the API, check out the docs for our different ICICLE APIs:
- [Rust Multi GPU APIs](./rust-bindings/multi-gpu.md)
- [Golang Multi GPU APIs](./golang-bindings/multi-gpu.md)
- C++ Multi GPU APIs
## Best practices
- Never hardcode device IDs; if you want your software to take advantage of all GPUs on a machine, use methods such as `get_device_count` to support an arbitrary number of GPUs.
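A minimal sketch of the one-thread-per-device pattern this advice implies. Here `get_device_count` is a placeholder for a real device query (e.g. CUDA's `cudaGetDeviceCount`), and `run_on_all_devices` is a hypothetical helper, not an ICICLE API:

```cpp
#include <cassert>
#include <thread>
#include <vector>

// Placeholder for a real device query such as CUDA's cudaGetDeviceCount.
int get_device_count() { return 4; }

// Run `work(device_id)` on one host thread per detected device,
// instead of hardcoding device IDs.
template <typename F>
std::vector<int> run_on_all_devices(F work) {
  int device_count = get_device_count();
  std::vector<int> results(device_count);
  std::vector<std::thread> threads;
  for (int id = 0; id < device_count; ++id)
    threads.emplace_back([&, id] { results[id] = work(id); });
  for (auto& t : threads) t.join();
  return results;
}
```

With a real backend, `work` would first bind its thread to the device (e.g. `cudaSetDevice(id)`) before launching the computation.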
Multi GPU support should work with ZK-Containers by simply defining which devices the container should use:

```sh
docker run -it --gpus '"device=0,2"' zk-container-image
```
If you wish to expose all GPUs:
```sh
docker run --gpus all zk-container-image

```

---

[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
[ICICLE](https://github.com/ingonyama-zk/icicle) is a cryptography library for ZK using GPUs. ICICLE implements blazing fast cryptographic primitives such as EC operations, MSM, NTT, Poseidon hash and more on GPU.
ICICLE allows developers with minimal GPU experience to effortlessly accelerate their ZK application; from our experiments, even the most naive implementation may yield 10X improvement in proving times.
We understand that not all developers have access to a GPU and we don't want this to limit anyone from developing with ICICLE.
Here are some ways we can help you gain access to GPUs:
:::note
If none of the following options suit your needs, contact us on [telegram](https://t.me/RealElan) for assistance. We're committed to ensuring that a lack of a GPU doesn't become a bottleneck for you. If you need help with setup or any other issues, we're here to help you.
:::
### Grants
At Ingonyama we are interested in accelerating the progress of ZK and cryptography. If you are an engineer, developer, or academic researcher, we invite you to check out [our grant program](https://www.ingonyama.com/blog/icicle-for-researchers-grants-challenges). We will give you access to GPUs and even pay you to do your dream research!
### Google Colab
This is a great way to get started with ICICLE instantly. Google Colab offers free GPU access to an NVIDIA T4 instance with 16 GB of memory, which should be enough for experimenting and even prototyping with ICICLE.
For an extensive guide on how to setup Google Colab with ICICLE refer to [this article](./colab-instructions.md).
### Vast.ai
[Vast.ai](https://vast.ai/) is a global GPU marketplace where you can rent many different types of GPUs by the hour for [competitive pricing](https://vast.ai/pricing). They provide on-demand and interruptible rentals depending on your need or use case; you can learn more about their rental types [here](https://vast.ai/faq#rental-types).
## What can you do with ICICLE?
[ICICLE](https://github.com/ingonyama-zk/icicle) can be used in the same way you would use any other cryptography library. While developing and integrating ICICLE into many proof systems, we found some use case categories:

---

## Key Features
### Backend Agnostic Architecture
Our API is structured to be independent of any specific computational backend. While a CUDA backend is currently implemented, the architecture facilitates easy integration of additional backends. This capability allows users to perform polynomial operations without the need to tailor their code to specific hardware, enhancing code portability and scalability.
### Templating in the Polynomial API
In this template:
- **`Image`**: Defines the type of the output values of the polynomial. This is typically the same as the coefficients.
#### Default instantiation
```cpp
extern template class Polynomial<scalar_t>;
```
#### Extended use cases
The templated nature of the Polynomial API also supports more complex scenarios. For example, coefficients and images could be points on an elliptic curve (EC points), which are useful in cryptographic applications and advanced algebraic structures. This approach allows the API to be extended easily to support new algebraic constructions without modifying the core implementation.
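As a toy illustration of the `Coeff`/`Domain`/`Image` templating idea (a standalone sketch, not the ICICLE `Polynomial` class itself), all three types default to the coefficient type but can be instantiated independently:

```cpp
#include <cassert>
#include <vector>

// Toy polynomial with the Coeff/Domain/Image type split described above.
// Domain and Image default to Coeff, matching the common case where
// inputs, evaluation points, and outputs share one field type.
template <typename Coeff, typename Domain = Coeff, typename Image = Coeff>
class ToyPolynomial {
  std::vector<Coeff> coeffs_; // coeffs_[i] multiplies x^i
public:
  explicit ToyPolynomial(std::vector<Coeff> coeffs) : coeffs_(std::move(coeffs)) {}

  // Horner evaluation: f(x) = c0 + x*(c1 + x*(c2 + ...))
  Image operator()(const Domain& x) const {
    Image acc = Image{0};
    for (auto it = coeffs_.rbegin(); it != coeffs_.rend(); ++it)
      acc = acc * x + *it;
    return acc;
  }
};
```

Swapping `Image` for an elliptic-curve point type (with the appropriate scalar-multiplication operator) is the kind of extension the paragraph above refers to.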
### Supported Operations
The Polynomial class encapsulates a polynomial, providing a variety of operations:
- **Construction**: Create polynomials from coefficients or evaluations on roots-of-unity domains.
- **Arithmetic Operations**: Perform addition, subtraction, multiplication, and division.
- **Evaluation**: Directly evaluate polynomials at specific points or across a domain.
This section outlines how to use the Polynomial API in C++. Bindings for Rust and Go are detailed under the Bindings sections.
### Backend Initialization
Initialization with an appropriate factory is required to configure the computational context and backend.
```cpp
Polynomial::initialize(std::make_shared<CUDAPolynomialFactory>());
```
:::note
Initialization of a factory must be done per linked curve or field.
:::
### Construction
Polynomials can be constructed from coefficients, from evaluations on roots-of-unity domains, or by cloning existing polynomials.
```cpp
auto p_cloned = p.clone(); // p_cloned and p do not share memory
```
:::note
The coefficients or evaluations may be allocated either on host or device memory. In both cases the memory is copied to the backend device.
:::
### Arithmetic
Constructed polynomials can be used for various arithmetic operations:
```cpp
Polynomial operator%(const Polynomial& rhs) const; // returns remainder R(x)
Polynomial divide_by_vanishing_polynomial(uint64_t degree) const; // division by the vanishing polynomial V(x)=X^N-1
```
#### Example
Given polynomials $A(x)$, $B(x)$, $C(x)$ and the vanishing polynomial $V(x)$:
$$
H(x) = \frac{A(x) \cdot B(x) - C(x)}{V(x)}
$$

```cpp
auto H = (A*B-C).divide_by_vanishing_polynomial(N);
```
### Evaluation
Evaluate polynomials at arbitrary domain points, across a domain or on a roots-of-unity domain.
```cpp
Image operator()(const Domain& x) const; // evaluate f(x)
void evaluate(const Domain* x, Image* evals /*OUT*/) const;
void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const; // caller allocates memory
```
Example:

```cpp
uint64_t domain_size = ...;
auto domain = /*build domain*/; // host or device memory
auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
f.evaluate_on_domain(domain, domain_size, evaluations);
// evaluate f(x) on roots of unity domain
uint64_t domain_log_size = ...;
auto evaluations_rou_domain = std::make_unique<scalar_t[]>(1 << domain_log_size); // can be device memory too
f.evaluate_on_rou_domain(domain_log_size, evaluations_rou_domain);
```
:::note
For special domains such as roots of unity this method is not the most efficient, for two reasons:
- The domain of size N must be built.
- The implementation does not attempt to identify this special domain.
Therefore the computation is typically $O(n^2)$ rather than $O(n \log n)$.
See the 'device views' section for more details.
:::
### Manipulations
Beyond arithmetic, the API supports efficient polynomial manipulations:
#### Monomials
```cpp
// Monomial operations
Polynomial& add_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
Polynomial& sub_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
```
The ability to add or subtract monomials directly and in-place is an efficient way to manipulate polynomials.
Example:
```cpp
f.add_monomial_inplace(scalar_t::from(5)); // f(x) += 5
f.sub_monomial_inplace(scalar_t::from(3), 8); // f(x) -= 3x^8
```
#### Computing the degree of a Polynomial
```cpp
// Degree computation
int64_t degree();
```
The degree of a polynomial is a fundamental characteristic that describes the highest power of the variable in the polynomial expression with a non-zero coefficient.
The `degree()` function in the API returns the degree of the polynomial, corresponding to the highest exponent with a non-zero coefficient.
- For the polynomial $f(x) = x^5 + 2x^3 + 4$, the degree is 5 because the highest power of $x$ with a non-zero coefficient is 5.
- For a scalar value such as a constant term (e.g., $f(x) = 7$), the degree is considered 0, as it corresponds to $x^0$.
- The degree of the zero polynomial, $f(x) = 0$, where there are no non-zero coefficients, is defined as -1. This special case often represents an "empty" or undefined state in many mathematical contexts.
Example:
```cpp
auto f = /*some expression*/;
auto degree_of_f = f.degree();
```
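The degree conventions above (highest non-zero index, 0 for non-zero constants, -1 for the zero polynomial) can be sketched with a standalone toy helper, not the ICICLE API:

```cpp
#include <cassert>
#include <vector>

// Degree convention from the text: highest index with a non-zero
// coefficient; 0 for non-zero constants; -1 for the zero polynomial.
long long toy_degree(const std::vector<long long>& coeffs) {
  for (long long i = (long long)coeffs.size() - 1; i >= 0; --i)
    if (coeffs[i] != 0) return i;
  return -1;
}
```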
#### Slicing
```cpp
// Slicing and selecting even or odd components.
Polynomial slice(uint64_t offset, uint64_t stride, uint64_t size = 0 /*0 means take all elements*/);
Polynomial odd();
```
The Polynomial API provides methods for slicing polynomials and selecting specific components, such as even or odd indexed terms. Slicing allows extracting specific sections of a polynomial based on an offset, stride, and size.
The following examples demonstrate folding a polynomial's even and odd parts and arbitrary slicing:
```cpp
// folding a polynomial's even and odd parts with randomness
auto x = rand();
auto first_quarter = f.slice(0 /*offset*/, 1 /*stride*/, f.degree()/4 /*size*/);
```
### Memory access (copy/view)
Access to the polynomial's internal state can be vital for operations like commitment schemes or when more efficient custom operations are necessary. This can be done either by copying or viewing the polynomial.
#### Copying
Copies the polynomial coefficients to either host or device allocated memory.
:::note
Copying to host memory is backend agnostic while copying to device memory requires the memory to be allocated on the corresponding backend.
:::
```cpp
uint64_t copy_coeffs(Coeff* coeffs, uint64_t start_idx, uint64_t end_idx) const;
```
Example:
```cpp
auto coeffs_device = /*allocate CUDA or host memory*/
f.copy_coeffs(coeffs_device, 0/*start*/, f.degree());
auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);
```
#### Views
The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
##### What is a Memory View?
A memory view is essentially a pointer to data stored in device memory.
Memory views are extremely versatile and can be employed in various computational contexts such as:
- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM).
- **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.
##### Obtaining and Using Views
To create and use views within the Polynomial API, functions are provided to obtain them:
```cpp
// Obtain a view of the polynomial's coefficients
std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
```
Example usage:
```cpp
gpu_accelerated_function(coeffs_view.get(),...);
```
##### Integrity-Pointer: Managing Memory Views
Within the Polynomial API, memory views are managed through a specialized tool called the Integrity-Pointer. This pointer type is designed to safeguard operations by monitoring the validity of the memory it points to. It can detect if the memory has been modified or released, thereby preventing unsafe access to stale or non-existent data.
The Integrity-Pointer not only acts as a regular pointer but also provides additional functionality to ensure the integrity of the data it references. Here are its key features:
```cpp
if (coeff_view.isValid()) {
  // the underlying memory is still valid and safe to use
}
```
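One plausible mechanism behind such validity tracking is a version counter snapshotted by the view; the toy sketch below illustrates that general technique and is an assumption, not ICICLE's actual implementation:

```cpp
#include <cassert>
#include <memory>

// Toy owner: any mutation bumps a shared modification counter,
// invalidating outstanding views.
struct ToyOwner {
  std::shared_ptr<unsigned> version = std::make_shared<unsigned>(0);
  void modify() { ++*version; }
};

// Toy integrity pointer: valid only while the owner's counter still
// matches the snapshot taken at view creation.
template <typename T>
class ToyIntegrityPointer {
  T* ptr_;
  std::shared_ptr<const unsigned> version_;
  unsigned snapshot_;
public:
  ToyIntegrityPointer(T* p, std::shared_ptr<const unsigned> v)
      : ptr_(p), version_(std::move(v)), snapshot_(*version_) {}
  bool isValid() const { return *version_ == snapshot_; }
  T* get() const { return isValid() ? ptr_ : nullptr; } // nullptr when stale
};
```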
#### Evaluations View: Accessing Polynomial Evaluations Efficiently
The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
:::warning
Requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
:::
```cpp
// Assume a polynomial `p` of degree N
auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
// Use the evaluations view to perform further computations or visualizations
process_polynomial_evaluations(evals_view.get(), size, device_id);
```
## Multi-GPU Support with CUDA Backend
```cpp
cudaSetDevice(int deviceID);
```
This function sets the active CUDA device. All subsequent operations that allocate or deal with polynomial data will be performed on this device.
### Allocation Consistency
Polynomials are always allocated on the current CUDA device at the time of their creation. It is crucial to ensure that the device context is correctly set before initiating any operation that involves memory allocation:
```cpp
// Set the device before creating polynomials
cudaSetDevice(0);
Polynomial p2 = Polynomial::from_coefficients(coeffs, size);
```
### Matching Devices for Operations
When performing operations that result in the creation of new polynomials (such as addition or multiplication), it is imperative that both operands are on the same CUDA device. If the operands reside on different devices, an exception is thrown:
```cpp
auto p3 = p1 + p2; // Throws an exception if p1 and p2 are not on the same device
```
### Device-Agnostic Operations
Operations that do not involve the creation of new polynomials, such as computing the degree of a polynomial or performing in-place modifications, can be executed regardless of the current device setting:
```cpp
// 'degree' and in-place operations do not require device matching
int deg = p1.degree();
p1 += p2; // Valid if p1 and p2 are on the same device, throws otherwise
```
### Error Handling
The API is designed to throw exceptions if operations are attempted across polynomials that are not located on the same GPU. This ensures that all polynomial operations are performed consistently and without data integrity issues due to device mismatches.
### Best Practices
To maximize the performance and avoid runtime errors in a multi-GPU setup, always ensure that:
- The CUDA device is set correctly before polynomial allocation.

---

# Keccak
[Keccak](https://keccak.team/files/Keccak-implementation-3.2.pdf) is a cryptographic hash function designed by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van Assche. It was selected as the winner of the NIST hash function competition, becoming the basis for the [SHA-3 standard](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf).
Keccak operates on a message input of any length and produces a fixed-size hash output. The hash function is built upon the sponge construction, which involves absorbing the input data followed by squeezing out the hash value.
At its core, Keccak consists of a permutation function operating on a state array. The permutation function employs a round function that operates iteratively on the state array. Each round consists of five main steps:
- **Theta:** This step introduces diffusion by performing a bitwise XOR operation between the state and a linear combination of its neighboring columns.
- **Rho:** This step performs bit rotation operations on each lane of the state array.
- **Pi:** This step rearranges the positions of the lanes in the state array.
- **Chi:** This step applies a nonlinear mixing operation to each lane of the state array.
- **Iota:** This step introduces a round constant to the state array.
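As a concrete example, the Theta step can be sketched directly from its description above, on the standard Keccak-f[1600] state of 5x5 64-bit lanes (a standalone illustration, not the ICICLE kernel):

```cpp
#include <cassert>
#include <cstdint>

// 5x5 state of 64-bit lanes, indexed A[x][y].
using State = uint64_t[5][5];

static uint64_t rotl64(uint64_t v, unsigned r) {
  return (v << r) | (v >> (64 - r));
}

// Theta: XOR each lane with a parity term derived from the two
// neighboring columns (C[x-1] and C[x+1] rotated by one bit).
void theta(State A) {
  uint64_t C[5], D[5];
  for (int x = 0; x < 5; ++x)
    C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4]; // column parities
  for (int x = 0; x < 5; ++x)
    D[x] = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);
  for (int x = 0; x < 5; ++x)
    for (int y = 0; y < 5; ++y)
      A[x][y] ^= D[x];
}
```

The remaining steps (Rho, Pi, Chi, Iota) follow the same state-array shape, with Chi contributing the only non-linear mixing.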
## Using Keccak
ICICLE Keccak supports batch hashing, which can be utilized for constructing a Merkle tree.
### Supported Bindings
- [Golang](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/hash/keccak)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)

---

You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).
## Supported curves
MSM supports the following curves:
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
## Supported Bindings
- [Golang](../golang-bindings/msm.md)
#### When should I use Large triangle accumulation?
The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.
## MSM Modes
ICICLE MSM also supports two different modes: `Batch MSM` and `Single MSM`.
Batch MSM allows you to run many MSMs with a single API call while single MSM will launch a single MSM computation.
### Which mode should I use?
This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs simultaneously, leveraging the parallel processing capabilities of GPUs.
Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.

---

$$
A_k = \sum_{n=0}^{N-1} a_n \cdot \omega^{nk} \mod p
$$
where:
- $N$ is the size of the input sequence and is a power of 2,
- $p$ is a prime number such that $p = kN + 1$ for some integer $k$, ensuring that $p$ supports the existence of $N$th roots of unity,
- $\omega$ is a primitive $N$th root of unity modulo $p$, meaning $\omega^N \equiv 1 \mod p$ and no smaller positive power of $\omega$ is congruent to 1 modulo $p$,
- $k$ ranges from 0 to $N-1$, and it indexes the output sequence.
NTT is particularly useful because it enables efficient polynomial multiplication under modulo arithmetic, crucial for algorithms in cryptographic protocols and other areas requiring fast modular arithmetic operations.
There also exists INTT, the inverse operation of NTT. INTT takes as input the output sequence of integers from an NTT and reconstructs the original sequence.
## Using NTT
### Supported curves
NTT supports the following curves:
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
### Supported Bindings
- [Golang](../golang-bindings/ntt.md)
- [Rust](../rust-bindings/ntt.md)
NTT also supports two different modes: `Batch NTT` and `Single NTT`.
Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
#### Single NTT
Single NTT will launch a single NTT computation.
Choose this mode when your application requires processing individual NTT operations in isolation.
#### Batch NTT Mode
Batch NTT allows you to run many NTTs with a single API call. Batch NTT mode can significantly reduce read/write times as well as computation overhead by executing multiple NTT operations in parallel. Batch mode may also offer better utilization of computational resources (memory and compute).
## Supported algorithms
At its core, the Radix-2 NTT algorithm divides the problem into smaller sub-problems:

2. **Decomposition:**
The algorithm recursively divides the input sequence into smaller sequences. At each step, it separates the sequence into even-indexed and odd-indexed elements, forming two subsequences that are then processed independently.
3. **Butterfly Operations:**
The core computational element of the Radix-2 NTT is the "butterfly" operation, which combines pairs of elements from the sequences obtained in the decomposition step.
Each butterfly operation involves multiplication by a "twiddle factor," which is a root of unity in the finite field, and addition or subtraction of the results, all performed modulo the prime modulus.
$$
X_k = (A_k + W^k \cdot B_k) \mod p
$$
$$
X_{k+\frac{N}{2}} = (A_k - W^k \cdot B_k) \mod p
$$

where:
$A_k$ - The even-indexed element of the $k$-th input pair
$B_k$ - The odd-indexed element of the $k$-th input pair
$W$ - The twiddle factor, a root of unity in the finite field
$p$ - The prime modulus
$k$ - The index of the current operation within the butterfly or the transform stage
The twiddle factors are precomputed to save runtime and improve performance.
4. **Bit-Reversal Permutation:**
The outputs of the butterfly stages emerge in bit-reversed index order, so a final bit-reversal permutation is applied to restore the natural ordering of the output sequence.
### Mixed Radix
The Mixed Radix NTT algorithm extends the concepts of the Radix-2 algorithm by allowing the decomposition of the input sequence based on various factors of its length. Specifically, ICICLE's implementation splits the input into blocks of sizes 16, 32, or 64, compared to Radix-2 which always splits such that we end with an NTT of size 2. This approach offers enhanced flexibility and efficiency, especially for input sizes that are composite numbers, by leveraging the "divide and conquer" strategy across multiple radices.
The NTT blocks in Mixed Radix are implemented more efficiently based on the Winograd NTT, and memory and register usage is also better optimized compared to Radix-2.
Mixed Radix can reduce the number of stages required to compute for large inputs.

1. **Input:**
The input to the Mixed Radix NTT is a sequence of integers $a_0, a_1, \ldots, a_{N-1}$, where $N$ is not strictly required to be a power of two. Instead, $N$ can be any composite number, ideally factorized into primes or powers of primes.
2. **Factorization and Decomposition:**
Unlike the Radix-2 algorithm, which strictly divides the computational problem into halves, the Mixed Radix NTT algorithm implements a flexible decomposition approach which isn't limited to prime factorization.
For example, an NTT of size 256 can be decomposed into two stages of $16 \times \text{NTT}_{16}$, leveraging a composite factorization strategy rather than decomposing into eight stages of $\text{NTT}_{2}$. This exemplifies the use of composite factors (in this case, $256 = 16 \times 16$) to apply smaller NTT transforms, optimizing computational efficiency by adapting the decomposition strategy to the specific structure of $N$.
3. **Butterfly Operations with Multiple Radices:**
The Mixed Radix algorithm utilizes butterfly operations for various radix sizes. Each sub-transform involves specific butterfly operations characterized by multiplication with twiddle factors appropriate for the radix in question.
The generalized butterfly operation for a radix-$r$ element can be expressed as:
$$
X_{k,r} = \sum_{j=0}^{r-1} (A_{j,k} \cdot W^{jk}) \mod p
$$
where:
$X_{k,r}$ - is the output of the radix-$r$ butterfly operation for the $k$-th set of inputs
$A_{j,k}$ - represents the $j$-th input element for the $k$-th operation
$W$ - is the twiddle factor
$p$ - is the prime modulus
4. **Recombination and Reordering:**
After applying the appropriate butterfly operations across all decomposition levels, the Mixed Radix algorithm recombines the results into a single output sequence. Due to the varied sizes of the sub-transforms, a more complex reordering process may be required compared to Radix-2. This involves digit-reversal permutations to ensure that the final output sequence is correctly ordered.
Mixed Radix, on the other hand, works better for larger NTTs with larger input sizes.
Performance really depends on $\log n$ size, batch size, ordering, inverse, coset, coefficient field, and which GPU you are using.
For this reason we implemented our [heuristic auto-selection](https://github.com/ingonyama-zk/icicle/blob/main/icicle/src/ntt/ntt.cu#L573) which should choose the most efficient algorithm in most cases.
We still recommend you benchmark for your specific use case if you think a different configuration would yield better results.

---

This section of the documentation is dedicated to the ICICLE primitives:
- [MSM](./msm.md)
- [NTT](./ntt.md)
- [Keccak Hash](./keccak.md)
- [Poseidon Hash](./poseidon.md)

---

Poseidon has been used in many popular ZK protocols such as Filecoin and Plonk.
Our implementation of Poseidon is implemented in accordance with the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/).
Let's understand how Poseidon works.
## Initialization
Poseidon starts with the initialization of its internal state, which is composed of the input elements and some pre-generated constants. An initial round constant is added to each element of the internal state. Adding the round constants ensures the state is properly mixed from the beginning.
This is done to prevent collisions and to prevent certain cryptographic attacks by ensuring that the internal state is sufficiently mixed and unpredictable.
![Poseidon initialization of internal state added with pre-generated round constants](https://github.com/ingonyama-zk/icicle/assets/122266060/52257f5d-6097-47c4-8f17-7b6449b9d162)
## Applying full and partial rounds
To generate a secure hash output, the algorithm goes through a series of "full rounds" and "partial rounds" as well as transformations between these sets of rounds in the following order:
```First full rounds -> apply S-box and Round constants -> partial rounds -> Last full rounds -> Apply S-box```
### Full rounds
![Full round iterations consisting of S box operations, adding round constants, and a Full MDS matrix multiplication](https://github.com/ingonyama-zk/icicle/assets/122266060/e4ce0e98-b90b-4261-b83e-3cd8cce069cb)
**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The S-box involves raising each element of the state to a certain power denoted by `α`, a member of the finite field defined by the prime `p`; `α` can differ depending on the implementation and user configuration.
**Linear Transformation:** After applying the S-box, a linear transformation is performed on the state. This involves multiplying the state by an MDS (Maximum Distance Separable) matrix, which further diffuses the transformations applied by the S-box across the entire state.
**Addition of Round Constants:** Each element of the state is then modified by adding a unique round constant. These constants are different for each round and are precomputed as part of the hash function's initialization. The addition of round constants ensures that even minor changes to the input produce significant differences in the output.
### Partial Rounds
![Partial round iterations consisting of selective S box operation, adding a round constant and performing an MDS multiplication with a sparse matrix](https://github.com/ingonyama-zk/icicle/assets/122266060/e8c198b4-7aa4-4b4d-9ec4-604e39e07692)
**Selective Application of S-box:** Partial rounds apply the S-box transformation to only one element of the internal state per round, rather than to all elements. This selective application significantly reduces the computational complexity of the hash function without compromising its security. The choice of which element to apply the S-box to can follow a specific pattern or be fixed, depending on the design of the hash function.
**Linear Transformation and Round Constants:** A linear transformation is performed and round constants are added. The linear transformation in partial rounds can be designed to be less computationally intensive (this is done by using a sparse matrix) than in full rounds, further optimizing the function's efficiency.
The user of Poseidon can often choose how many partial or full rounds to apply; more full rounds will increase security but degrade performance. The right balance is highly dependent on the use case.
## Using Poseidon
ICICLE Poseidon is implemented for the GPU, and parallelization is performed for each element of the state rather than for each state.
@@ -52,25 +51,20 @@ What that means is we calculate multiple hash-sums over multiple pre-images in p
So for Poseidon of arity 2 and an input of size 1024 * 2, we would expect 1024 elements of output, which means each block would be of size 2 and 1024 Poseidon hashes would be performed.
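As a sanity check, the batching arithmetic above can be sketched in plain Rust (the helper name is mine, not part of the ICICLE API; the real hashing is done by functions like `poseidon_hash_many`):

```rust
// Hypothetical helper: one Poseidon hash is produced per block of `arity` inputs.
fn poseidon_output_count(input_len: usize, arity: usize) -> usize {
    assert!(arity > 0 && input_len % arity == 0, "input must be a whole number of blocks");
    input_len / arity
}

fn main() {
    // Arity 2 with 1024 * 2 input elements yields 1024 hashes, as described above.
    assert_eq!(poseidon_output_count(1024 * 2, 2), 1024);
}
```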
### Supported Bindings
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)
### Constants
Poseidon is extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated and optimized constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
If you wish to generate your own constants, you can use our Python script, which can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants/generate_parameters.py).
Prerequisites:
- Install Python 3
- `pip install poseidon-hash`
- `pip install galois==0.3.7`
@@ -97,7 +91,7 @@ primitive_element = 7 # bls12-381
# primitive_element = 15 # bw6-761
```
We only support `alpha = 5`, so if you want to use another alpha for the S-box, please reach out on Discord or open a GitHub issue.
### Rust API
@@ -128,8 +122,7 @@ poseidon_hash_many::<F>(
`PoseidonConfig::default()` can be modified; for example, by default the inputs and outputs are set to be on the `Host`.
```rust
impl<'a> Default for PoseidonConfig<'a> {
fn default() -> Self {
let ctx = get_default_device_context();
@@ -174,11 +167,10 @@ let ctx = get_default_device_context();
)
.unwrap();
```
## The Tree Builder
The tree builder allows you to build Merkle trees using Poseidon.
You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.
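For intuition, the relationship between `height` and `arity` for a full tree can be sketched as follows (a plain-Rust illustration of the definitions above, not part of the ICICLE API):

```rust
// Since `height` counts all layers including the root and the leaf layer,
// a full tree has arity^(height - 1) leaves.
fn leaf_count(height: u32, arity: u64) -> u64 {
    arity.pow(height - 1)
}

fn main() {
    assert_eq!(leaf_count(1, 2), 1);        // a height-1 tree is just the root
    assert_eq!(leaf_count(21, 2), 1 << 20); // binary tree of height 21: 2^20 leaves
}
```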
@@ -206,9 +198,9 @@ Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig:
- `are_inputs_on_device`: Have the inputs been loaded to device memory?
- `is_async`: Should the TreeBuilder run asynchronously? `False` will block the current CPU thread. `True` will require you to call `cudaStreamSynchronize` or `cudaDeviceSynchronize` to retrieve the result.
### Benchmarks
We ran the Poseidon tree builder on:

**CPU**: 12th Gen Intel(R) Core(TM) i9-12900K
@@ -218,9 +210,8 @@ We ran the Poseidon tree builder on:
The benchmarks include copying data from and to the device.
| Rows to keep parameter | Run time, Icicle | Supranational PC2 |
| ----------- | ----------- | ----------- |
| 10 | 9.4 seconds | 13.6 seconds |
| 20 | 9.5 seconds | 13.6 seconds |
| 29 | 13.7 seconds | 13.6 seconds |


@@ -12,7 +12,7 @@ Rust bindings allow you to use ICICLE as a rust library.
Simply add the following to your `Cargo.toml`.
```toml
# GPU Icicle integration
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git" }
@@ -25,7 +25,7 @@ If you wish to point to a specific ICICLE branch add `branch = "<name_of_branch>
When you build your project ICICLE will be built as part of the build command.
## How do the rust bindings work?
The Rust bindings are just Rust wrappers for ICICLE Core static libraries, which can be compiled. We integrate the compilation of the static libraries into Rust's toolchain to make usage seamless and easy. This is achieved by [extending Rust's build command](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs).
@@ -55,3 +55,33 @@ fn main() {
println!("cargo:rustc-link-lib=cudart");
}
```
## Supported curves, fields and operations
### Supported curves and operations
| Operation\Curve | bn254 | bls12_377 | bls12_381 | bw6-761 | grumpkin |
| --- | :---: | :---: | :---: | :---: | :---: |
| MSM | ✅ | ✅ | ✅ | ✅ | ✅ |
| G2 | ✅ | ✅ | ✅ | ✅ | ❌ |
| NTT | ✅ | ✅ | ✅ | ✅ | ❌ |
| ECNTT | ✅ | ✅ | ✅ | ✅ | ❌ |
| VecOps | ✅ | ✅ | ✅ | ✅ | ✅ |
| Polynomials | ✅ | ✅ | ✅ | ✅ | ❌ |
| Poseidon | ✅ | ✅ | ✅ | ✅ | ✅ |
| Merkle Tree | ✅ | ✅ | ✅ | ✅ | ✅ |
### Supported fields and operations
| Operation\Field | babybear | stark252 |
| --- | :---: | :---: |
| VecOps | ✅ | ✅ |
| Polynomials | ✅ | ✅ |
| NTT | ✅ | ✅ |
| Extension Field | ✅ | ❌ |
### Supported hashes
| Hash | Sizes |
| --- | :---: |
| Keccak | 256, 512 |


@@ -1,9 +1,5 @@
# ECNTT
## ECNTT Method
The `ecntt` function computes the Elliptic Curve Number Theoretic Transform (EC-NTT) or its inverse on a batch of points of a curve.
@@ -25,7 +21,7 @@ where
## Parameters
- **`input`**: The input data as a slice of `Projective<C>`. This represents points on a specific elliptic curve `C`.
- **`dir`**: The direction of the NTT. It can be `NTTDir::kForward` for forward NTT or `NTTDir::kInverse` for inverse NTT.
- **`cfg`**: The NTT configuration object of type `NTTConfig<C::ScalarField>`. This object specifies parameters for the NTT computation, such as the batch size and algorithm to use.
- **`output`**: The output buffer to write the results into. This should be a slice of `Projective<C>` with the same size as the input.


@@ -0,0 +1,96 @@
# Keccak
## Keccak Example
```rust
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use icicle_hash::keccak::{keccak256, KeccakConfig};
use rand::{self, Rng};
fn main() {
let mut rng = rand::thread_rng();
let initial_data: Vec<u8> = (0..120).map(|_| rng.gen::<u8>()).collect();
println!("initial data: {}", hex::encode(&initial_data));
let input = HostSlice::<u8>::from_slice(initial_data.as_slice());
let mut output = DeviceVec::<u8>::cuda_malloc(32).unwrap();
let mut config = KeccakConfig::default();
keccak256(input, initial_data.len() as i32, 1, &mut output[..], &mut config).expect("Failed to execute keccak256 hashing");
let mut output_host = vec![0_u8; 32];
output.copy_to_host(HostSlice::from_mut_slice(&mut output_host[..])).unwrap();
println!("keccak256 result: {}", hex::encode(&output_host));
}
```
## Keccak Methods
```rust
pub fn keccak256(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut KeccakConfig,
) -> IcicleResult<()>
pub fn keccak512(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut KeccakConfig,
) -> IcicleResult<()>
```
### Parameters
- **`input`**: A slice containing the input data for the Keccak256 hash function. It can reside in either host memory or device memory.
- **`input_block_size`**: An integer specifying the size of the input data for a single hash.
- **`number_of_blocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
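A small sketch of the buffer-size bookkeeping these parameters imply (the helper is hypothetical, not part of the ICICLE API): the function reads `input_block_size * number_of_blocks` bytes from `input` and, for `keccak256`, writes a 32-byte digest per block.

```rust
// Hypothetical helper: total bytes read from `input` and written to `output`
// for a batched keccak256 call with the given parameters.
fn keccak256_buffer_sizes(input_block_size: usize, number_of_blocks: usize) -> (usize, usize) {
    let input_len = input_block_size * number_of_blocks; // bytes consumed from `input`
    let output_len = 32 * number_of_blocks;              // one 32-byte digest per block
    (input_len, output_len)
}

fn main() {
    // The example above hashes a single 120-byte block into one 32-byte digest.
    assert_eq!(keccak256_buffer_sizes(120, 1), (120, 32));
}
```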
### Return Value
- **`IcicleResult`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## KeccakConfig
The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```rust
pub struct KeccakConfig<'a> {
pub ctx: DeviceContext<'a>,
pub are_inputs_on_device: bool,
pub are_outputs_on_device: bool,
pub is_async: bool,
}
```
### Fields
- **`ctx`**: Device context containing details like device id and stream.
- **`are_inputs_on_device`**: Indicates if input data is located on the device.
- **`are_outputs_on_device`**: Indicates if output hash is stored on the device.
- **`is_async`**: If true, runs the Keccak256/Keccak512 operation asynchronously.
### Usage
Example initialization with default settings:
```rust
let default_config = KeccakConfig::default();
```
Customizing the configuration:
```rust
let custom_config = KeccakConfig {
ctx: custom_device_context,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
};
```


@@ -2,11 +2,7 @@
To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
## `precompute_bases`
Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
@@ -20,8 +16,7 @@ pub fn precompute_bases<C: Curve + MSM<C>>(
) -> IcicleResult<()>
```
### Parameters
- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.


@@ -1,9 +1,5 @@
# MSM
## Example
```rust
@@ -84,7 +80,7 @@ pub struct MSMConfig<'a> {
```
- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
- **`point_size: i32`**:
- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
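For intuition on `c`: in a Pippenger-style MSM the `bitsize`-bit scalars are split into windows of `c` bits, so a smaller `c` means more windows (more point additions), while a larger `c` means more buckets per window (more memory). A rough sketch of that relationship (the helper name is mine, not part of the ICICLE API):

```rust
// Number of c-bit windows needed to cover a bitsize-bit scalar (ceiling division).
fn window_count(bitsize: u32, c: u32) -> u32 {
    (bitsize + c - 1) / c
}

fn main() {
    // A 254-bit scalar field (e.g. bn254) with c = 16 needs 16 windows.
    assert_eq!(window_count(254, 16), 16);
}
```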
@@ -120,7 +116,6 @@ msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
@@ -136,7 +131,6 @@ msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
@@ -152,7 +146,7 @@ Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961
## Support for G2 group
MSM also supports the G2 group.
Using MSM in G2 requires a G2 config, and your points should also be G2 points.


@@ -1,10 +1,6 @@
# NTT
## Example
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
@@ -61,14 +57,13 @@ pub fn ntt<F>(
`ntt::ntt` expects:
- **`input`** - buffer to read the inputs of the NTT from.
- **`dir`** - whether to compute forward or inverse NTT.
- **`cfg`** - config used to specify extra arguments of the NTT.
- **`output`** - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
### NTT Config
```rust
@@ -107,8 +102,7 @@ The `NTTConfig` struct is a configuration object used to specify parameters for
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to use a specific algorithm.
#### Usage
@@ -134,7 +128,6 @@ let custom_config = NTTConfig {
};
```
### Modes
NTT supports two different modes: `Batch NTT` and `Single NTT`.
@@ -187,7 +180,7 @@ where
- **`IcicleResult<()>`**: Will return an error if the operation fails.
### Releasing the domain
The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
@@ -205,4 +198,3 @@ where
#### Returns
The function returns an `IcicleResult<()>`, which represents the result of the operation. If the operation is successful, the function returns `Ok(())`, otherwise it returns an error.


@@ -1,14 +1,16 @@
:::note
Please refer to the Polynomials overview page for a deep overview. This section is a brief description of the Rust FFI bindings.
:::

# Rust FFI Bindings for Univariate Polynomial
This documentation is designed to provide developers with a clear understanding of how to utilize the Rust bindings for polynomial operations efficiently and effectively, leveraging the robust capabilities of both Rust and C++ in their applications.
## Introduction
The Rust FFI bindings for the Univariate Polynomial serve as a "shallow wrapper" around the underlying C++ implementation. These bindings provide a straightforward Rust interface that directly calls functions from a C++ library, effectively bridging Rust and C++ operations. The Rust layer handles simple interface translations without delving into complex logic or data structures, which are managed on the C++ side. This design ensures efficient data handling, memory management, and execution of polynomial operations directly via C++.
Currently, these bindings are tailored specifically for polynomials where the coefficients, domain, and images are represented as scalar fields.
## Initialization Requirements
Before utilizing any functions from the polynomial API, it is mandatory to initialize the appropriate polynomial backend (e.g., CUDA). Additionally, the NTT (Number Theoretic Transform) domain must also be initialized, as the CUDA backend relies on this for certain operations. Failing to properly initialize these components can result in errors.
@@ -19,12 +21,12 @@ Before utilizing any functions from the polynomial API, it is mandatory to initi
The ICICLE library is structured such that each field or curve has its dedicated library implementation. As a result, initialization must be performed individually for each field or curve to ensure the correct setup and functionality of the library.
:::
## Core Trait: `UnivariatePolynomial`
The `UnivariatePolynomial` trait encapsulates the essential functionalities required for managing univariate polynomials in the Rust ecosystem. This trait standardizes the operations that can be performed on polynomials, regardless of the underlying implementation details. It allows for a unified approach to polynomial manipulation, providing a suite of methods that are fundamental to polynomial arithmetic.
### Trait Definition
```rust
pub trait UnivariatePolynomial
where
@@ -65,6 +67,9 @@ where
evals: &mut E,
);
// Method to evaluate the polynomial over the roots-of-unity domain for power-of-two sized domain
fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, domain_log_size: u64, evals: &mut E);
// Method to retrieve a coefficient at a specific index.
fn get_coeff(&self, idx: u64) -> Self::Field;
@@ -77,6 +82,7 @@ where
```
## `DensePolynomial` Struct
The DensePolynomial struct represents a dense univariate polynomial in Rust, leveraging a handle to manage its underlying memory within the CUDA device context. This struct acts as a high-level abstraction over complex C++ memory management practices, facilitating the integration of high-performance polynomial operations through Rust's Foreign Function Interface (FFI) bindings.
```rust
@@ -88,15 +94,19 @@ pub struct DensePolynomial {
### Traits implementation and methods
#### `Drop`
Ensures proper resource management by releasing the CUDA memory when a DensePolynomial instance goes out of scope. This prevents memory leaks and ensures that resources are cleaned up correctly, adhering to Rust's RAII (Resource Acquisition Is Initialization) principles.
#### `Clone`
Provides a way to create a new instance of a DensePolynomial with its own unique handle, thus duplicating the polynomial data in the CUDA context. Cloning is essential since the DensePolynomial manages external resources, which cannot be safely shared across instances without explicit duplication.
#### Operator Overloading: `Add`, `Sub`, `Mul`, `Rem`, `Div`
These traits are implemented for references to DensePolynomial (i.e., &DensePolynomial), enabling natural mathematical operations such as addition (+), subtraction (-), multiplication (*), division (/), and remainder (%). This syntactic convenience allows users to compose complex polynomial expressions in a way that is both readable and expressive.
#### Key Methods
In addition to the traits, the following methods are implemented:
```rust
@@ -107,16 +117,16 @@ impl DensePolynomial {
}
```
:::note Might be consolidated with `UnivariatePolynomial` trait
:::
## Flexible Memory Handling With `HostOrDeviceSlice`
The DensePolynomial API is designed to accommodate a wide range of computational environments by supporting both host and device memory through the `HostOrDeviceSlice` trait. This approach ensures that polynomial operations can be seamlessly executed regardless of where the data resides, making the API highly adaptable and efficient for various hardware configurations.
### Overview of `HostOrDeviceSlice`
The HostOrDeviceSlice is a Rust trait that abstracts over slices of memory that can either be on the host (CPU) or the device (GPU), as managed by CUDA. This abstraction is crucial for high-performance computing scenarios where data might need to be moved between different memory spaces depending on the operations being performed and the specific hardware capabilities available.
### Usage in API Functions
Functions within the DensePolynomial API that deal with polynomial coefficients or evaluations use the HostOrDeviceSlice trait to accept inputs. This design allows the functions to be agnostic of the actual memory location of the data, whether it's in standard system RAM accessible by the CPU or in GPU memory accessible by CUDA cores.
```rust
@@ -132,10 +142,13 @@ let p_from_evals = PolynomialBabyBear::from_rou_evals(&evals, evals.len());
```
## Usage
This section outlines practical examples demonstrating how to utilize the `DensePolynomial` Rust API. The API is flexible, supporting multiple scalar fields. Below are examples showing how to use polynomials defined over different fields and perform a variety of operations.
### Initialization and Basic Operations
First, choose the appropriate field implementation for your polynomial operations, initializing the CUDA backend if necessary
```rust
use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
@@ -151,10 +164,10 @@ use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
```
### Creation
Polynomials can be created from coefficients or evaluations:
```rust
// Assume F is the field type (e.g. icicle_bn254::curve::ScalarField or a type parameter)
let coeffs = ...;
let p_from_coeffs = PolynomialBabyBear::from_coeffs(HostSlice::from_slice(&coeffs), size);
@@ -164,6 +177,7 @@ let p_from_evals = PolynomialBabyBear::from_rou_evals(HostSlice::from_slice(&eva
```
### Arithmetic Operations
Utilize overloaded operators for intuitive mathematical expressions:
```rust
@@ -174,6 +188,7 @@ let mul_scalar = &f * &scalar; // Scalar multiplication
```
### Division and Remainder
Compute quotient and remainder or perform division by a vanishing polynomial:
```rust
@@ -186,6 +201,7 @@ let h = f.div_by_vanishing(N); // Division by V(x) = X^N - 1
```
### Monomial Operations
Add or subtract monomials in-place for efficient polynomial manipulation:
```rust
f.sub_monomial_inplace(&one, 0 /*monomial*/); // Subtracts 1 from f
```
### Slicing
Extract specific components:
```rust
@@ -203,6 +220,7 @@ let arbitrary_slice = f.slice(offset, stride, size);
```
### Evaluate
Evaluate the polynomial:
```rust
@@ -213,9 +231,15 @@ let f_x = f.eval(&x); // Evaluate f at x
let domain = [one, two, three];
let mut host_evals = vec![ScalarField::zero(); domain.len()];
f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));
// Evaluate on roots-of-unity-domain
let domain_log_size = 4;
let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
```
### Read coefficients
Read or copy polynomial coefficients for further processing:
```rust
@@ -227,6 +251,7 @@ f.copy_coeffs(0, &mut device_mem[..]);
```
### Polynomial Degree
Determine the highest power of the variable with a non-zero coefficient:
```rust
@@ -234,6 +259,7 @@ let deg = f.degree(); // Degree of the polynomial
```
### Memory Management: Views (rust slices)
Rust enforces correct usage of views at compile time, eliminating the need for runtime checks:
```rust


@@ -1,13 +1,6 @@
# Vector Operations API
Our vector operations API, which is part of the `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
## Examples
@@ -59,7 +52,6 @@ let cfg = VecOpsConfig::default();
mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
```
## Vector Operations Configuration
The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
@@ -90,7 +82,7 @@ pub struct VecOpsConfig<'a> {
`VecOpsConfig` can be initialized with default settings tailored for a specific device:
```rust
let cfg = VecOpsConfig::default();
```
@@ -118,7 +110,7 @@ impl<'a> VecOpsConfig<'a> {
## Vector Operations
Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors.
### `VecOps` Trait
@@ -155,7 +147,6 @@ All operations are element-wise operations, and the results placed into the `res
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.
## MatrixTranspose API Documentation
This section describes the functionality of the `TransposeMatrix` function used for matrix transposition.
@@ -186,8 +177,8 @@ where
- **`column_size`**: The number of columns in the input matrix.
- **`output`**: A mutable slice to store the transposed matrix. The slice can be stored on either the host or the device.
- **`ctx`**: A reference to the `DeviceContext`, which provides information about the device where the operation will be performed.
- **`on_device`**: A boolean flag indicating whether the inputs and outputs are on the device.
- **`is_async`**: A boolean flag indicating whether the operation should be performed asynchronously.
### Return Value
@@ -209,9 +200,8 @@ transpose_matrix(&input, 5, 4, &mut output, &ctx, true, false)
.expect("Failed to transpose matrix");
```
The function takes a matrix represented as a 1D slice, transposes it, and stores the result in another 1D slice. The input and output slices can be stored on either the host or the device, and the operation can be performed synchronously or asynchronously.
The function is generic and can work with any type `F` that implements the `FieldImpl` trait. The `<F as FieldImpl>::Config` type must also implement the `VecOps<F>` trait, which provides the `transpose` method used to perform the actual transposition.
The function returns an `IcicleResult<()>`, indicating whether the operation was successful or not.


@@ -11,7 +11,7 @@ Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge
Currently our flagship products are:
- **ICICLE**:
[ICICLE](https://github.com/ingonyama-zk/icicle) is a fully featured GPU accelerated cryptography library for building ZK provers. ICICLE allows you to accelerate your existing ZK protocols in a matter of hours or implement your protocol from scratch on GPU.
---
@@ -39,7 +39,7 @@ Learn more about ICICLE and GPUs [here][ICICLE-OVERVIEW].
## Get in Touch
If you have any questions, ideas, or are thinking of building something in this space, join the discussion on [Discord]. You can explore our code on [github](https://github.com/ingonyama-zk) or read some of [our research papers](https://github.com/ingonyama-zk/papers).
Follow us on [Twitter](https://x.com/Ingo_zk) and [YouTube](https://www.youtube.com/@ingo_ZK) and sign up for our [mailing list](https://wkf.ms/3LKCbdj) to get our latest announcements.


@@ -163,6 +163,7 @@ const config = {
prism: {
theme: lightCodeTheme,
darkTheme: darkCodeTheme,
additionalLanguages: ['rust', 'go'],
},
image: 'img/logo.png',
}),

docs/package-lock.json (generated)

@@ -3680,8 +3680,6 @@
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"dependencies": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -3696,9 +3694,7 @@
"node_modules/ajv-formats/node_modules/json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
},
"node_modules/ajv-keywords": {
"version": "3.5.2",
@@ -16344,13 +16340,14 @@
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz",
"integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==",
"requires": {},
"requires": {
"ajv": "^8.0.0"
},
"dependencies": {
"ajv": {
"version": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"requires": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -16361,9 +16358,7 @@
"json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
}
}
},


@@ -24,6 +24,47 @@ module.exports = {
label: "ICICLE Core",
id: "icicle/core",
},
{
type: "category",
label: "Primitives",
link: {
type: `doc`,
id: 'icicle/primitives/overview',
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM",
id: "icicle/primitives/msm",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/primitives/keccak",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
],
},
{
type: "doc",
label: "Polynomials",
id: "icicle/polynomials/overview",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/multi-gpu",
},
{
type: "category",
label: "Golang bindings",
@@ -64,6 +105,11 @@ module.exports = {
label: "Vector operations",
id: "icicle/golang-bindings/vec-ops",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/golang-bindings/keccak",
},
{
type: "doc",
label: "Multi GPU Support",
@@ -111,6 +157,11 @@ module.exports = {
label: "Vector operations",
id: "icicle/rust-bindings/vec-ops",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/rust-bindings/keccak",
},
{
type: "doc",
label: "Multi GPU Support",
@@ -123,42 +174,6 @@ module.exports = {
},
],
},
{
type: "category",
label: "Primitives",
link: {
type: `doc`,
id: 'icicle/primitives/overview',
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM",
id: "icicle/primitives/msm",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
],
},
{
type: "doc",
label: "Polynomials",
id: "icicle/polynomials/overview",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/multi-gpu",
},
{
type: "doc",
label: "Google Colab Instructions",
@@ -190,6 +205,7 @@ module.exports = {
type: "category",
label: "Additional Resources",
collapsed: false,
collapsible: false,
items: [
{
type: "link",


@@ -1,6 +1,6 @@
# ZKContainer
We recommend using [ZKContainer](https://ingonyama.com/blog/Immanuel-ZKDC), where we have already preinstalled all the required dependencies, to run Icicle examples.
We recommend using [ZKContainer](https://www.ingonyama.com/blog/product-announcement-zk-containers), where we have already preinstalled all the required dependencies, to run Icicle examples.
To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`.


@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # 'native' is supported on CMake 3.24+; on earlier versions it is ignored and no architecture is passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation
The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero-Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU): on a typical NVIDIA GPU, this transfer dominates the total NTT execution time.
## Key-Takeaway
When you have to run several NTTs, consider overlapping data download, upload, and computation to improve data-bus (PCIe) and GPU utilization and reduce total execution time.
Typically, you concurrently
1. Download the output of the previous NTT back to the host
2. Upload the input for the next NTT to the device
3. Run the current NTT
> [!NOTE]
> This approach requires two on-device memory vectors, halving the maximum NTT size that fits in device memory.
## Best-Practices
1. Use three separate CUDA streams for Download, Upload, and Compute operations
2. Use pinned (page-locked) host memory to speed up data-bus transfers; `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save on device memory.
## Running the example
The example uses the BN254 curve by default; to change it, edit `compile.sh` and `CMakeLists.txt`.
```sh
./compile.sh
./run.sh
```
To compare with the ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).


@@ -0,0 +1,16 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example


@@ -0,0 +1,149 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;
const std::string curve = "BN254";
typedef scalar_t S;
typedef scalar_t E;
const unsigned max_log_ntt_size = 27;
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
elements[i] = E::from(i + 1);
}
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
cudaDeviceReset();
cudaDeviceProp deviceProperties;
int deviceId = 0;
cudaGetDeviceProperties(&deviceProperties, deviceId);
std::string gpu_full_name = deviceProperties.name;
std::cout << gpu_full_name << std::endl;
std::string gpu_name = gpu_full_name;
std::cout << "Curve: " << curve << std::endl;
S basic_root = S::omega(max_log_ntt_size);
// change these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 22;
const unsigned nof_ntts = 16;
std::cout << "log NTT size: " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "Batch size: " << nof_ntts << std::endl;
// Create separate CUDA streams for overlapping data transfers and kernel execution.
cudaStream_t stream_compute, stream_h2d, stream_d2h;
cudaStreamCreate(&stream_compute);
cudaStreamCreate(&stream_h2d);
cudaStreamCreate(&stream_d2h);
// Create device context for NTT computation
auto ctx_compute = device_context::DeviceContext{
stream_compute, // stream
0, // device_id
0, // mempool
};
// Initialize NTT domain and configuration
bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
config_compute.batch_size = nof_ntts;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;
std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
std::cout << "Number of blocks: " << nof_blocks << std::endl;
int block_size = ntt_size * nof_ntts / nof_blocks;
// on-host pinned data
E* h_inp[2];
E* h_out[2];
for (int i = 0; i < 2; i++) {
cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
}
// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
E* d_vec[2];
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
}
// initialize input data
initialize_input(ntt_size, nof_ntts, h_inp[0]);
initialize_input(ntt_size, nof_ntts, h_inp[1]);
cudaEvent_t compute_start, compute_stop;
cudaEventCreate(&compute_start);
cudaEventCreate(&compute_stop);
for (int run = 0; run < 10; run++) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
cudaEventRecord(compute_start, stream_compute);
bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
cudaEventRecord(compute_stop, stream_compute);
// delay the host-to-device upload by one block relative to the device-to-host download to preserve write-after-read ordering
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
cudaMemcpyAsync(
&h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
cudaMemcpyDeviceToHost, stream_d2h);
}
if (i > 0) {
cudaMemcpyAsync(
&d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
}
// synchronize upload and download at the end of the block to ensure data integrity
cudaStreamSynchronize(stream_d2h);
cudaStreamSynchronize(stream_h2d);
}
// synchronize compute stream with the end of the computation
cudaEventSynchronize(compute_stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
END_TIMER(inplace, "Concurrent In-Place NTT");
std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
};
// Clean-up
for (int i = 0; i < 2; i++) {
cudaFree(d_vec[i]);
cudaFreeHost(h_inp[i]);
cudaFreeHost(h_out[i]);
}
cudaEventDestroy(compute_start);
cudaEventDestroy(compute_stop);
cudaStreamDestroy(stream_compute);
cudaStreamDestroy(stream_d2h);
cudaStreamDestroy(stream_h2d);
return 0;
}


@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example


@@ -16,7 +16,7 @@ int main(int argc, char* argv[])
int N = batch_size * msm_size;
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
@@ -43,7 +43,7 @@ int main(int argc, char* argv[])
false, // is_async
};
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
cudaStream_t stream = config.ctx.stream;
// Execute the MSM kernel


@@ -9,137 +9,148 @@
using namespace poseidon;
using namespace bn254;
void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
void checkCudaError(cudaError_t error)
{
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
}
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
checkCudaError(err);
void threadPoseidon(
device_context::DeviceContext ctx,
unsigned size_partition,
scalar_t* layers,
scalar_t* column_hashes,
PoseidonConstants<scalar_t>* constants)
{
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN() cannot be used in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err =
bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
checkCudaError(err);
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define CHECK_ALLOC(ptr) \
if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
nvmlShutdown();
return 0;
int main()
{
const unsigned size_row = (1 << 30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated for one partition only and reused across partitions
const uint32_t size_layers = size_col * size_partition;
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
<< "/" << memory.free / 1024 / 1024 << std::endl;
}
const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id = 0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id = 1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel, "2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential, "1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
nvmlShutdown();
return 0;
}


@@ -17,7 +17,7 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
@@ -100,7 +100,7 @@ int main(int argc, char** argv)
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
@@ -151,7 +151,7 @@ int main(int argc, char** argv)
// validate multiplication here...
// clean up and exit
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1);


@@ -60,8 +60,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
@@ -89,16 +89,16 @@ int main(int argc, char* argv[])
bn254_initialize_domain(&basic_root, ctx, true);
// Create an NTTConfig instance
NTTConfig<S> config = default_ntt_config<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");


@@ -11,49 +11,47 @@ using namespace bn254;
typedef point_field_t T;
// modular power
T modPow(T base, T exp) {
T modPow(T base, T exp)
{
T r = T::one();
T b = base;
T e = exp;
while (e != T::zero()) {
// If exp is odd, multiply the base with result
if (T::is_odd(e)) {
r = r * b;
}
// Now exp must be even, divide it by 2
e =T::div2(e);
b = b * b;
// If exp is odd, multiply the base with result
if (T::is_odd(e)) { r = r * b; }
// Now exp must be even, divide it by 2
e = T::div2(e);
b = b * b;
}
return r;
}
// Check if y2 is a quadratic residue using Euler's Criterion
bool quadratic_residue(T y2) {
return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
}
bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); }
// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
bool mySQRT(T a, T *result) {
bool mySQRT(T a, T* result)
{
if (a == T::zero()) {
*result = T::zero();
return true;
}
if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
if (modPow(a, T::div2(T::zero() - T::one())) != T::one()) {
return false; // solution does not exist
}
// TODO: consider special cases
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); //2;
T r = T::zero();
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); // 2;
T r = T::zero();
T m;
while (T::is_even(s)) {
r = r + T::one();
s = T::div2(s); //s /= 2;
s = T::div2(s); // s /= 2;
}
// find a non-square mod p
while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
while (modPow(n, T::div2((T::zero() - T::one()))) != T::zero() - T::one()) {
n = n + T::one();
}
T x = modPow(a, T::div2(s + T::one()));
@@ -61,83 +59,86 @@ bool mySQRT(T a, T *result) {
T g = modPow(n, s);
for (;; r = m) {
T t = b;
for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t;
if (m == T::zero() ) {
for (m = T::zero(); T::lt(m, r) /* m < r*/ && t != T::one(); m = m + T::one())
t = t * t;
if (m == T::zero()) {
*result = x;
return true;
}
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
g = gs * gs ;
x = x * gs ;
b = b * g ;
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()));
g = gs * gs;
x = x * gs;
b = b * g;
}
}
void point_near_x(T x, affine_t *point) {
const T wb = T { weierstrass_b };
void point_near_x(T x, affine_t* point)
{
const T wb = T{weierstrass_b};
T y2;
while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
{
while (y2 = x * x * x + wb, quadratic_residue(y2) == false) {
x = x + T::one();
};
T y;
bool found = mySQRT(y2, &y);
assert(y*y == y2);
assert(y * y == y2);
point->x = x;
point->y = y;
}
static int seed = 0;
static HOST_INLINE T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
T value;
for (unsigned i = 0; i < T::TLC-1 ; i++)
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
T value;
for (unsigned i = 0; i < T::TLC - 1; i++)
// TODO: use the full range of limbs: for (unsigned i = 0; i < T::TLC ; i++)
value.limbs_storage.limbs[i] = distribution(generator);
// while (lt(Field{get_modulus()}, value))
// value = value - Field{get_modulus()};
return value;
}
value.limbs_storage.limbs[i] = distribution(generator);
// while (lt(Field{get_modulus()}, value))
// value = value - Field{get_modulus()};
return value;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
const unsigned N = pow(2, 10);
std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a. blinding factor)" << std::endl;
T* xs = new T[N+1];
T* xs = new T[N + 1];
std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment"
<< std::endl;
seed = 1234;
std::cout << "Using seed: " << seed << std::endl;
std::cout << "Generating random field values" << std::endl;
START_TIMER(gen);
for (unsigned i = 0; i < N; i++) {
xs[i] = rand_host_seed();
}
END_TIMER(gen, "Time to generate field values");
std::cout << "xs[0]: " << xs[0] << std::endl;
std::cout << "xs[1]: " << xs[1] << std::endl;
// affine_t points[N];
affine_t* points = new affine_t[N+1];
affine_t* points = new affine_t[N + 1];
std::cout << "Generating points from random field values" << std::endl;
START_TIMER(points);
for (unsigned i = 0; i < N+1; i++) {
for (unsigned i = 0; i < N + 1; i++) {
point_near_x(xs[i], &points[i]);
}
END_TIMER(points, "Time to generate points");
std::cout << "Generating commitment vector" << std::endl;
projective_t result;
scalar_t* scalars = new scalar_t[N+1];
scalar_t* scalars = new scalar_t[N + 1];
scalar_t::rand_host_many(scalars, N);
std::cout << "Generating salt" << std::endl;
@@ -146,7 +147,7 @@ int main(int argc, char** argv)
std::cout << "Executing MSM" << std::endl;
auto config = msm::default_msm_config();
START_TIMER(msm);
bn254_msm_cuda(scalars, points, N + 1, config, &result);
END_TIMER(msm, "Time to execute MSM");
std::cout << "Computed commitment: " << result << std::endl;


@@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # "native" is supported on CMake 3.24+; earlier versions ignore it, hence the fallback above
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -DCURVE_ID=BN254")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(example PRIVATE "../../../icicle/include")
# can link to another curve/field by changing the following lib and FIELD_ID
target_link_libraries(example
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a
)
target_compile_definitions(example PUBLIC FIELD_ID BN254)


@@ -0,0 +1,49 @@
# ICICLE examples: computations with polynomials
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
Polynomials are crucial for Zero-Knowledge Proofs (ZKPs): they enable efficient representation and verification of computational statements, facilitate privacy-preserving protocols, and support complex mathematical operations essential for constructing and verifying proofs without revealing underlying data. The Polynomial API is documented [here](https://dev.ingonyama.com/icicle/polynomials/overview).
## Running the example
To run the example, from the project root directory:
```sh
cd examples/c++/polynomial-api
./compile.sh
./run.sh
```
To change the scalar field, modify `compile.sh` to build the corresponding lib and `CMakeLists.txt` to link to that lib and set `FIELD_ID` correspondingly.
## What's in the examples
- `example_evaluate`: Build a polynomial from coefficients and evaluate it at a random point.
- `example_clone`: Make a separate copy of a polynomial.
- `example_from_rou`: Reconstruct polynomial from values at the roots of unity. This operation is a cornerstone in the efficient implementation of zero-knowledge proofs, particularly in the areas of proof construction, verification, and polynomial arithmetic. By leveraging the algebraic structure and computational properties of roots of unity, ZKP protocols can achieve the scalability, efficiency, and privacy necessary for practical applications in blockchain, secure computation, and beyond.
- `example_addition`, `example_addition_inplace`: Different flavors of polynomial addition.
- `example_multiplication`: A product of two polynomials.
- `example_multiplicationScalar`: A product of scalar and a polynomial.
- `example_monomials`: Add/subtract a monomial to/from a polynomial. A monomial is a single term: the product of a constant coefficient and a variable raised to a non-negative integer power.
- `example_ReadCoeffsToHost`: Copy the coefficients of a polynomial to the host. `ICICLE` keeps all polynomials on the GPU, so host-side processing requires such a copy.
- `example_divisionSmall`, `example_divisionLarge`: Different flavors of division.
- `example_divideByVanishingPolynomial`: A vanishing polynomial over a set S is a polynomial that evaluates to zero for every element in S. For a simple case, consider the set S={a}, a single element. The polynomial f(x)=x-a vanishes over S because f(a)=0. Mathematically, dividing a polynomial P(x) by a vanishing polynomial V(x) typically involves finding another polynomial Q(x) and possibly a remainder R(x) such that P(x)=Q(x)V(x)+R(x), where R(x) has a lower degree than V(x). In many cryptographic applications, the focus is on ensuring that P(x) is exactly divisible by V(x), meaning R(x)=0.
- `example_EvenOdd`: even (odd) methods keep even (odd) coefficients of the original polynomial. For $f(x) = 1+2x+3x^2+4x^3$, even polynomial is $1+3x$, odd polynomial is $2+4x$.
- `example_Slice`: extends the even/odd methods and keeps coefficients for a given offset and stride. For $f(x) = 1+2x+3x^2+4x^3$, an offset-0, stride-3 slice gives $1+4x$.
- `example_DeviceMemoryView`: device-memory views of polynomials allow passing polynomials to other GPU functions. In this example the coefficients of a polynomial are committed to a Merkle tree, bypassing the host.


@@ -0,0 +1,15 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example


@@ -0,0 +1,450 @@
#include <iostream>
#include <cassert>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
#include "poseidon/tree/merkle.cuh"
#include "api/bn254.h"
#include <chrono>
// using namespace field_config;
using namespace polynomials;
using namespace merkle;
using namespace bn254;
// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
// we'll use the following constants in the examples
const auto zero = scalar_t::zero();
const auto one = scalar_t::one();
const auto two = scalar_t::from(2);
const auto three = scalar_t::from(3);
const auto four = scalar_t::from(4);
const auto five = scalar_t::from(5);
const auto minus_one = zero - one;
static std::unique_ptr<scalar_t[]> generate_pows(scalar_t tau, uint32_t size){
auto vec = std::make_unique<scalar_t[]>(size);
vec[0] = scalar_t::one();
for (size_t i = 1; i < size; ++i) {
vec[i] = vec[i-1] * tau;
}
return vec;
}
static std::unique_ptr<affine_t[]> generate_SRS(uint32_t size) {
auto secret_scalar = scalar_t::rand_host();
auto gen = projective_t::generator();
auto pows_of_tau = generate_pows(secret_scalar,size);
auto SRS = std::make_unique<affine_t[]>(size);
for (size_t i = 0; i < size; ++i) {
SRS[i] = projective_t::to_affine(pows_of_tau[i] * gen);
}
return SRS;
}
void example_evaluate()
{
std::cout << std::endl << "Example: Polynomial evaluation on random value" << std::endl;
const scalar_t coeffs[3] = {one, two, three};
auto f = Polynomial_t::from_coefficients(coeffs, 3);
std::cout << "f = " << f << std::endl;
scalar_t x = scalar_t::rand_host();
std::cout << "x = " << x << std::endl;
auto fx = f(x);
std::cout << "f(x) = " << fx << std::endl;
}
void example_from_rou(const int size)
{
std::cout << std::endl << "Example: Reconstruct polynomial from values at roots of unity" << std::endl;
const int log_size = (int)ceil(log2(size));
const int nof_evals = 1 << log_size;
auto coeff = std::make_unique<scalar_t[]>(size);
for (int i = 0; i < size; i++)
coeff[i] = scalar_t::rand_host();
auto f = Polynomial_t::from_coefficients(coeff.get(), size);
// rou: root of unity
auto omega = scalar_t::omega(log_size);
scalar_t evals[nof_evals] = {scalar_t::zero()};
auto x = scalar_t::one();
for (int i = 0; i < nof_evals; ++i) {
evals[i] = f(x);
x = x * omega;
}
// reconstruct f from evaluations
auto fr = Polynomial_t::from_rou_evaluations(evals, nof_evals);
// check for equality f-fr==0
auto h = f - fr;
std::cout << "degree of f - fr = " << h.degree() << std::endl;
}
static Polynomial_t randomize_polynomial(uint32_t size)
{
auto coeff = std::make_unique<scalar_t[]>(size);
for (int i = 0; i < size; i++)
coeff[i] = scalar_t::rand_host();
return Polynomial_t::from_coefficients(coeff.get(), size);
}
static Polynomial_t incremental_values(uint32_t size)
{
auto coeff = std::make_unique<scalar_t[]>(size);
for (int i = 0; i < size; i++) {
coeff[i] = i ? coeff[i - 1] + scalar_t::one() : scalar_t::one();
}
return Polynomial_t::from_coefficients(coeff.get(), size);
}
static bool is_equal(Polynomial_t& lhs, Polynomial_t& rhs)
{
const int deg_lhs = lhs.degree();
const int deg_rhs = rhs.degree();
if (deg_lhs != deg_rhs) { return false; }
auto lhs_coeffs = std::make_unique<scalar_t[]>(deg_lhs);
auto rhs_coeffs = std::make_unique<scalar_t[]>(deg_rhs);
lhs.copy_coeffs(lhs_coeffs.get(), 1, deg_lhs - 1);
rhs.copy_coeffs(rhs_coeffs.get(), 1, deg_rhs - 1);
return memcmp(lhs_coeffs.get(), rhs_coeffs.get(), deg_lhs * sizeof(scalar_t)) == 0;
}
void example_addition(const int size0, const int size1)
{
std::cout << std::endl << "Example: Polynomial addition" << std::endl;
auto f = randomize_polynomial(size0);
auto g = randomize_polynomial(size1);
auto x = scalar_t::rand_host();
auto f_x = f(x);
auto g_x = g(x);
auto fx_plus_gx = f_x + g_x;
auto h = f + g;
auto h_x = h(x);
std::cout << "evaluate and add: " << fx_plus_gx << std::endl;
std::cout << "add and evaluate: " << h_x << std::endl;
}
void example_addition_inplace(const int size0, const int size1)
{
std::cout << std::endl << "Example: Polynomial inplace addition" << std::endl;
auto f = randomize_polynomial(size0);
auto g = randomize_polynomial(size1);
auto x = scalar_t::rand_host();
auto f_x = f(x);
auto g_x = g(x);
auto fx_plus_gx = f_x + g_x;
f += g;
auto s_x = f(x);
std::cout << "evaluate and add: " << fx_plus_gx << std::endl;
std::cout << "add and evaluate: " << s_x << std::endl;
}
void example_multiplication(const int log0, const int log1)
{
std::cout << std::endl << "Example: Polynomial multiplication" << std::endl;
const int size0 = 1 << log0, size1 = 1 << log1;
auto f = randomize_polynomial(size0);
auto g = randomize_polynomial(size1);
scalar_t x = scalar_t::rand_host();
auto fx = f(x);
auto gx = g(x);
auto fx_mul_gx = fx * gx;
auto m = f * g;
auto mx = m(x);
std::cout << "evaluate and multiply: " << fx_mul_gx << std::endl;
std::cout << "multiply and evaluate: " << mx << std::endl;
}
void example_multiplication_scalar(const int log0)
{
std::cout << std::endl << "Example: Scalar by Polynomial multiplication" << std::endl;
const int size = 1 << log0;
auto f = randomize_polynomial(size);
auto s = scalar_t::from(2);
auto g = s * f;
auto x = scalar_t::rand_host();
auto fx = f(x);
auto fx2 = s * fx;
auto gx = g(x);
std::cout << "Compare (2*f)(x) and 2*f(x): " << std::endl;
std::cout << gx << std::endl;
std::cout << fx2 << std::endl;
}
void example_monomials()
{
std::cout << std::endl << "Example: Monomials" << std::endl;
const scalar_t coeffs[3] = {one, zero, two}; // 1+2x^2
auto f = Polynomial_t::from_coefficients(coeffs, 3);
const auto x = three;
auto fx = f(x);
f.add_monomial_inplace(three, 1); // add 3x
const auto expected_addmonmon_f_x = fx + three * x;
const auto addmonom_f_x = f(x);
std::cout << "Computed f'(x) = " << addmonom_f_x << std::endl;
std::cout << "Expected f'(x) = " << expected_addmonmon_f_x << std::endl;
}
void example_read_coeffs_to_host()
{
std::cout << std::endl << "Example: Read coefficients to host" << std::endl;
const scalar_t coeffs_f[3] = {zero, one, two}; // 0+1x+2x^2
auto f = Polynomial_t::from_coefficients(coeffs_f, 3);
const scalar_t coeffs_g[3] = {one, one, one}; // 1+x+x^2
auto g = Polynomial_t::from_coefficients(coeffs_g, 3);
auto h = f + g; // 1+2x+3x^3
std::cout << "Get one coefficient of h() at a time: " << std::endl;
const auto h0 = h.get_coeff(0);
const auto h1 = h.get_coeff(1);
const auto h2 = h.get_coeff(2);
std::cout << "Coefficients of h: " << std::endl;
std::cout << "0:" << h0 << " expected: " << one << std::endl;
std::cout << "1:" << h1 << " expected: " << two << std::endl;
std::cout << "2:" << h2 << " expected: " << three << std::endl;
std::cout << "Get all coefficients of h() at a time: " << std::endl;
scalar_t h_coeffs[3] = {0};
// fetch the coefficients for a given range
auto nof_coeffs = h.copy_coeffs(h_coeffs, 0, 2);
const scalar_t expected_h_coeffs[3] = {one, two, three};
for (int i = 0; i < nof_coeffs; ++i) {
std::cout << i << ":" << h_coeffs[i] << " expected: " << expected_h_coeffs[i] << std::endl;
}
}
void example_division_small()
{
std::cout << std::endl << "Example: Polynomial division (small)" << std::endl;
const scalar_t coeffs_a[4] = {five, zero, four, three}; // 3x^3+4x^2+5
const scalar_t coeffs_b[3] = {minus_one, zero, one}; // x^2-1
auto a = Polynomial_t::from_coefficients(coeffs_a, 4);
auto b = Polynomial_t::from_coefficients(coeffs_b, 3);
auto [q, r] = a.divide(b);
scalar_t q_coeffs[2] = {0}; // 3x+4
scalar_t r_coeffs[2] = {0}; // 3x+9
const auto q_nof_coeffs = q.copy_coeffs(q_coeffs, 0, 1);
const auto r_nof_coeffs = r.copy_coeffs(r_coeffs, 0, 1);
std::cout << "Quotient: 0:" << q_coeffs[0] << " expected: " << scalar_t::from(4) << std::endl;
std::cout << "Quotient: 1:" << q_coeffs[1] << " expected: " << scalar_t::from(3) << std::endl;
std::cout << "Remainder: 0:" << r_coeffs[0] << " expected: " << scalar_t::from(9) << std::endl;
std::cout << "Remainder: 1:" << r_coeffs[1] << " expected: " << scalar_t::from(3) << std::endl;
}
void example_division_large(const int log0, const int log1)
{
std::cout << std::endl << "Example: Polynomial division (large)" << std::endl;
const int size0 = 1 << log0, size1 = 1 << log1;
auto a = randomize_polynomial(size0);
auto b = randomize_polynomial(size1);
auto [q, r] = a.divide(b);
scalar_t x = scalar_t::rand_host();
auto ax = a(x);
auto bx = b(x);
auto qx = q(x);
auto rx = r(x);
// check if a(x) == b(x)*q(x)+r(x)
std::cout << "a(x) == b(x)*q(x)+r(x)" << std::endl;
std::cout << "lhs = " << ax << std::endl;
std::cout << "rhs = " << bx * qx + rx << std::endl;
}
void example_divide_by_vanishing_polynomial()
{
std::cout << std::endl << "Example: Polynomial division by vanishing polynomial" << std::endl;
const scalar_t coeffs_v[5] = {minus_one, zero, zero, zero, one}; // x^4-1 vanishes on 4th roots of unity
auto v = Polynomial_t::from_coefficients(coeffs_v, 5);
auto h = incremental_values(1 << 11);
auto hv = h * v;
auto [h_div, R] = hv.divide(v);
std::cout << "h_div == h: " << is_equal(h_div, h) << std::endl;
auto h_div_by_vanishing = hv.divide_by_vanishing_polynomial(4);
std::cout << "h_div_by_vanishing == h: " << is_equal(h_div_by_vanishing, h) << std::endl;
}
void example_clone(const int log0)
{
std::cout << std::endl << "Example: clone polynomial" << std::endl;
const int size = 1 << log0;
auto f = randomize_polynomial(size);
const auto x = scalar_t::rand_host();
const auto fx = f(x);
Polynomial_t g;
g = f.clone();
g += f;
auto h = g.clone();
std::cout << "g(x) = " << g(x) << " expected: " << two * fx << std::endl;
std::cout << "h(x) = " << h(x) << " expected: " << g(x) << std::endl;
}
void example_even_odd()
{
std::cout << std::endl << "Example: Split into even and odd powers " << std::endl;
const scalar_t coeffs[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
auto f = Polynomial_t::from_coefficients(coeffs, 4);
auto f_even = f.even();
auto f_odd = f.odd();
scalar_t even_coeffs[2] = {0};
scalar_t odd_coeffs[2] = {0};
const auto even_nof_coeffs = f_even.copy_coeffs(even_coeffs, 0, 1);
const auto odd_nof_coeffs = f_odd.copy_coeffs(odd_coeffs, 0, 1);
std::cout << "Even: 0:" << even_coeffs[0] << " expected: " << one << std::endl;
std::cout << "Even: 1:" << even_coeffs[1] << " expected: " << three << std::endl;
std::cout << "Odd: 0:" << odd_coeffs[0] << " expected: " << two << std::endl;
std::cout << "Odd: 1:" << odd_coeffs[1] << " expected: " << four << std::endl;
}
void example_slice()
{
std::cout << std::endl << "Example: Slice polynomial " << std::endl;
const scalar_t coeffs[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
auto f = Polynomial_t::from_coefficients(coeffs, 4);
auto f_slice = f.slice(0 /*=offset*/, 3 /*=stride*/, 2 /*=size*/); // 1+4x
scalar_t slice_coeffs[2] = {0};
const auto slice_nof_coeffs = f_slice.copy_coeffs(slice_coeffs, 0, 1);
std::cout << "Slice: 0:" << slice_coeffs[0] << " expected: " << one << std::endl;
std::cout << "Slice: 1:" << slice_coeffs[1] << " expected: " << four << std::endl;
}
void example_device_memory_view()
{
const int log_size = 6;
const int size = 1 << log_size;
auto f = randomize_polynomial(size);
auto [d_coeffs, N, device_id] = f.get_coefficients_view();
// compute coset evaluations
auto coset_evals = std::make_unique<scalar_t[]>(size);
auto ntt_config = ntt::default_ntt_config<scalar_t>();
ntt_config.are_inputs_on_device = true; // using the device data directly as a view
ntt_config.coset_gen = ntt::get_root_of_unity<scalar_t>(size * 2);
ntt::ntt(d_coeffs.get(), size, ntt::NTTDir::kForward, ntt_config, coset_evals.get());
}
void example_commit_with_device_memory_view()
{
//declare time vars
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
std::chrono::milliseconds duration;
std::cout << std::endl << "Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1" << std::endl;
std::cout << "Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2]_1 = [4 f1*f2]_1" << std::endl;
int N = 1025;
// generate a sequence of 2N group elements: (g, tau*g, tau^2*g, ..., tau^{2N-1}*g)
std::cout << "Setup: Generating mock SRS" << std::endl;
start = std::chrono::high_resolution_clock::now();
auto SRS = generate_SRS(2*N);
//Allocate memory on device (points)
affine_t* points_d;
cudaMalloc(&points_d, sizeof(affine_t)* 2 * N);
// copy SRS to device (could have generated on device, but gives an indicator)
cudaMemcpy(points_d, SRS.get(), sizeof(affine_t)* 2 * N, cudaMemcpyHostToDevice);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: SRS of length " << 2 * N << " generated and loaded to device. Took: " << duration.count() << " milliseconds" << std::endl;
//goal:
//test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1
//test commitment equality [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1
//note: using polyapi to gen scalars: already on device.
std::cout << "Setup: Generating polys (on device) f1,f2 of log degree " << log2(N-1) << std::endl;
start = std::chrono::high_resolution_clock::now();
auto f1 = randomize_polynomial(N);
auto f2 = randomize_polynomial(N);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: Gen poly done. Took: " << duration.count() << " milliseconds" << std::endl;
//deg 2N constraints (f1+f2)^2 + (f1-f2)^2 = 2 (f1^2+ f_2^2)
std::cout << "Computing constraints..start "<< std::endl;
start = std::chrono::high_resolution_clock::now();
auto L1 = (f1+f2)*(f1+f2) + (f1-f2)*(f1-f2);
auto R1 = scalar_t::from(2) * (f1*f1 + f2*f2);
//deg 2N constraints (f1+f2)^2 - (f1-f2)^2 = 4 f1 *f_2
auto L2 = (f1+f2)*(f1+f2) - (f1-f2)*(f1-f2);
auto R2 = scalar_t::from(4) * f1 * f2;
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Computing constraints..done. Took: " << duration.count() << " milliseconds"<< std::endl;
// extract coeff using coeff view
auto [viewL1, sizeL1, device_idL1] = L1.get_coefficients_view();
auto [viewL2, sizeL2, device_idL2] = L2.get_coefficients_view();
auto [viewR1, sizeR1, device_idR1] = R1.get_coefficients_view();
auto [viewR2, sizeR2, device_idR2] = R2.get_coefficients_view();
std::cout << "Computing Commitments with poly view"<< std::endl;
start = std::chrono::high_resolution_clock::now();
msm::MSMConfig config = msm::default_msm_config();
config.are_points_on_device = true;
config.are_scalars_on_device = true;
//host vars (for result)
projective_t hL1{}, hL2{}, hR1{}, hR2{};
//straightforward msm bn254 api: no batching
bn254_msm_cuda(viewL1.get(),points_d,N,config,&hL1);
bn254_msm_cuda(viewL2.get(),points_d,N,config,&hL2);
bn254_msm_cuda(viewR1.get(),points_d,N,config,&hR1);
bn254_msm_cuda(viewR2.get(),points_d,N,config,&hR2);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Commitments done. Took: " << duration.count() << " milliseconds"<< std::endl;
//sanity checks
auto affL1 = projective_t::to_affine(hL1);
auto affR1 = projective_t::to_affine(hR1);
auto affL2 = projective_t::to_affine(hL2);
auto affR2 = projective_t::to_affine(hR2);
//test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f_1^2+f_2^2)]_1
assert(affL1.x==affR1.x && affL1.y==affR1.y);
std::cout << "commitment [(f1+f2)^2 + (f1-f2)^2]_1:" << std::endl;
std::cout << "[x: " << affL1.x << ", y: " << affL1.y << "]" << std::endl;
std::cout << "commitment [2 (f_1^2+f_2^2)]_1:" << std::endl;
std::cout << "[x: " << affR1.x << ", y: " << affR1.y << "]" << std::endl;
assert(affL2.x==affR2.x && affL2.y==affR2.y);
std::cout << "commitment [(f1+f2)^2 - (f1-f2)^2]_1:"<< std::endl;
std::cout << "[x: " << affL2.x << ", y: " << affL2.y << "]" << std::endl;
std::cout << "commitment [4 f_1*f_2]_1:"<<std::endl;
std::cout << "[x: " << affR2.x << ", y: " << affR2.y << "]" << std::endl;
}
int main(int argc, char** argv)
{
// Initialize NTT. TODO: can we hide this in the library?
static const int MAX_NTT_LOG_SIZE = 24;
auto ntt_config = ntt::default_ntt_config<scalar_t>();
const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx);
// Virtual factory design pattern: initializing polynomials factory for CUDA backend
Polynomial_t::initialize(std::make_unique<CUDAPolynomialFactory<>>());
example_evaluate();
example_clone(10);
example_from_rou(100);
example_addition(12, 17);
example_addition_inplace(2, 2);
example_multiplication(15, 12);
example_multiplication_scalar(15);
example_monomials();
example_read_coeffs_to_host();
example_division_small();
example_division_large(12, 2);
example_divide_by_vanishing_polynomial();
example_even_odd();
example_slice();
example_device_memory_view();
example_commit_with_device_memory_view();
return 0;
}


@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example


@@ -82,10 +82,10 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig config{
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false // is_async
};
CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));


@@ -14,12 +14,13 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level
// We assume the tree has leaves already set, compute all other levels
void build_tree(
const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
bn254_poseidon_hash_cuda(
&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
}
}
@@ -37,11 +38,7 @@ uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_he
}
void generate_proof(
uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash)
{
uint32_t level_index = position;
for (uint32_t level = tree_height - 1; level > 0; level--) {
@@ -68,7 +65,7 @@ uint32_t validate_proof(
const uint32_t tree_height,
const uint32_t* proof_lr,
const scalar_t* proof_hash,
PoseidonConstants<scalar_t>* constants,
PoseidonConfig config)
{
scalar_t hashes_in[2], hash_out[1], level_hash;
@@ -114,13 +111,13 @@ int main(int argc, char* argv[])
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config(data_arity + 1);
bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
build_tree(tree_height, tree, &tree_constants, tree_config);
std::cout << "4. Generate membership proof" << std::endl;
@@ -142,7 +139,7 @@ int main(int argc, char* argv[])
std::cout << "6. Tamper the hash" << std::endl;
const scalar_t tampered_hash = hash + scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
std::cout << "7. Invalidate tampered hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;


@@ -0,0 +1,34 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Golang
`ICICLE` provides Golang bindings to a CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Usage
```go
err := Msm(
/* Scalars input vector */ scalars,
/* Points input vector */ points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ results)
```
In this example we use the `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` uses affine coordinates, and `result` uses projective coordinates.
## What's in the example
1. Define the size of the MSM.
2. Generate random inputs on-device.
3. Configure the MSM.
4. Execute the MSM on-device.
5. Move the result to the host.
Running the example:
```sh
go run main.go
```
> [!NOTE]
> The default sizes are 2^17 - 2^22. You can change this by passing the `-l <size> -u <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> go run main.go -l=21 -u=24
> ```

examples/golang/msm/main.go

@@ -0,0 +1,209 @@
package main
import (
"flag"
"fmt"
"time"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
bls12377G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/g2"
bls12377Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/msm"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
bn254Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)
func main() {
var logSizeMin int
var logSizeMax int
flag.IntVar(&logSizeMin, "l", 17, "Minimum log size")
flag.IntVar(&logSizeMax, "u", 22, "Maximum log size")
flag.Parse()
sizeMax := 1 << logSizeMax
print("Generating BN254 scalars ... ")
startTime := time.Now()
scalarsBn254Max := bn254.GenerateScalars(sizeMax)
println(time.Since(startTime).String())
print("Generating BN254 points ... ")
startTime = time.Now()
pointsBn254Max := bn254.GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BN254 G2 points ... ")
startTime = time.Now()
pointsBn254G2Max := bn254G2.G2GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 scalars ... ")
startTime = time.Now()
scalarsBls12377Max := bls12377.GenerateScalars(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 points ... ")
startTime = time.Now()
pointsBls12377Max := bls12377.GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 G2 points ... ")
startTime = time.Now()
pointsBls12377G2Max := bls12377G2.G2GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
for logSize := logSizeMin; logSize <= logSizeMax; logSize++ {
// Define the size of the problem for this iteration.
size := 1 << logSize
fmt.Printf("---------------------- MSM size 2^%d=%d ------------------------\n", logSize, size)
// println(scalarsBls12377, pointsBls12377, pointsBn254G2)
// println(scalarsBn254, pointsBn254, pointsBls12377G2)
print("Configuring bn254 MSM ... ")
startTime = time.Now()
scalarsBn254 := scalarsBn254Max[:size]
pointsBn254 := pointsBn254Max[:size]
pointsBn254G2 := pointsBn254G2Max[:size]
cfgBn254 := core.GetDefaultMSMConfig()
cfgBn254G2 := core.GetDefaultMSMConfig()
cfgBn254.IsAsync = true
cfgBn254G2.IsAsync = true
streamBn254, _ := cr.CreateStream()
streamBn254G2, _ := cr.CreateStream()
cfgBn254.Ctx.Stream = &streamBn254
cfgBn254G2.Ctx.Stream = &streamBn254G2
var projectiveBn254 bn254.Projective
var projectiveBn254G2 bn254G2.G2Projective
var msmResultBn254 core.DeviceSlice
var msmResultBn254G2 core.DeviceSlice
_, e := msmResultBn254.MallocAsync(projectiveBn254.Size(), projectiveBn254.Size(), streamBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc failed: ", e)
panic(errorString)
}
_, e = msmResultBn254G2.MallocAsync(projectiveBn254G2.Size(), projectiveBn254G2.Size(), streamBn254G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc G2 failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Configuring Bls12377 MSM ... ")
startTime = time.Now()
scalarsBls12377 := scalarsBls12377Max[:size]
pointsBls12377 := pointsBls12377Max[:size]
pointsBls12377G2 := pointsBls12377G2Max[:size]
cfgBls12377 := core.GetDefaultMSMConfig()
cfgBls12377G2 := core.GetDefaultMSMConfig()
cfgBls12377.IsAsync = true
cfgBls12377G2.IsAsync = true
streamBls12377, _ := cr.CreateStream()
streamBls12377G2, _ := cr.CreateStream()
cfgBls12377.Ctx.Stream = &streamBls12377
cfgBls12377G2.Ctx.Stream = &streamBls12377G2
var projectiveBls12377 bls12377.Projective
var projectiveBls12377G2 bls12377G2.G2Projective
var msmResultBls12377 core.DeviceSlice
var msmResultBls12377G2 core.DeviceSlice
_, e = msmResultBls12377.MallocAsync(projectiveBls12377.Size(), projectiveBls12377.Size(), streamBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc failed: ", e)
panic(errorString)
}
_, e = msmResultBls12377G2.MallocAsync(projectiveBls12377G2.Size(), projectiveBls12377G2.Size(), streamBls12377G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc G2 failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Executing bn254 MSM on device ... ")
startTime = time.Now()
e = bn254Msm.Msm(scalarsBn254, pointsBn254, &cfgBn254, msmResultBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Msm failed: ", e)
panic(errorString)
}
e = bn254G2.G2Msm(scalarsBn254, pointsBn254G2, &cfgBn254G2, msmResultBn254G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Msm G2 failed: ", e)
panic(errorString)
}
msmResultBn254Host := make(core.HostSlice[bn254.Projective], 1)
msmResultBn254G2Host := make(core.HostSlice[bn254G2.G2Projective], 1)
msmResultBn254Host.CopyFromDeviceAsync(&msmResultBn254, streamBn254)
msmResultBn254G2Host.CopyFromDeviceAsync(&msmResultBn254G2, streamBn254G2)
msmResultBn254.FreeAsync(streamBn254)
msmResultBn254G2.FreeAsync(streamBn254G2)
cr.SynchronizeStream(&streamBn254)
cr.SynchronizeStream(&streamBn254G2)
println(time.Since(startTime).String())
print("Executing Bls12377 MSM on device ... ")
startTime = time.Now()
e = bls12377Msm.Msm(scalarsBls12377, pointsBls12377, &cfgBls12377, msmResultBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Msm failed: ", e)
panic(errorString)
}
e = bls12377G2.G2Msm(scalarsBls12377, pointsBls12377G2, &cfgBls12377G2, msmResultBls12377G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Msm G2 failed: ", e)
panic(errorString)
}
msmResultBls12377Host := make(core.HostSlice[bls12377.Projective], 1)
msmResultBls12377G2Host := make(core.HostSlice[bls12377G2.G2Projective], 1)
msmResultBls12377Host.CopyFromDeviceAsync(&msmResultBls12377, streamBls12377)
msmResultBls12377G2Host.CopyFromDeviceAsync(&msmResultBls12377G2, streamBls12377G2)
msmResultBls12377.FreeAsync(streamBls12377)
msmResultBls12377G2.FreeAsync(streamBls12377G2)
cr.SynchronizeStream(&streamBls12377)
cr.SynchronizeStream(&streamBls12377G2)
println(time.Since(startTime).String())
}
}
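For reference, MSM computes the single point Σᵢ sᵢ·Pᵢ, which is why the result buffers above hold exactly one projective point. A hedged sketch of the naive accumulation over the additive toy group Z_p (real MSM runs over elliptic-curve points and uses bucketing for speed, but the reduction being computed is the same):

```go
package main

import "fmt"

// msmNaive computes sum_i scalars[i] * points[i] in the additive group Z_p.
// With curve points in place of residues, this is exactly the value MSM returns.
func msmNaive(scalars, points []uint64, p uint64) uint64 {
	var acc uint64
	for i := range scalars {
		acc = (acc + scalars[i] % p * (points[i] % p)) % p
	}
	return acc
}

func main() {
	fmt.Println(msmNaive([]uint64{2, 3}, []uint64{5, 7}, 97)) // 2*5 + 3*7 = 31
}
```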


@@ -0,0 +1,39 @@
# ICICLE example: Number Theoretic Transform (NTT) in Golang
## Key-Takeaway
`ICICLE` provides Golang bindings to a CUDA-accelerated C++ implementation of the [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Usage
```go
err := Ntt(
/* input slice */ scalars,
/* NTT Direction */ core.KForward,
/* NTT Configuration */ &cfg,
/* output slice */ result)
```
In this example we use the `BN254` and `BLS12377` fields.
## What's in this example
1. Define the NTT size.
2. Generate random inputs.
3. Set up the domain.
4. Configure the NTT.
5. Execute the NTT on-device.
6. Move the result to the host.
Running the example:
```sh
go run main.go
```
> [!NOTE]
> The default size is 2^20. You can change this by passing the `-s <size>` option. To change the size to 2^23, run the example like this:
```sh
go run main.go -s=23
```
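The transform itself is easy to state, even though ICICLE's value is the CUDA acceleration. Below is a minimal, CPU-only Go sketch of a naive O(n²) NTT over the toy prime 257 — not the ICICLE API, just the math the bindings accelerate:

```go
package main

import "fmt"

const p = 257 // toy NTT-friendly prime: 2^8 + 1

// powMod computes b^e mod m by square-and-multiply.
func powMod(b, e, m uint64) uint64 {
	r := uint64(1)
	b %= m
	for ; e > 0; e >>= 1 {
		if e&1 == 1 {
			r = r * b % m
		}
		b = b * b % m
	}
	return r
}

// nttNaive is the O(n^2) forward NTT: out[k] = sum_j in[j] * w^(j*k) mod p,
// where w is a primitive n-th root of unity in F_p.
func nttNaive(in []uint64, w uint64) []uint64 {
	n := uint64(len(in))
	out := make([]uint64, n)
	for k := uint64(0); k < n; k++ {
		for j := uint64(0); j < n; j++ {
			out[k] = (out[k] + in[j]*powMod(w, j*k, p)) % p
		}
	}
	return out
}

func main() {
	w := powMod(3, (p-1)/8, p) // primitive 8th root of unity (3 generates F_257^*)
	delta := []uint64{1, 0, 0, 0, 0, 0, 0, 0}
	fmt.Println(nttNaive(delta, w)) // [1 1 1 1 1 1 1 1]: transform of the unit impulse
}
```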

examples/golang/ntt/main.go

@@ -0,0 +1,131 @@
package main
import (
"flag"
"fmt"
"time"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
bls12377Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/ntt"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
bls12377Fft "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft"
bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)
func main() {
var logSize int
flag.IntVar(&logSize, "s", 20, "Log size")
flag.Parse()
size := 1 << logSize
fmt.Printf("---------------------- NTT size 2^%d=%d ------------------------\n", logSize, size)
print("Generating BN254 scalars ... ")
startTime := time.Now()
scalarsBn254 := bn254.GenerateScalars(size)
println(time.Since(startTime).String())
cfgBn254 := bn254Ntt.GetDefaultNttConfig()
cfgBn254.IsAsync = true
print("Generating BLS12_377 scalars ... ")
startTime = time.Now()
scalarsBls12377 := bls12377.GenerateScalars(size)
println(time.Since(startTime).String())
cfgBls12377 := bls12377Ntt.GetDefaultNttConfig()
cfgBls12377.IsAsync = true
rouMontBn254, _ := bn254Fft.Generator(uint64(size))
rouBn254 := rouMontBn254.Bits()
rouIcicleBn254 := bn254.ScalarField{}
limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
rouIcicleBn254.FromLimbs(limbsBn254)
bn254Ntt.InitDomain(rouIcicleBn254, cfgBn254.Ctx, false)
rouMontBls12377, _ := bls12377Fft.Generator(uint64(size))
rouBls12377 := rouMontBls12377.Bits()
rouIcicleBls12377 := bls12377.ScalarField{}
limbsBls12377 := core.ConvertUint64ArrToUint32Arr(rouBls12377[:])
rouIcicleBls12377.FromLimbs(limbsBls12377)
bls12377Ntt.InitDomain(rouIcicleBls12377, cfgBls12377.Ctx, false)
print("Configuring bn254 NTT ... ")
startTime = time.Now()
streamBn254, _ := cr.CreateStream()
cfgBn254.Ctx.Stream = &streamBn254
var nttResultBn254 core.DeviceSlice
_, e := nttResultBn254.MallocAsync(size*scalarsBn254.SizeOfElement(), scalarsBn254.SizeOfElement(), streamBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Configuring Bls12377 NTT ... ")
startTime = time.Now()
streamBls12377, _ := cr.CreateStream()
cfgBls12377.Ctx.Stream = &streamBls12377
var nttResultBls12377 core.DeviceSlice
_, e = nttResultBls12377.MallocAsync(size*scalarsBls12377.SizeOfElement(), scalarsBls12377.SizeOfElement(), streamBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Executing bn254 NTT on device ... ")
startTime = time.Now()
err := bn254Ntt.Ntt(scalarsBn254, core.KForward, &cfgBn254, nttResultBn254)
if err.CudaErrorCode != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Ntt failed: ", err)
panic(errorString)
}
nttResultBn254Host := make(core.HostSlice[bn254.ScalarField], size)
nttResultBn254Host.CopyFromDeviceAsync(&nttResultBn254, streamBn254)
nttResultBn254.FreeAsync(streamBn254)
cr.SynchronizeStream(&streamBn254)
println(time.Since(startTime).String())
print("Executing Bls12377 NTT on device ... ")
startTime = time.Now()
err = bls12377Ntt.Ntt(scalarsBls12377, core.KForward, &cfgBls12377, nttResultBls12377)
if err.CudaErrorCode != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Ntt failed: ", err)
panic(errorString)
}
nttResultBls12377Host := make(core.HostSlice[bls12377.ScalarField], size)
nttResultBls12377Host.CopyFromDeviceAsync(&nttResultBls12377, streamBls12377)
nttResultBls12377.FreeAsync(streamBls12377)
cr.SynchronizeStream(&streamBls12377)
println(time.Since(startTime).String())
}
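The root-of-unity plumbing above hinges on one detail: gnark-crypto exposes field elements as little-endian 64-bit limbs, while ICICLE's `FromLimbs` expects 32-bit limbs. A sketch of what a `ConvertUint64ArrToUint32Arr`-style helper has to do (the function below is illustrative, not the `core` package's actual implementation):

```go
package main

import "fmt"

// splitLimbs turns little-endian 64-bit limbs into little-endian 32-bit limbs:
// each input limb contributes its low 32-bit word first, then its high word.
func splitLimbs(in []uint64) []uint32 {
	out := make([]uint32, 0, 2*len(in))
	for _, limb := range in {
		out = append(out, uint32(limb), uint32(limb>>32))
	}
	return out
}

func main() {
	fmt.Printf("%#x\n", splitLimbs([]uint64{0x1122334455667788}))
}
```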


@@ -0,0 +1,49 @@
# ICICLE example: Polynomials in Golang
`ICICLE` provides Golang bindings to a CUDA-accelerated C++ implementation of [Polynomials](https://dev.ingonyama.com/icicle/polynomials/overview).
## Usage
### Backend Initialization
```go
InitPolyBackend()
```
### Construction
```go
poly1 := CreateFromCoeffecitients(/* Coefficients of polynomial */ coeffs)
poly2 := CreateFromROUEvaluations(/* evaluations */ evals)
poly3 := Clone(/* polynomial to clone */ poly1)
```
### Arithmetic
```go
polyAdd := poly1.Add(&poly2)
polySub := poly1.Subtract(&poly2)
polyMul := poly1.Multiply(&poly2)
polyMulScalar := poly1.MultiplyByScalar(scalar)
quotient, remainder := poly1.Divide(&poly2)
```
### Evaluation
```go
ev := poly1.Eval(scalar)
ev2 := poly1.EvalOnDomain(scalars)
```
In this example we use the `BN254` and `Babybear` fields. The example shows arithmetic operations and polynomial evaluation.
## What's in the example
1. Define the size of polynomials.
2. Initialize backends.
3. Generate random polynomials.
4. Execute arithmetic operations.
5. Execute evaluations.
6. Execute slicing.
Running the example:
```sh
go run main.go
```
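`Divide` above returns a quotient/remainder pair. For intuition, here is a CPU-only sketch of schoolbook polynomial division over a prime field (coefficients in ascending degree order, toy modulus 257 — an illustration, not the ICICLE backend):

```go
package main

import "fmt"

const p = 257 // toy prime field modulus

// powMod computes b^e mod p by square-and-multiply.
func powMod(b, e uint64) uint64 {
	r := uint64(1)
	b %= p
	for ; e > 0; e >>= 1 {
		if e&1 == 1 {
			r = r * b % p
		}
		b = b * b % p
	}
	return r
}

// polyDiv divides num by den over F_p (coefficients in ascending order),
// returning quotient q and remainder r with deg(r) < deg(den).
func polyDiv(num, den []uint64) (q, r []uint64) {
	r = append([]uint64(nil), num...)
	inv := powMod(den[len(den)-1], p-2) // leading-coefficient inverse (Fermat)
	q = make([]uint64, len(num)-len(den)+1)
	for i := len(num) - 1; i >= len(den)-1; i-- {
		c := r[i] * inv % p
		q[i-(len(den)-1)] = c
		for j, d := range den { // subtract c * den shifted to degree i
			idx := i - (len(den) - 1) + j
			r[idx] = (r[idx] + p - c*d%p) % p
		}
	}
	return q, r[:len(den)-1]
}

func main() {
	// (x^2 + 3x + 2) / (x + 1) = x + 2, remainder 0
	q, r := polyDiv([]uint64{2, 3, 1}, []uint64{1, 1})
	fmt.Println(q, r) // [2 1] [0]
}
```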


@@ -0,0 +1,114 @@
package main
import (
"flag"
"fmt"
bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
bn254Polynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/polynomial"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
babybear "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear"
babybearNtt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/ntt"
babybearPolynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/polynomial"
)
var maxNttLogSize uint
var polyLogSize uint
func initBn254Domain() core.IcicleError {
deviceCfg, _ := cr.GetDefaultDeviceContext()
rouMontBn254, _ := bn254Fft.Generator(uint64(1 << maxNttLogSize))
rouBn254 := rouMontBn254.Bits()
rouIcicleBn254 := bn254.ScalarField{}
limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
rouIcicleBn254.FromLimbs(limbsBn254)
return bn254Ntt.InitDomain(rouIcicleBn254, deviceCfg, false)
}
func initBabybearDomain() core.IcicleError {
deviceCfg, _ := cr.GetDefaultDeviceContext()
rouIcicle := babybear.ScalarField{}
rouIcicle.FromUint32(1461624142)
return babybearNtt.InitDomain(rouIcicle, deviceCfg, false)
}
func init() {
flag.UintVar(&maxNttLogSize, "maxNttLogSize", 20, "")
flag.UintVar(&polyLogSize, "polyLogSize", 15, "")
e := initBn254Domain()
if e.IcicleErrorCode != core.IcicleSuccess {
errorString := fmt.Sprint(
"Bn254 Domain initialization failed: ", e)
panic(errorString)
}
e = initBabybearDomain()
if e.IcicleErrorCode != core.IcicleSuccess {
errorString := fmt.Sprint(
"Babybear Domain initialization failed: ", e)
panic(errorString)
}
bn254Polynomial.InitPolyBackend()
babybearPolynomial.InitPolyBackend()
}
func main() {
polySize := 1 << polyLogSize
// randomize three polynomials over the bn254 scalar field
var fBn254 bn254Polynomial.DensePolynomial
var gBn254 bn254Polynomial.DensePolynomial
var hBn254 bn254Polynomial.DensePolynomial
fBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize))
gBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize / 2))
hBn254.CreateFromROUEvaluations(bn254.GenerateScalars(polySize / 4))
// randomize two polynomials over the babybear field
var fBabybear babybearPolynomial.DensePolynomial
var gBabybear babybearPolynomial.DensePolynomial
fBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize))
gBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize / 2))
// Arithmetic
t0 := fBn254.Add(&gBn254)
t1 := fBn254.Multiply(&hBn254)
q, r := t1.Divide(&t0)
rBabybear := fBabybear.Add(&gBabybear)
rDegree := r.Degree()
_ = rBabybear
_ = rDegree
// evaluate at a single domain point
var five bn254.ScalarField
five.FromUint32(5)
qAtFive := q.Eval(five)
var thirty bn254.ScalarField
thirty.FromUint32(30)
// evaluate on a domain. Note: the domain and image can each be either a Host or a Device slice.
// in this example the domain is on the host and the evaluations are on the device.
hostDomain := core.HostSliceFromElements([]bn254.ScalarField{five, thirty})
var deviceImage core.DeviceSlice
_, err := deviceImage.Malloc(five.Size()*hostDomain.Len(), five.Size())
if err != cr.CudaSuccess {
errorString := fmt.Sprint(
"deviceImage allocation failed: ", err)
panic(errorString)
}
t1.EvalOnDomain(hostDomain, deviceImage)
// slicing
o := hBn254.Odd()
e := hBn254.Even()
oddMult := o.MultiplyByScalar(qAtFive)
fold := e.Add(&oddMult) // e(x) + o(x)*scalar
coeff := fold.GetCoeff(2) // coeff of x^2
_ = coeff
}
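The slicing step at the end (`Odd()`, `Even()`, then `e.Add(&oddMult)`) is the classic FRI-style fold e(x) + scalar·o(x). A coefficient-level sketch over a toy prime (illustrative, not the ICICLE polynomial backend):

```go
package main

import "fmt"

const p = 257 // toy prime field modulus

// fold takes f's coefficients (ascending order), splits them into the
// even-index polynomial e and the odd-index polynomial o, and returns e + s*o.
func fold(coeffs []uint64, s uint64) []uint64 {
	out := make([]uint64, (len(coeffs)+1)/2)
	for i := range out {
		out[i] = coeffs[2*i] % p
		if 2*i+1 < len(coeffs) {
			out[i] = (out[i] + s%p*coeffs[2*i+1]) % p
		}
	}
	return out
}

func main() {
	// f(x) = 1 + 2x + 3x^2 + 4x^3: e = [1 3], o = [2 4], folded with s = 10
	fmt.Println(fold([]uint64{1, 2, 3, 4}, 10)) // [21 43]
}
```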

go.mod

@@ -1,4 +1,4 @@
module github.com/ingonyama-zk/icicle
module github.com/ingonyama-zk/icicle/v2
go 1.20


@@ -14,51 +14,42 @@ endfunction()
function(set_gpu_env)
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
if(DEFINED CUDA_ARCH) # user defined arch takes priority
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
else()
find_program(_nvidia_smi "nvidia-smi")
find_program(_nvidia_smi "nvidia-smi")
if(_nvidia_smi)
set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
if(_nvidia_smi)
execute_process(
COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Process the output to form the CUDA architectures string
string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")
# execute nvidia-smi -L to get a short list of GPUs available
exec_program(${_nvidia_smi_path} ARGS -L
OUTPUT_VARIABLE _nvidia_smi_out
RETURN_VALUE _nvidia_smi_ret)
set(CUDA_ARCHITECTURES "")
foreach(CAPABILITY ${GPU_COMPUTE_CAPABILITIES_LIST})
# Remove the dot in compute capability to match CMake format
string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
if(CUDA_ARCHITECTURES)
set(CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES};${CAPABILITY}")
else()
set(CUDA_ARCHITECTURES "${CAPABILITY}")
endif()
endforeach()
# process the stdout of nvidia-smi
if(_nvidia_smi_ret EQUAL 0)
# convert string with newlines to list of strings
string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
foreach(_line ${_nvidia_smi_out})
if(_line MATCHES "^GPU [0-9]+:")
math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
# the UUID is not very useful for the user, remove it
string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
if(NOT _gpu_info STREQUAL "")
list(APPEND DETECT_GPU_INFO "${_gpu_info}")
endif()
endif()
endforeach()
check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
message("Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
else()
# no GPUs found, like on Github CI runners
message("Setting CMAKE_CUDA_ARCHITECTURES to: 50")
set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
endif()
endif()
# ##
if(DETECT_GPU_COUNT GREATER 0)
set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) # do native
else()
# no GPUs found, like on Github CI runners
set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
endif()
endif()
# Check CUDA version and, if possible, enable multi-threaded compilation
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
message(STATUS "Using multi-threaded CUDA compilation.")
@@ -69,4 +60,4 @@ function(set_gpu_env)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()
endfunction()


@@ -1,5 +1,5 @@
function(check_field)
set(SUPPORTED_FIELDS babybear)
set(SUPPORTED_FIELDS babybear;stark252)
set(IS_FIELD_SUPPORTED FALSE)
set(I 1000)
@@ -14,4 +14,4 @@ function(check_field)
if (NOT IS_FIELD_SUPPORTED)
message( FATAL_ERROR "The value of FIELD variable: ${FIELD} is not one of the supported fields: ${SUPPORTED_FIELDS}" )
endif ()
endfunction()
endfunction()


@@ -12,16 +12,53 @@
#include "fields/stark_fields/babybear.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t babybear_extension_ntt_cuda(
const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);
extern "C" cudaError_t babybear_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const babybear::scalar_t* round_constants,
const babybear::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
extern "C" cudaError_t babybear_init_poseidon2_constants_cuda(
int width,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
extern "C" cudaError_t babybear_poseidon2_hash_cuda(
const babybear::scalar_t* input,
babybear::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<babybear::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
extern "C" cudaError_t babybear_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<babybear::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_mul_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
extern "C" cudaError_t babybear_add_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
extern "C" cudaError_t babybear_accumulate_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_sub_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
@@ -34,6 +71,12 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_bit_reverse_cuda(
const babybear::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
babybear::scalar_t* output);
extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
extern "C" cudaError_t babybear_scalar_convert_montgomery(


@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(
extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
bls12_377::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::affine_t* output_bases);
extern "C" cudaError_t bls12_377_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
extern "C" cudaError_t bls12_377_add_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
extern "C" cudaError_t bls12_377_accumulate_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bls12_377_sub_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -116,6 +113,12 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bls12_377_bit_reverse_cuda(
const bls12_377::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bls12_377::scalar_t* output);
extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_377_scalar_convert_montgomery(


@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(
extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
bls12_381::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::affine_t* output_bases);
extern "C" cudaError_t bls12_381_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
extern "C" cudaError_t bls12_381_add_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
extern "C" cudaError_t bls12_381_accumulate_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bls12_381_sub_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -116,6 +113,12 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bls12_381_bit_reverse_cuda(
const bls12_381::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bls12_381::scalar_t* output);
extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_381_scalar_convert_montgomery(


@@ -15,14 +15,12 @@
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
bn254::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::g2_affine_t* output_bases);
extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -30,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(
extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
bn254::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::affine_t* output_bases);
extern "C" cudaError_t bn254_msm_cuda(
@@ -71,6 +66,37 @@ extern "C" cudaError_t bn254_affine_convert_montgomery(
extern "C" cudaError_t bn254_projective_convert_montgomery(
bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_init_poseidon2_constants_cuda(
int width,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_poseidon2_hash_cuda(
const bn254::scalar_t* input,
bn254::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<bn254::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
extern "C" cudaError_t bn254_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<bn254::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
@@ -104,6 +130,9 @@ extern "C" cudaError_t bn254_mul_cuda(
extern "C" cudaError_t bn254_add_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
extern "C" cudaError_t bn254_accumulate_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bn254_sub_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -116,6 +145,12 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bn254_bit_reverse_cuda(
const bn254::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bn254::scalar_t* output);
extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
extern "C" cudaError_t bn254_scalar_convert_montgomery(


@@ -18,11 +18,8 @@
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::g2_affine_t* output_bases);
extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(
extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
bw6_761::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::affine_t* output_bases);
extern "C" cudaError_t bw6_761_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
extern "C" cudaError_t bw6_761_add_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
extern "C" cudaError_t bw6_761_accumulate_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bw6_761_sub_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -116,6 +113,12 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bw6_761_bit_reverse_cuda(
const bw6_761::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
bw6_761::scalar_t* output);
extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
extern "C" cudaError_t bw6_761_scalar_convert_montgomery(


@@ -17,11 +17,8 @@
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
grumpkin::affine_t* output_bases);
extern "C" cudaError_t grumpkin_msm_cuda(
@@ -74,6 +71,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
extern "C" cudaError_t grumpkin_add_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
extern "C" cudaError_t grumpkin_accumulate_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t grumpkin_sub_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -86,6 +86,12 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t grumpkin_bit_reverse_cuda(
const grumpkin::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
grumpkin::scalar_t* output);
extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
extern "C" cudaError_t grumpkin_scalar_convert_montgomery(


@@ -8,9 +8,9 @@
#include "hash/keccak/keccak.cuh"
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
#endif


@@ -0,0 +1,56 @@
// WARNING: This file is auto-generated by a script.
// Any changes made to this file may be overwritten.
// Please modify the code generation script instead.
// Path to the code generation script: scripts/gen_c_api.py
#pragma once
#ifndef STARK252_API_H
#define STARK252_API_H
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "fields/stark_fields/stark252.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t stark252_mul_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
extern "C" cudaError_t stark252_add_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
extern "C" cudaError_t stark252_accumulate_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t stark252_sub_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
extern "C" cudaError_t stark252_transpose_matrix_cuda(
const stark252::scalar_t* input,
uint32_t row_size,
uint32_t column_size,
stark252::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t stark252_bit_reverse_cuda(
const stark252::scalar_t* input,
uint64_t n,
vec_ops::BitReverseConfig& config,
stark252::scalar_t* output);
extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
extern "C" cudaError_t stark252_scalar_convert_montgomery(
stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t stark252_initialize_domain(
stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t stark252_ntt_cuda(
const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
#endif


@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
${CURVE}::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_msm_cuda(


@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
${CURVE}::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::g2_affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_g2_msm_cuda(


@@ -0,0 +1,30 @@
extern "C" cudaError_t ${FIELD}_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
extern "C" cudaError_t ${FIELD}_init_poseidon2_constants_cuda(
int width,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
extern "C" cudaError_t ${FIELD}_poseidon2_hash_cuda(
const ${FIELD}::scalar_t* input,
${FIELD}::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<${FIELD}::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
extern "C" cudaError_t ${FIELD}_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* constants,
device_context::DeviceContext& ctx);


@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
extern "C" cudaError_t ${FIELD}_add_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);
extern "C" cudaError_t ${FIELD}_accumulate_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t ${FIELD}_sub_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);


@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
extern "C" cudaError_t ${FIELD}_extension_add_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);
extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);


@@ -175,7 +175,7 @@ public:
UNROLL
#endif
for (int i = 0; i < SCALAR_FF::NBITS; i++) {
if (i > 0) { res = res + res; }
if (i > 0) { res = dbl(res); }
if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; }
}
return res;
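The loop above is MSB-first double-and-add: double the accumulator every iteration and add the point when the corresponding scalar bit is set. A hedged Go sketch over the additive toy group Z_p, where dbl(x) = 2x mod p (curve arithmetic swapped for modular arithmetic purely for readability):

```go
package main

import "fmt"

// scalarMul runs MSB-first double-and-add in the additive group Z_p:
// res = dbl(res) each step; res = res + point when the scalar bit is 1.
func scalarMul(scalar, point, p uint64, nbits int) uint64 {
	var res uint64
	for i := nbits - 1; i >= 0; i-- {
		res = res * 2 % p // res = dbl(res)
		if scalar>>uint(i)&1 == 1 {
			res = (res + point) % p // res = res + point
		}
	}
	return res
}

func main() {
	// double-and-add must agree with plain multiplication mod p
	fmt.Println(scalarMul(13, 5, 97, 8), 13*5%97) // 65 65
}
```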


@@ -125,6 +125,17 @@ public:
struct Wide {
ff_wide_storage limbs_storage;
static constexpr Wide HOST_DEVICE_INLINE from_field(const Field& xs)
{
Wide out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
{
Field out{};
@@ -990,6 +1001,17 @@ public:
}
return (u == one) ? b : c;
}
static constexpr HOST_DEVICE_INLINE Field pow(Field base, int exp)
{
Field res = one();
while (exp > 0) {
if (exp & 1) res = res * base;
base = base * base;
exp >>= 1;
}
return res;
}
};
template <class CONFIG>
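The added `Field::pow` is right-to-left binary exponentiation (square-and-multiply): square the base once per bit of the exponent, multiplying it into the result whenever the low bit is set. A standalone sketch of the same loop with 64-bit modular arithmetic in place of field multiplication (the modulus `p` here is an arbitrary stand-in, not a field from this library):

```cpp
#include <cstdint>

// Right-to-left square-and-multiply, same control flow as Field::pow.
// __uint128_t guards the 64x64-bit products against overflow.
uint64_t pow_mod(uint64_t base, int exp, uint64_t p) {
    uint64_t res = 1 % p;
    base %= p;
    while (exp > 0) {
        if (exp & 1) res = (uint64_t)((__uint128_t)res * base % p);  // res = res * base
        base = (uint64_t)((__uint128_t)base * base % p);             // base = base * base
        exp >>= 1;
    }
    return res;
}
```

This costs one squaring per exponent bit plus one multiplication per set bit, versus `exp - 1` multiplications for the naive product.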


@@ -30,6 +30,9 @@ namespace field_config = grumpkin;
#elif FIELD_ID == BABY_BEAR
#include "fields/stark_fields/babybear.cuh"
namespace field_config = babybear;
#elif FIELD_ID == STARK_252
#include "fields/stark_fields/stark252.cuh"
namespace field_config = stark252;
#endif
#endif


@@ -9,5 +9,6 @@
#define GRUMPKIN 5
#define BABY_BEAR 1001
#define STARK_252 1002
#endif


@@ -0,0 +1,631 @@
#pragma once
#include "fields/storage.cuh"
#include "fields/field.cuh"
// modulus = 3618502788666131213697322783095070105623107215331596699973092056135872020481 (2^251+17*2^192+1)
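The limb array below encodes this prime assuming little-endian 32-bit limbs, limb `i` holding bits `[32*i, 32*i + 32)`: then `2^251 + 17*2^192 + 1` puts bit 27 in limb 7 (since 27 + 32*7 = 251, i.e. `0x08000000`), the value 17 = `0x11` in limb 6 (the `2^192` term), and 1 in limb 0. A small check of that correspondence:

```cpp
#include <cstdint>

// The modulus limbs as stated in fp_config, least-significant limb first.
static const uint32_t kModulusLimbs[8] = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
                                          0x00000000, 0x00000000, 0x00000011, 0x08000000};

// Verifies the limbs spell out exactly 2^251 + 17*2^192 + 1 in the
// assumed little-endian 32-bit limb encoding.
bool limbs_match_stark_prime() {
    if (kModulusLimbs[0] != 1u) return false;            // the +1 term
    if (kModulusLimbs[6] != 0x11u) return false;         // 17 * 2^192
    if (kModulusLimbs[7] != (1u << 27)) return false;    // 2^251 = 2^(27 + 32*7)
    for (int i = 1; i < 6; i++)
        if (kModulusLimbs[i] != 0u) return false;
    return true;
}
```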
namespace stark252 {
struct fp_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned modulus_bit_count = 252;
static constexpr unsigned num_of_reductions = 1;
static constexpr unsigned omegas_count = 192;
static constexpr storage<limbs_count> modulus = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000011, 0x08000000};
static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000022, 0x10000000};
static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000044, 0x20000000};
static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffee, 0xf7ffffff};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000022, 0x10000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000121, 0x10000000, 0x00000001, 0x00400000};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000044, 0x20000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000242, 0x20000000, 0x00000002, 0x00800000};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000088, 0x40000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000484, 0x40000000, 0x00000004, 0x01000000};
static constexpr storage<limbs_count> m = {0x8c81fffb, 0x00000002, 0xfeccf000, 0xffffffff,
0x0000907f, 0x00000000, 0xffffffbc, 0x1fffffff};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xffffffe1, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xfffffdf0, 0x07ffffff};
static constexpr storage<limbs_count> montgomery_r_inv = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000121, 0x10000000, 0x00000001, 0x00400000};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000},
{0xf41337e3, 0x2a616626, 0xac8320da, 0xc5268e56, 0x4329f8c7, 0x53312066, 0x29a2995b, 0x06250239},
{0xee6feebb, 0x3ada5e1d, 0xe4412e87, 0x98c62155, 0x2f9c676e, 0xc90adb1e, 0x0de874d9, 0x063365fe},
{0x6021e539, 0x8337c45f, 0xbbf30245, 0xb0bdf467, 0x514425f3, 0x4537602d, 0x88826aba, 0x05ec467b},
{0x9b48a8ab, 0x2225638f, 0x1a8e7981, 0x26da375d, 0xce6246af, 0xfcdca219, 0x9ecd5c85, 0x0789ad45},
{0xb2703765, 0xd6871506, 0xf9e225ec, 0xd09bd064, 0x10826800, 0x5e869a07, 0xe82b2bb5, 0x0128f0fe},
{0xdd4af20f, 0xfdab65db, 0x56f9ddbc, 0xefa66822, 0x1b03a097, 0x587781ce, 0x9556f9b8, 0x000fcad1},
{0xff0cb347, 0x9f1bc8d7, 0xd0e87cd5, 0xc4d78992, 0xdd51a717, 0xbc7924d5, 0xfd121b58, 0x00c92ecb},
{0xc13a1d0b, 0xcc4074a0, 0xe3bc8e32, 0xa1f811a9, 0x6d4b9bd4, 0x0234b46e, 0x7880b4dc, 0x011d07d9},
{0xec89c4f1, 0xa206c054, 0xdc125289, 0x653d9e35, 0x711825f5, 0x72406af6, 0x46a03edd, 0x0659d839},
{0x0fa30710, 0x45391692, 0x11b54c6c, 0xd439f572, 0xa3492c1e, 0xed5ebbf4, 0xb5d9a6de, 0x010f4d91},
{0x7afd187f, 0x9273dbbc, 0x91ee171f, 0xdb5375bc, 0x6749ae3d, 0xc061f425, 0x6ec477cf, 0x003d14df},
{0x3112b02d, 0x8171e1da, 0xadf9bf78, 0x5c4564eb, 0x5689b232, 0x68c34184, 0x6538624f, 0x0363d70a},
{0x606082e1, 0x3e5a42f0, 0x76fc314a, 0x5edd09f0, 0x0f673d7c, 0xd650df25, 0x34832dba, 0x0393a32b},
{0x13a77460, 0xe3efc75d, 0x62ef8a01, 0x93898bc8, 0x8bdbd9b3, 0x1c3a6e5c, 0x611b7206, 0x034b5d5d},
{0x309d9da9, 0x80ee9837, 0xf51eddbc, 0x1646d633, 0x4901fab8, 0xb9d2cd85, 0x9978ee09, 0x01eb6d84},
{0x2755bfac, 0xa7b1f98c, 0xeb7aa1c1, 0x9ec8116c, 0x3109e611, 0x0eeadedd, 0xc9761a8a, 0x06a6f98d},
{0x9745a046, 0xce7b0a8b, 0xe411ee63, 0x7ff61841, 0x635f8799, 0x34f67453, 0xef852560, 0x04768803},
{0xbffaa9db, 0x1727fce0, 0xf973dc22, 0x858f5918, 0x223f6558, 0x3e277fa0, 0xf71614e3, 0x02d25658},
{0x8574e81f, 0xe3d47b99, 0x7fc4c648, 0xc727c9af, 0xee93dc85, 0x581d81ca, 0xca8a00d9, 0x0594beaf},
{0x0e5ffcb8, 0x00654744, 0xe7c1b2fd, 0x030530a6, 0xecbf157b, 0x27e46d76, 0xbeea04f1, 0x01f4c2bf},
{0x3e3a2f4b, 0xead33145, 0xd6482f17, 0xd841544d, 0x8d24a344, 0x9822fb10, 0x31eeac7c, 0x03e43835},
{0xb40bdbe8, 0x01af11c3, 0xb32a3b23, 0xd7c9c0a1, 0xcd0be360, 0x81cb2e43, 0xafb3df1a, 0x01054544},
{0x77156db2, 0xf6b13488, 0xddc0f211, 0x1ad6f3be, 0xd664f4da, 0xe643d3ea, 0x174a8e80, 0x071a47b8},
{0x4ca88ffc, 0xb86b03a4, 0x8ef9a25a, 0x6e3398e6, 0xf5fa4665, 0xce9a0d37, 0x5c437763, 0x06e8e769},
{0x4586dbc3, 0x32609f1d, 0xaa2da684, 0x03148f22, 0x4795d346, 0xa679e36b, 0x9e51225c, 0x03d8d2c7},
{0xea5f81cf, 0xeac5be9e, 0x64c12e72, 0x102e16b2, 0xfee282e4, 0xce0bc0d9, 0xa93b28f3, 0x01f05206},
{0xbb6422f9, 0x258e96d2, 0x617c5468, 0x751615d8, 0x6056f032, 0x27145cb6, 0x81c06d84, 0x057a7971},
{0xb030713c, 0xf42231bb, 0x3a96c59e, 0xae9c3f9a, 0xf1ee840c, 0x5397e8e2, 0xf2b87657, 0x05e7deca},
{0xf81f58b4, 0x209745aa, 0x91af248d, 0x74a64310, 0xc04b00b7, 0xe566a8e1, 0x80fb4cea, 0x022bde40},
{0x5de74517, 0x8265b62b, 0xb9b9f2c9, 0x6a788149, 0xa9565d98, 0x6fec2239, 0x573f0c28, 0x060ac0c4},
{0xd3ce8992, 0xc129d0f1, 0x81c43de5, 0x719252eb, 0x48221e1a, 0xfea566de, 0x0be8ced2, 0x050732ed},
{0x2216f1c8, 0x9aae0db3, 0xd7220015, 0x95e231ac, 0x6340df6f, 0xbd6ae160, 0x16a6e39c, 0x0166c8e2},
{0x76b0a92e, 0x3ccd9d2b, 0x7d671a9d, 0x1feb39d7, 0x2109fd56, 0x3c49a630, 0x5d4ec292, 0x07badc4b},
{0x5dd8c4c3, 0x081c3166, 0xec14ba21, 0x9dca12d8, 0xcf93b2e5, 0xf58069e2, 0x571ddc34, 0x02399005},
{0x08a616fc, 0x65a19cf4, 0x8aea6ff7, 0x860d442c, 0x6896a559, 0x4f24ab19, 0x3d7f5ae6, 0x0685db92},
{0x622478c4, 0x051093f0, 0x3fab8962, 0x5c200627, 0x21254c39, 0x2aa7ae1b, 0x7b116fb9, 0x0100fff9},
{0x00637050, 0x2693b834, 0x22440235, 0x3fef7c1b, 0x3481c4fe, 0x31150ac1, 0xf261b6de, 0x0772cb7a},
{0xd990d491, 0x6966804c, 0xc7505f35, 0x46aba1bc, 0xaceeb7f7, 0x4f696cba, 0x6474b8f0, 0x02b73cad},
{0xf39cd3e8, 0x7d13e948, 0x62a1db76, 0xd5c33593, 0x4d1be159, 0x7fd3b59b, 0x3676644e, 0x066d3f61},
{0xb3bd8b7e, 0x5a896ef3, 0xba5762ab, 0x2319450a, 0x1a545f8b, 0x226f0a07, 0x55446d35, 0x02760973},
{0x140e5623, 0x38eaa186, 0x94be15ba, 0x5a48d469, 0xad75d32a, 0xe4f1f15b, 0x2f14e2f1, 0x039ccdaa},
{0xe6fcfdb2, 0xad7108d3, 0x9c9f7f04, 0xfadfc050, 0x9df95366, 0xdbb20071, 0xe555c739, 0x02c4d3fa},
{0xc3111bcb, 0xb640956f, 0xbb11fb86, 0xcd942bbd, 0xa3db81cd, 0xa4b4eb09, 0x684fdb65, 0x041ed5ed},
{0xdd5ca525, 0x462b41fa, 0x153c3d28, 0xbcc17ccd, 0x6b06db5c, 0x8a81d137, 0x4a050358, 0x05f5cf39},
{0xcc60fb85, 0x374012a6, 0x34d1905d, 0x978f9785, 0x4e17ff38, 0x713383d4, 0x1055c25d, 0x07f3796f},
{0x0643771f, 0x852ba56e, 0x86781a31, 0xadfa956c, 0xb26a3811, 0x2ee2fccf, 0xdbd56ba7, 0x009214ce},
{0x68bc148c, 0xe2bf6c4b, 0x01c203ce, 0xd38dbf38, 0x97923b55, 0x27f73df4, 0x5081f7d9, 0x030a2e81},
{0xf11422a0, 0xbe23b78f, 0x99cdc2e0, 0xd4f3510d, 0xaa13ffe5, 0xcb05b3da, 0xc724e0c5, 0x028d98a5},
{0x96934000, 0x15277271, 0x588c8a51, 0x8013dd5e, 0x9ed55af8, 0x77772f7c, 0x03549e60, 0x020895f8},
{0x34db29f8, 0xc0cc8556, 0x67455b5d, 0x5582a9ff, 0x8a9a38b5, 0x12862a43, 0xa59fd242, 0x059655bc},
{0x94ceaf98, 0x39bc5131, 0xc71ccc0d, 0x99f4d1a0, 0x54acb87c, 0xc565794d, 0xc33590ef, 0x0593fcef},
{0xe97bf51c, 0xa2922d09, 0x3200d367, 0xdbb866a2, 0x4ad9302d, 0x05849ed8, 0xdf93f2b5, 0x000c447e},
{0x850fb317, 0x2755d6c2, 0xd45eb3f5, 0x36feeeea, 0xdfbc1d97, 0x4f4471d7, 0x4e3003f8, 0x07ec8926},
{0xb6a791f1, 0x38b8dc2a, 0x27a1bbb1, 0x79d6de48, 0xcad54cf2, 0x78c40b06, 0xa43bc898, 0x036dd150},
{0x1cc4133c, 0xefa72477, 0x477d39be, 0x5327d617, 0x2c5db3a4, 0xfd1de1f9, 0xc9a18a1c, 0x0147819b},
{0xf8133966, 0x275e6b02, 0x87969b48, 0x82bc79b9, 0x5d1e2f0e, 0x85b1f9bd, 0xc819531b, 0x00f9ea29},
{0x120edfab, 0x9e0392a5, 0xe3681a15, 0x07403ad4, 0x8a1c3817, 0xa8d469d8, 0x89f15c6f, 0x0395e7fc},
{0x641826ac, 0x7f405a9f, 0x6861e2ce, 0xa566e755, 0xba82a050, 0x8a3a08ba, 0xea63598d, 0x071dd923},
{0x5f65c188, 0x1d2b7538, 0xd6fc9625, 0xcb704d0f, 0xf59deccc, 0x18729111, 0x52fe1979, 0x07595020},
{0x8a08756f, 0x0175aa1c, 0x7fa7c6c4, 0x9a76a312, 0x6e93f6f3, 0x0bfa523a, 0x258c2f23, 0x03d70de4},
{0x8229376d, 0x8a0b9d02, 0x2c65c94e, 0x08421430, 0xd34b0aa6, 0x1160b441, 0xbbfb9491, 0x03b9eb75},
{0x827caf53, 0x91874856, 0x37e8a006, 0xdfdcae7a, 0x04e3af6b, 0x6dcfc3f2, 0xba66ff37, 0x0592823d},
{0x72fb8b0d, 0xb0a6628d, 0xa72b1f03, 0x7d3eef8b, 0x8dd54dbe, 0x5be965ba, 0x96d1fe4c, 0x0114a278},
{0x06051d55, 0x0256d8e6, 0xb9fa9dcc, 0xbf152353, 0x44140d6e, 0x6ef2c68c, 0xc9c0fea6, 0x015f291a},
{0xed992efc, 0xa1826724, 0x771da991, 0x9a58fd99, 0xd0b370a1, 0xce51a153, 0x826df846, 0x03c53bf5},
{0xcc7bf8c3, 0x3909aad7, 0xb08ddfa2, 0xd408ae7d, 0xff94d9fc, 0x2e9ab5d6, 0xf11cbcf6, 0x0020a1b2},
{0x3e257b43, 0x448fff07, 0x5fd9edca, 0x00f4a128, 0x7b429f71, 0x6f8987e3, 0x0fc8b522, 0x013336c1},
{0x062bd860, 0xef78ac4c, 0xf5d787d2, 0x6539ee52, 0xbb65576e, 0x113b6071, 0x9f3d7f85, 0x0160e952},
{0xf966d24e, 0x0c4e7c07, 0x318277e8, 0x011853d8, 0x7c287f58, 0x93bae650, 0xf64289f7, 0x00b974a1},
{0x30408cb9, 0x66d19420, 0x0430b017, 0x709ca6c6, 0x23d95951, 0xb174ad46, 0x111f4192, 0x030762f8},
{0xf246c901, 0xb9d70015, 0x57a1cdec, 0xd3616cb1, 0x0d732fdb, 0x61aab25e, 0x12d620d8, 0x0712858b},
{0x16334e1a, 0x8ec7e113, 0xa96aeeab, 0x0021a55b, 0xfd639175, 0x8f4c1366, 0x69bc866a, 0x07acdde9},
{0x23088fc7, 0x1fb24e5e, 0x92a88089, 0xcacd65df, 0x17343c48, 0x103ec3c8, 0xc387a3b5, 0x03d296b9},
{0xcd9fedee, 0xae703c5b, 0x7853b30d, 0xd0c3e0c6, 0x12abaef5, 0xc1e326b3, 0x5d57bb23, 0x04f42d7f},
{0x1824b92c, 0x19cd1b4e, 0x81ebc117, 0xc5daaff4, 0xb8183a1d, 0xeeedaa59, 0xe28baf8a, 0x069d8f0c},
{0x9dc50729, 0x9733e8df, 0xf1b9f411, 0xd7e0dbb9, 0x50edf7ea, 0x59e4dbd2, 0x4059cb5f, 0x002259fe},
{0xb79a92b1, 0x5e3197fc, 0x59086db1, 0xbfddf5c5, 0xdbea4a69, 0x234d8639, 0x4d0a367d, 0x05dd79b0},
{0xa86eec0c, 0x8cc1d845, 0x573b44d7, 0x3cac8839, 0x7b0de880, 0x8b8d8735, 0x68c99722, 0x01c5ef12},
{0xc2ba0f23, 0x12680395, 0x471f947e, 0xd43bcf85, 0xcc9d9b24, 0x19935b68, 0x108eec6a, 0x06263e1e},
{0x5b7be972, 0x29617bad, 0xc55b1b68, 0x0ab73eef, 0x2544381e, 0x07f12359, 0x63a080a0, 0x0161444d},
{0x312f9080, 0x07a4b921, 0x2f530413, 0x64c25a07, 0x7d71ca2f, 0x3f6903d7, 0x04838ba1, 0x06917cab},
{0x10bdb6cc, 0xec7cfc1f, 0x3bcf85c7, 0x7046910d, 0x7bc3ff5f, 0x7ef09e22, 0x385306d4, 0x004b0b60},
{0x3a41158a, 0x82d06d78, 0xaa690d1f, 0x37c4a361, 0x7117c44a, 0x700766e1, 0xab40d7e4, 0x031261d0},
{0x91b88258, 0x384c5e8b, 0x009b84dc, 0xd777abd5, 0xe7eed224, 0x02102b55, 0xdbefe5e9, 0x03b22830},
{0x8770a4be, 0xec982f60, 0x961f56ad, 0x4b92533d, 0xf428c4b9, 0x7df85fbb, 0x2d9291a4, 0x057e4876},
{0xf4910a60, 0x6ace9477, 0x9fc63b7f, 0xdb5a705f, 0x72328369, 0x4cc157b4, 0xc282db6f, 0x05b8acbc},
{0x57269216, 0x4c69edd9, 0xbfee24ac, 0xd04f1eeb, 0x2a069b18, 0xacda8418, 0x5990b523, 0x03761a4f},
{0xc608d246, 0x7f2e2048, 0x4664959b, 0xd4f52ed2, 0x11c1d565, 0x354e3bf7, 0x457eabd3, 0x0156d837},
{0xd455f483, 0xea8cbefd, 0x5d940684, 0x33cd5725, 0x8091a287, 0x2d89a777, 0x939b3ef3, 0x06159e4a},
{0x4fa405aa, 0xe43439f1, 0xdbe5763d, 0xa258cfc7, 0x78d7b607, 0x9491173a, 0x9ad23eac, 0x01775d66},
{0xd772d637, 0x2413e92c, 0x5eac4588, 0x22c99c9f, 0x71a0cdd2, 0xa2bd1d06, 0xfdd73a36, 0x05e88acb},
{0xb2bfa1ad, 0x68886b35, 0x35d2dfb6, 0x7a969b62, 0x9767a44a, 0x359ddb45, 0x52e5da6d, 0x00f1a46e},
{0x1c5a4861, 0x4ef9fe94, 0x1c841a89, 0x1540cf67, 0xa9bed4f5, 0x8b51336f, 0xf63c32ab, 0x0240fc41},
{0x87086e50, 0x7f5c626d, 0x049c46e2, 0x38ec0386, 0x0c597ea7, 0x30b003fd, 0x6660a912, 0x07a8faa1},
{0x7dac5d19, 0x2810d2b4, 0x80339f39, 0x040470c4, 0xc946ab30, 0x30d97769, 0x52667151, 0x019fa1f9},
{0x5e7c57a2, 0x00e13c8e, 0x2a0fb7bd, 0x95490ca0, 0x08451e35, 0x6af2b76d, 0xcf78c579, 0x04c3a3a1},
{0x55e39071, 0xa848b2f2, 0xf132ce21, 0x6831da1d, 0xe080e2ec, 0x439bdda4, 0xadd19a7d, 0x06680f09},
{0x6be27786, 0xfebd2a8b, 0x093a5a7f, 0x2cdd8f78, 0xdcb004b3, 0xbc0746a1, 0xd12450ed, 0x005f950a},
{0x39759f39, 0xe1462ca6, 0x7bbe087d, 0x0c37dca2, 0x0c8661cb, 0x198de347, 0x7e531b52, 0x03602655},
{0x66d7eb25, 0xaf24ead2, 0x5ee6eb03, 0x27cea560, 0x4f6267c7, 0xe9aa6d50, 0xe5dd28e0, 0x00c962b1},
{0xb11706c9, 0x3c3407a5, 0xcf0e1b88, 0x44370686, 0x9fbda5e3, 0x5d0e7af0, 0x41cf0a6b, 0x010d235f},
{0x358cfcc2, 0x1fbc42a3, 0xc78f7dac, 0x5a2e6ea2, 0xa12773f2, 0x33e089ca, 0xed7788c1, 0x04bef156},
{0xbea42f88, 0xdb150649, 0x5f3fb72a, 0x71329f69, 0x86b82de7, 0x7aa46ad0, 0xc6093912, 0x07913b17},
{0xb3b67067, 0xb2b074ae, 0xc55f4455, 0x4f17674d, 0xdeb0740d, 0x9a112816, 0x316cc0d3, 0x06bd0cde},
{0x1a264ab3, 0x962ceb6b, 0xd99f7159, 0xd5930255, 0x24a4096e, 0x7db961b0, 0x3e50dfed, 0x050c8e5c},
{0x443af109, 0xc3eebe54, 0x86946633, 0x2ca03fcb, 0x04badff6, 0x6e6eef04, 0x82210754, 0x05d92ab7},
{0xa5c0dca4, 0xcbadd8ad, 0x5ac103a0, 0x4cf688cf, 0x26e5d435, 0x571dbdb9, 0x220fc7db, 0x074ffc4d},
{0x88740c3e, 0x70b80432, 0x03821aa8, 0x4a959d50, 0xe4df06d8, 0x3eb8c3a0, 0xcac57496, 0x025a425b},
{0x55205413, 0xdcadfd29, 0x90b17b01, 0xda7456d2, 0x73696a28, 0x437c2fda, 0x329f6855, 0x00a8a188},
{0xa828431e, 0x3cde2cdd, 0x9ed29340, 0x60e6c362, 0x7c13e145, 0xef00dfa9, 0xba288c0b, 0x04159bec},
{0x9065f8ee, 0x41d351cd, 0xa4845868, 0x4e2e298f, 0xbdb3834a, 0xbcba6ac1, 0xea85f2ec, 0x042c8871},
{0x1fda880f, 0xc4dc0d20, 0x26fc2d5c, 0x4f0f9dc4, 0x86839de7, 0x2c555343, 0xf698dd8f, 0x04d12da8},
{0x21bd655a, 0x3a6299bd, 0x8cfd772f, 0x2e4aea22, 0xd2c2590d, 0x09716ad9, 0xb298587d, 0x053b143c},
{0xa95e3cbf, 0xd35f3e32, 0x04eac3cf, 0xe380dee7, 0x0f7e3e6b, 0x27e6570a, 0xbed46774, 0x008cd288},
{0x9583f023, 0xe42676b0, 0x75cfaa7e, 0x39d57dd6, 0x4f0bb727, 0x10d4a8d0, 0x27c81bdd, 0x016b03c9},
{0x4decc603, 0x89b394f7, 0xd24690f4, 0xd7322ee9, 0x947a00fd, 0xbbc12961, 0x82e8fa75, 0x00886d23},
{0xeb0faad4, 0x7b48a33b, 0x60e0b0c8, 0x4c11ef26, 0x36f0f791, 0x4163a401, 0xa4074faf, 0x07986fea},
{0x31d9587e, 0x96044919, 0x9049fd2d, 0xb1cab341, 0x9c0eea09, 0xf28c83c9, 0x5c6620aa, 0x033b74dd},
{0x13ee028c, 0xde558d16, 0x5d4233b0, 0x4dcf3932, 0x2e422803, 0x7bd46887, 0xe1261bff, 0x04b4757d},
{0xd48e9b00, 0x6c80848f, 0x10b6a121, 0x937c1e6e, 0xe9f2008c, 0x7782f8b8, 0x2bc7171c, 0x00217358},
{0x324228d8, 0xba523265, 0x682ee17c, 0x4ebe5506, 0x3be009f9, 0x6c646fe8, 0x8594b924, 0x046de7bc},
{0x3b50645a, 0x270aa33a, 0x2a9c6282, 0x28fd23fd, 0xcfe96515, 0x5b2fa771, 0x3f812377, 0x063039de},
{0xaba4060a, 0xa1da52b0, 0x0374be67, 0x7f191fd6, 0x0d7d2126, 0x14c64d05, 0xf7f77381, 0x00419cb7},
{0xe4b19319, 0x07eda692, 0x0fef654e, 0x6190d3f6, 0x0b21ca7e, 0x893b0916, 0x073c48b4, 0x0367a3c7},
{0xc520e3ea, 0x8fd405b2, 0x487e93c9, 0x73b4f714, 0xd5142cff, 0x70b7ee88, 0xa320eca2, 0x058fb800},
{0x72ef3623, 0x3b5a8740, 0xaff370fd, 0xbff4af42, 0xe338258e, 0x64c137b0, 0xc7afafca, 0x05ac9917},
{0x82ccc89a, 0x99c46a0d, 0x9ff87868, 0x05ae3209, 0xa489481f, 0x6249b2a4, 0xbaead348, 0x0056c235},
{0xba0ea95e, 0x5a0640f3, 0xc03af976, 0x518db5cd, 0x5a250a06, 0x1c3223aa, 0xbc3442eb, 0x0397b942},
{0xacf14a4f, 0x164f0705, 0x33eb6c0e, 0x386c2325, 0xd7264573, 0xdfaceff6, 0xd1e22f80, 0x00e94509},
{0x9ff51bc7, 0x8964ee48, 0x57bbca04, 0x3e0f5037, 0x6510630c, 0xe78d6c8d, 0xdf0a61c1, 0x041d6351},
{0x45aa1b58, 0x47892f3b, 0x915c1c70, 0x5a1787ba, 0x67f20d25, 0xbaa23359, 0x0c4bc4be, 0x00e1919f},
{0xb9975332, 0x2a87c37a, 0xcdecebc9, 0x95db523f, 0x1d0db226, 0x703949ee, 0x4c3842dd, 0x03152c1d},
{0xecfb6f72, 0x0eff7e6a, 0x9493a628, 0xb3a83455, 0xd596cd51, 0xced58dd1, 0x25ee51ff, 0x033dee78},
{0x72a30547, 0x1f4047ca, 0xd40b6d0f, 0x9feefa06, 0x94db1b38, 0x836ffd80, 0xa0992ed5, 0x037c79f6},
{0xceb3dffd, 0x7ffa095d, 0x768e2cb3, 0x23097a65, 0x373f6222, 0xd228b1f9, 0xc57feea2, 0x06309a6b},
{0xecd4c6f7, 0x7a5bead4, 0x7e70f7de, 0xab92043c, 0x220db8d8, 0xf78f890e, 0x2865a07e, 0x052eeb98},
{0xdf253531, 0x8e9a6336, 0xbafa937b, 0xb24b664a, 0x303b1f5a, 0xc89f660e, 0x876bd8c7, 0x07ea9749},
{0x1d4c3fec, 0xd958e726, 0x06fbef31, 0xa5eb368f, 0xba6a027d, 0x0c911679, 0x5f80f992, 0x06321b51},
{0x046b49b2, 0x3ca61d9e, 0x6aa9c29a, 0x616a47d6, 0x9e9462dc, 0x27a7ffeb, 0x8971b70e, 0x0794ed38},
{0x9f47496f, 0xdb259a57, 0xa6b0481c, 0x7f3e3f90, 0x4afab47a, 0x76f42726, 0xc5a79505, 0x07b9da96},
{0x57e7aeed, 0x908e6450, 0x81648127, 0xe86db2fb, 0x8dd76882, 0x53f3c573, 0x72327da6, 0x02b37324},
{0x73a220ec, 0x82a941c9, 0x7f25beea, 0xb4cbecb7, 0xbfb061d6, 0x746ded71, 0x641b3f3d, 0x00f7af27},
{0xcbd4ba67, 0x69b8f4df, 0x3d526981, 0x5ee3ac6f, 0x145cef8c, 0x9372af4e, 0x72a31ef1, 0x05cc1cc6},
{0x62d1ba57, 0xce898b0d, 0xee3fa47e, 0x86ba0504, 0x4395b70d, 0xc68233b1, 0x80eb8d60, 0x024cfa58},
{0x74d51c41, 0x8fa83850, 0x60f8f9da, 0x5824a285, 0xaf1bea48, 0xa7a2067e, 0x5455acc3, 0x04ba49f2},
{0x324c6039, 0x0a1e223e, 0x7b18a9d0, 0x28312228, 0x88b6ecda, 0xb60c1f93, 0x687ba365, 0x053097d8},
{0xa7dae551, 0x5604b398, 0xe2e11609, 0x51f02e33, 0xe58e2094, 0x0b51a085, 0x3a3ecc28, 0x078679d6},
{0x92d52444, 0xe24b5528, 0x33d0fa70, 0xf77e35ad, 0x9bcbfb57, 0x8af5a7b7, 0x022748d2, 0x015c5f15},
{0xc993b168, 0xc002185c, 0x293ad856, 0x5586addb, 0x8ec50726, 0x69c1bfcf, 0x5fd97ea1, 0x00d514fc},
{0x8866c747, 0x52d7a9a2, 0x01d6ee05, 0x9bd77465, 0xc3a87a88, 0x576adf96, 0xfa69f0ec, 0x0693e89a},
{0x05903be3, 0xcfe50d90, 0xcf739179, 0xbe651dd1, 0x2ae70678, 0xba80ffda, 0xb55b06cc, 0x051dbe40},
{0x5585a6f0, 0x4adb5947, 0x9fa37e68, 0x14634b99, 0xa2a910a8, 0x27da5fbf, 0xa99c704d, 0x022a91ce},
{0xe2ddaacd, 0xfabab7b8, 0x60cf9603, 0x1edf6a83, 0xbfadddd3, 0x20b04218, 0xa81dbffa, 0x03e0ddb6},
{0xda25c9fd, 0xf9c1e3a3, 0xac57ece3, 0x41ff4e1e, 0xdd684055, 0x9ba50868, 0x46d8156a, 0x01b30314},
{0xab76a462, 0x30e067cc, 0x08f1b99b, 0x2d84c4c2, 0x73edc56f, 0x6b399ae0, 0x62cfacb2, 0x02f187e1},
{0x34fc5356, 0xb085758e, 0xf805fedf, 0xbafe9a1c, 0x95272d01, 0x0bcf423c, 0x1feca651, 0x01df4a81},
{0x4c264e97, 0xd3bd9833, 0xc08b1798, 0xc0b192be, 0xdc3ed49e, 0x42724e80, 0xbaee9a58, 0x04100303},
{0xe49749c9, 0xb653c919, 0x09f8e2fc, 0x07dbe557, 0xca71e551, 0xbb172d28, 0x7989c8fd, 0x07f5f801},
{0xdf1d9004, 0x9412a9f3, 0xbe90d67e, 0xddcf6d66, 0x4692f803, 0x1dbfd679, 0x524c2944, 0x04f4fae1},
{0x5707d134, 0xd413afdf, 0x887fd7e9, 0xf8a339cf, 0x84883580, 0xf74544f4, 0x851739e0, 0x0554f72a},
{0x59824907, 0xe3827564, 0x421182c9, 0x352eab2a, 0x8f8530f2, 0x19138257, 0x20275950, 0x04e3bf44},
{0x33f928b7, 0xef7660f9, 0xf5952362, 0xb7cb0619, 0xf17eb8d7, 0x5b24913b, 0x8e8b8082, 0x00f4804c},
{0x5bd84f3e, 0xe7020613, 0x736a1659, 0x7ee777e1, 0x0795844b, 0x34ca7cb6, 0x7503ddc3, 0x07ce12e4},
{0x6d8408a5, 0xbbbafb3f, 0x519dadca, 0xe0f02915, 0x0670f5d4, 0x5acba199, 0x4a93340f, 0x0056db45},
{0xe404f6c5, 0x73f8a435, 0x01731858, 0x68cd3f7a, 0xd01f3de9, 0x214d3134, 0xd5d75a88, 0x05fb76be},
{0xf976eb41, 0x3a66ad86, 0xcd08787a, 0x6401b6d3, 0x7d1e82a8, 0x575950f3, 0x55ee9d49, 0x00e34b33},
{0x0cc5cbf4, 0xbff2f4e6, 0xec205dcd, 0x5a6b430d, 0xc94862af, 0xa8114ab3, 0x2fe8be1f, 0x0247ecf5},
{0x8b98bf40, 0xded3bc57, 0xe26b66b3, 0xb658c8c4, 0x8d4220db, 0x8bd91c55, 0x94d2adea, 0x00d109f2},
{0xedeaec42, 0x0fbfd336, 0x5d407ae8, 0xd94f928d, 0x727e74b5, 0xe5e4a16b, 0xc8c22dd8, 0x06a550df},
{0x135e0ee9, 0xe378a012, 0x856a1aef, 0x5be86512, 0xd8febe77, 0x7de04ce2, 0xea43d59b, 0x03ddeed6},
{0x005a1d86, 0xc04dc48c, 0x6f29053d, 0x64f4bbd2, 0x9be0aef5, 0x10b1b3db, 0xcc625a0b, 0x03745ca5},
{0x1f4f0e85, 0x6c72bd40, 0xc2069cba, 0x4234afd0, 0xb99395f4, 0xc25b262f, 0xae0874e2, 0x0605f6a2},
{0xdd756b6d, 0x9513e0d4, 0xf0c137cd, 0x5127a167, 0x7f01c538, 0x1a12a425, 0x00a4483b, 0x068b3aaf},
{0x79bc6c86, 0x7a5b3e70, 0x375dc240, 0x5a337909, 0xe111d6ce, 0x46d6fe3c, 0x2ff2ca50, 0x02708b05},
{0x1524ad8c, 0x1181eb95, 0x52294490, 0xd0744ddc, 0x848605cf, 0x88ed5b7b, 0xb478c12a, 0x04b9cb49},
{0x27105dae, 0x98cb2411, 0xed5c1361, 0x3efa8fae, 0xd498e337, 0x6fa736a5, 0x1e369b4f, 0x038e3b07},
{0x98c8db7f, 0xbc5915ae, 0x50425ae8, 0x1f3c8f96, 0xfa86658a, 0x77d60416, 0x28ec2dda, 0x02bc8b30},
{0xb94bc10e, 0xad6794f2, 0x7e80093a, 0x7463b3f3, 0x90db4c79, 0x7bf5af53, 0x965c0cc4, 0x031531c6},
{0x7cc1083d, 0x66425289, 0xa45d785f, 0x778ba471, 0xbbc94c16, 0xe3f5c599, 0x9b92e036, 0x02606413},
{0xcf287faf, 0x191a2ea9, 0x823ddf07, 0xe6406a78, 0xaabe912b, 0xabcf2825, 0x7c48649a, 0x021dab44},
{0x65375f6c, 0x9465d77c, 0x65370520, 0x924e189c, 0x918f0105, 0x8be0ca5f, 0xb1925509, 0x07586d27},
{0x9302ac44, 0xe4fa93cb, 0xbf87d840, 0xf381ebbd, 0x44793049, 0x5027e7d9, 0xd3f09392, 0x0230b5c3},
{0x31d48a82, 0x123e992e, 0x729d40e2, 0xef2990c6, 0x0f331903, 0x946813e3, 0x112a2c4d, 0x022f575e},
{0xd4ee8cf7, 0x4b44764e, 0xdb576ebc, 0x4d44cff8, 0x0ab93ba1, 0xc6185d3a, 0x7e3f1e78, 0x0520c2d3},
{0xbc46b8b4, 0xd9446736, 0x91e2ede1, 0xc7776293, 0x87689930, 0x0323845f, 0x379293ae, 0x061e359f},
{0xb49b3a0a, 0x767a1747, 0x2b58f45e, 0x17e69346, 0x1425ad98, 0x10820519, 0x1b487ae5, 0x0367f384},
{0x92f8ac25, 0xe0407696, 0x2beb71a6, 0x9ca9d269, 0x2f0c2471, 0x914017ea, 0xf421a10d, 0x07709cc3},
{0xc3bb6a8f, 0x2c8ed622, 0xa2a1a8f2, 0x31c57cb6, 0x4bf6c316, 0x053924d5, 0x09563089, 0x0727b76a},
{0x09dc6b5c, 0x567be37f, 0x9476eb5d, 0x57e36f45, 0xee5be5b6, 0xf68488dd, 0x2884c2d7, 0x05ac1ff1},
{0x04173760, 0x0fc5b934, 0xda828f00, 0xe43272df, 0x2fad6e9c, 0x7e2ab5fe, 0x0a4995b3, 0x00e0a5eb},
{0x42f8ef94, 0x6070024f, 0xe11a6161, 0xad187148, 0x9c8b0fa5, 0x3f046451, 0x87529cfa, 0x005282db}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000},
{0x0becc81e, 0xd59e99d9, 0x537cdf25, 0x3ad971a9, 0xbcd60738, 0xaccedf99, 0xd65d66b5, 0x01dafdc6},
{0x4bc9ca34, 0xc8e6df6f, 0x5397aaca, 0xab8bfbc5, 0x94813e6e, 0xb5ea6773, 0xe295dda2, 0x0446ed3c},
{0x8145aa75, 0xd7981c5b, 0x3d174c52, 0xb14011ea, 0xe4721c1e, 0x647c9ba3, 0x6f6ac6dd, 0x05c3ed0c},
{0x6e0bef41, 0x9de8c5cf, 0xcee1b9b0, 0xec349cbb, 0x2121589c, 0xfe72ab05, 0x24c7669c, 0x03b1c96a},
{0x246766d8, 0xb878549e, 0xb5a03ab4, 0x8c5d8531, 0x7f1ec75e, 0x334a83ab, 0x46b146d7, 0x01342b29},
{0x31055652, 0x8c71bd50, 0x6081f8c3, 0x2eedac49, 0xab013740, 0x25164a76, 0xbca84bf7, 0x05c0a717},
{0xd0a6b4f5, 0x1ad37af3, 0x8ca50294, 0x6dc49fe3, 0x5d9529c3, 0x8357a7ff, 0xcefe8efe, 0x02c161bc},
{0x296fbf1c, 0x90a5fa7f, 0xc977b113, 0x18226a39, 0xc178262e, 0x9362d5c9, 0x40d28de5, 0x03a362d3},
{0x125ca33a, 0x04eeb1c0, 0x8437c604, 0xaa47a4c0, 0xa4d6bafe, 0x064426a2, 0xb8cc76db, 0x00ffbb44},
{0x179e2ebe, 0xecf0daf8, 0x2574403b, 0x942e643e, 0x6bf06f7c, 0x684d31aa, 0x244c675c, 0x003b2bde},
{0xfeccfccc, 0x96bc19dc, 0x269130b4, 0xbb26f74e, 0xd511649f, 0x15d57a9f, 0x7dcde3c3, 0x02d852a4},
{0x44ad0610, 0xb4a47f4c, 0x06fa1b55, 0xdc2f028f, 0xd25979ac, 0xd73ddcd4, 0x076e7f5d, 0x06ba7cbe},
{0x349eea63, 0xb0f43dd2, 0x3e64660d, 0x5e64466c, 0xc3bb94ce, 0x7206f426, 0xed4327aa, 0x036cb7c6},
{0xf248b36c, 0x6503e80b, 0xe36060ec, 0xb93dd56f, 0x95c2c067, 0x6d3b2763, 0x155023a7, 0x038e7d59},
{0xcdf92351, 0x140437ad, 0x2a5ab630, 0xb7a6e1b4, 0xd48175a5, 0xaa80b742, 0xd4afae89, 0x06a50046},
{0xaea51997, 0xe8cde2cd, 0x417e3754, 0x612806f6, 0xb940adf4, 0xe40a4a07, 0xa33929b2, 0x063f5efa},
{0x0c07573f, 0x0c0926df, 0xd8d4bee3, 0xa84e9027, 0x6bcd79ea, 0xf3776dfa, 0x523f55a8, 0x043a8517},
{0x66984d05, 0x5b7e4e45, 0xdb8c30c4, 0xb9381de7, 0xae86e4f6, 0xd7c15128, 0x809daae7, 0x0718f1ad},
{0xc1eae1a6, 0xe4fb0a7d, 0xa90a0813, 0xe5484134, 0x895df525, 0x24cca8f9, 0x1cedd2ee, 0x035fd390},
{0x82e87775, 0x0a87a942, 0x971f450b, 0x9f2b4b62, 0x8eae6f09, 0x1dc5aecd, 0x1c5686a6, 0x07547fa3},
{0x2e35511a, 0x785975cc, 0xa085c456, 0x4266bc82, 0x3abd5bfd, 0x45cf52e1, 0x7bd95ece, 0x019e8e43},
{0xae580194, 0xfad72a75, 0x2989ac16, 0xf2bb5a00, 0x55f2b4d0, 0x53fee728, 0x9c7a91e5, 0x02b9f95d},
{0x71200963, 0xb0062d2c, 0x1ac57a23, 0xe16e9f91, 0xc4bd9d3e, 0xaae7b169, 0x7f505f35, 0x07462151},
{0x57e31913, 0xcf7bd10e, 0x6a4d0ee4, 0x1a360a91, 0x31869e35, 0xb2ba4914, 0x18005db4, 0x07a62d5c},
{0xb4344711, 0x431f11e2, 0x6192c47e, 0x0cc3049c, 0xeb9c1bc3, 0x375dff93, 0x42071ee8, 0x03a75790},
{0x9ed81498, 0x4eb14251, 0x98b804ef, 0x5852dbc5, 0x56d7f20c, 0xe0c1be13, 0x20d69181, 0x023e7f68},
{0xe34f2d55, 0xf2eeb9b5, 0x2aad6f84, 0x63459f16, 0xbe37dbea, 0xf12099e7, 0x11b1a0fd, 0x06e45493},
{0x0d6c93ed, 0x63032f6a, 0x5a04829f, 0xd99cbcc8, 0x89608b5e, 0x80f20416, 0x9df329f4, 0x00bf4231},
{0x2710f927, 0xc7fc3d1b, 0x90d8503e, 0xc72d19af, 0x9940e689, 0xa9dcd3b8, 0x2da77ac9, 0x06fd386e},
{0x08b27bc2, 0xc800035f, 0x4dfacc03, 0xd98987cf, 0x1256e525, 0x24f8fdbf, 0x1f104273, 0x04c575f1},
{0x256c604a, 0x68b16e90, 0x6eba097d, 0x7f51023a, 0x1aeba9c8, 0x52c7629c, 0x4809d8da, 0x0575e850},
{0x4ac81249, 0x7439d2f9, 0x4fc31ff2, 0x351e4a62, 0xb3906ded, 0x68fb8313, 0x08507a35, 0x007d43d8},
{0x98859a12, 0xa87902b8, 0x73af55b3, 0x2f0d13e0, 0x1b9783c2, 0x5a46c66a, 0x2f5f71d4, 0x01045b06},
{0x604fce1e, 0x0c379595, 0x7fccc2b4, 0x20ab6eb8, 0xf1820ae7, 0xac0bc709, 0x93fb2b07, 0x07e7654f},
{0x246c4bf0, 0xa0e40811, 0x816b15e0, 0xe12accf5, 0x17938138, 0xee417239, 0x2c9a34fb, 0x004e092e},
{0xad2cd984, 0x6304351b, 0x4bf1aafc, 0x38546ca6, 0xf310e99f, 0x1fb81192, 0xb5376275, 0x07e89896},
{0x7b2d141d, 0xe4376a0b, 0x6dac220c, 0xea1795e5, 0xb19e1901, 0xd778ab50, 0xa94c274f, 0x077df905},
{0x16fcd6c7, 0x7039bab1, 0xa6ea1c94, 0x8eececb7, 0x0f122046, 0x84d26ab5, 0x22fd55a1, 0x053c5d48},
{0x72f11f65, 0xd43eb7bb, 0xb2a566d6, 0xfb538785, 0x3f35cbf5, 0xccc2cdc6, 0x7112504a, 0x06df5a9e},
{0x60ce9c30, 0x75efb55c, 0x3c541437, 0x991873ed, 0xdf0cbb3b, 0x37eaedcb, 0xb04c2858, 0x0278d7f0},
{0x1a06866b, 0x5757dd4e, 0x6570fa7f, 0x15c176b1, 0xafe89a1d, 0x9981b57f, 0xee0cb14c, 0x03c57f4d},
{0x503c31cd, 0x3438cd66, 0xc0736d4b, 0x34437e52, 0x2a9d1b28, 0xe825b769, 0x73c06ee7, 0x06955a3a},
{0x5c5e530e, 0xbbf0995a, 0x6569a2f9, 0xdee304b3, 0x5bd1a886, 0x3b9c993c, 0xc9cd050a, 0x00f66017},
{0xee755737, 0x3666e752, 0x74d0e317, 0xa13bfafc, 0x01d2f1bf, 0x17ab672a, 0x0778f525, 0x079dde3a},
{0xed8a25e9, 0x96a003c2, 0x8f347cec, 0x45d258fe, 0x96ea14ac, 0x68ff148d, 0xe148eda9, 0x058f4ec7},
{0xe2a700ab, 0x23baf732, 0x5202a945, 0x6434725a, 0x2e693363, 0xa19a338d, 0xbf2f39c6, 0x01d0ea7a},
{0x3ab52589, 0x5e571cad, 0x92240361, 0xe2916bb2, 0xdff5e354, 0xe6f8897b, 0x2ffa4707, 0x02a62880},
{0xef649a85, 0xaf446c62, 0xed4e461f, 0x14d8072f, 0x59993efa, 0x5a07f4e5, 0x72a3a652, 0x00dc28b6},
{0xf21511df, 0x139299d7, 0x4854ebc3, 0x8914e707, 0xbfd102a9, 0x9f3b5913, 0x3a5af894, 0x009dc24f},
{0x1f4ba4fa, 0x650e1d91, 0x1977bff0, 0x6ba67806, 0xaa9bbc1b, 0xffbdc531, 0x997408aa, 0x057b69b2},
{0x65fb1a91, 0x25c03e81, 0x7fd22618, 0x8682f98b, 0xf46cb453, 0xcad67f13, 0x5a80e5c6, 0x060ca599},
{0x94188f2a, 0xa7978a90, 0xdbb9338e, 0xd5fc8f0b, 0xcbdd84f0, 0xf8387e6d, 0xbbc743a3, 0x073ae131},
{0x0415bbcc, 0xafd00c46, 0x0df4a52a, 0x1a00eb6c, 0x0b96b594, 0x1ec67c64, 0x8e26b699, 0x01cb82a5},
{0x7f740f93, 0xf56319fb, 0x2e2f6ed7, 0xb40d559b, 0x75e19784, 0x63f96f04, 0xc31ba061, 0x06406929},
{0xfa5a3239, 0x22349e8b, 0xb9ca6bf9, 0xe1236395, 0x9b0017a4, 0x76ae5a8b, 0x17b7af03, 0x06cfb4ce},
{0xb51abfe6, 0x34938785, 0x1249edb6, 0x21f54c80, 0xab038972, 0x3bd1cc16, 0xa4a57a81, 0x0636b37f},
{0xf88717cf, 0xfda4a9a1, 0xee19d402, 0xf8fcba35, 0x47c9ba1b, 0x1ac940f6, 0xdd991440, 0x013c0ab3},
{0x3743adf4, 0x5082318a, 0x22440f94, 0x3293bae1, 0x8dd2d761, 0x4c2e6d7f, 0xcdc38c82, 0x07124118},
{0x76198779, 0xb031f8b7, 0x1b6c1944, 0x6742f602, 0x894a6134, 0xa18290db, 0xaba037dc, 0x035289d8},
{0x9f8a9b07, 0x4579e855, 0x4dca3764, 0x1e580662, 0xb8c8ef49, 0xda92152e, 0x8b54508a, 0x0444085a},
{0x34696648, 0x7f670ce1, 0xc05768d9, 0x2f00108f, 0x390fb519, 0x2d00a444, 0x1cd6f914, 0x015c468b},
{0xfe46c5f2, 0x00666cbf, 0x9f7174d6, 0xca4051c5, 0x8e4277f4, 0x1629882a, 0x6ee002a3, 0x00b3f261},
{0xc1dbb4f6, 0x418a2b86, 0x9a6ca270, 0x9f453ccc, 0x1d457b20, 0x1966471f, 0x80fd1319, 0x00b4d831},
{0x1c76c8b1, 0xa12f86a8, 0xc0125e48, 0x2772e424, 0x1459dfb8, 0x8d650644, 0xad06d01c, 0x02128e5c},
{0x3472799c, 0xcc8cc7f6, 0x2f511cae, 0xfbd97f95, 0x5ebbff71, 0xadd8818b, 0x09af0983, 0x00520540},
{0x8ec654cc, 0xcaab5dd4, 0x17ba15a9, 0xc05ad0a7, 0x36300a00, 0x4bda7469, 0x41bb0610, 0x02e486cd},
{0x2d6be8b5, 0x077ba983, 0xfe89eb7d, 0xdd5e728f, 0x63f9c51f, 0xe3c872fb, 0xce639995, 0x01f2f7a8},
{0xaa2ea7eb, 0xd82b1599, 0xa16489e0, 0x1be5d254, 0x173d3219, 0x19cb236a, 0x1fe63b23, 0x007dd45f},
{0x19dba628, 0xa27cc4d3, 0x5fd2e061, 0xf04ac441, 0x9307a758, 0xc7405333, 0x28c40fe4, 0x0103c707},
{0x54662aab, 0xb5129fd1, 0x59158f32, 0x2ec5b69b, 0x12c44eec, 0x6c7e6492, 0xe527abb2, 0x046e7c11},
{0xe32d46fe, 0xb9bf4936, 0xb08ef006, 0xf23ae18c, 0xe6a5179e, 0x5352cc59, 0x5bf7c0b8, 0x0753a621},
{0x9318db3a, 0x19f65bc2, 0x7e3d0014, 0x93ff3f79, 0x6beb580d, 0xf7f93c7f, 0xddd72603, 0x04fdb898},
{0xe184a935, 0xf7e1f88f, 0x1ad510f0, 0x82a0f047, 0x4c9ab6ca, 0xce0f7c44, 0x5104a95a, 0x0552304e},
{0x985bba5c, 0x06615580, 0xf487a1fb, 0x8ccd29a8, 0xeecf758d, 0xb3e15ed0, 0x857ce648, 0x05328783},
{0x6cb042b0, 0x5d1d5a22, 0x0277083c, 0x64375cf4, 0x5fa82215, 0xe8947dab, 0x86932495, 0x05e72829},
{0x8c3e2849, 0x5bf6f46a, 0x4924c8f4, 0x7e40314c, 0xdffd6118, 0x3c74a4ba, 0x2f8de20a, 0x05247cdd},
{0xd0042d11, 0x25a418c5, 0x2f7da60c, 0x1b60ee9f, 0x02c0b69f, 0x61c041ad, 0x15670214, 0x0632d33a},
{0x90e05a92, 0x32b03a5e, 0x78d1e8d6, 0xfb12a1b1, 0x5bc2f5d5, 0xb8af534e, 0xa032918a, 0x05ab4772},
{0x0a711a9d, 0x096878a8, 0x6b083c8c, 0x87d070da, 0x87d06afb, 0x77931578, 0xf3104057, 0x03705277},
{0xdf993e46, 0x502d2374, 0x35baf646, 0xc1cd2868, 0xe30aa213, 0xa61b54b6, 0xbce34b74, 0x02511017},
{0x90a6b9b9, 0xcfb6c51a, 0x8be6ade8, 0x4e0b29ef, 0xd3832d74, 0xa8292467, 0x41ca1e45, 0x02ce7977},
{0x3e672d5b, 0x25ee10aa, 0x28597504, 0xb0e60c63, 0xe263c827, 0x4a8d0567, 0xfadefeba, 0x01f4ec42},
{0xa5a26158, 0x8b4b15e0, 0x88a71cf2, 0xa59b2df9, 0x5d734341, 0xde44f2e7, 0x4db8d2e8, 0x007a18a0},
{0xb4d18100, 0x30fcf001, 0xf8ae0b4f, 0xcdaa5334, 0xe325615a, 0x67017b2b, 0xf0ccbf57, 0x016c6d47},
{0xba937732, 0x66afc115, 0xc20be386, 0x917d4890, 0xa017c59d, 0x5dadccff, 0x986c39c1, 0x043fa44e},
{0x08baa72a, 0xc57ec886, 0x052364ed, 0xe65a4680, 0x85f9a523, 0x0536b505, 0xfe744ee2, 0x03580609},
{0x1bab1ab8, 0x88109415, 0x62f0fa74, 0x02244b19, 0x915618e0, 0x837fcd10, 0x942f12d2, 0x061b83d0},
{0x687b7798, 0x823d0bba, 0x84a49784, 0x5f93174a, 0x2574af37, 0xcfd64159, 0xe108057c, 0x0290722e},
{0x58a66036, 0x900a7031, 0x6153c2ae, 0xcb443378, 0xa6ccdffe, 0x4c48b8dd, 0xa06e955a, 0x049a9211},
{0xea0b9dd9, 0x1b034532, 0x638c79ec, 0x11cba08f, 0x7c5b2d15, 0x16d00728, 0xbb9a759c, 0x05abcbcd},
{0x1552d6af, 0x21b4f60e, 0xbed54865, 0x2f7ea9d2, 0x738befdb, 0x39378802, 0x97845360, 0x02adf76c},
{0x4026bb92, 0x6e5eb2ca, 0xcbed5570, 0x18f3d8bf, 0xb655ac26, 0x2a5fc8cd, 0x3809a1c5, 0x0031cd25},
{0x0ef5e011, 0x2d698950, 0xc018b82d, 0xc0668c45, 0xf520d325, 0xd180ff47, 0xa38122b1, 0x046714c7},
{0x12df2cc7, 0x8dec8a4b, 0x963031f8, 0x5eb84a1b, 0x88525708, 0xb75ad701, 0x07df57bd, 0x02054a99},
{0x82b2f616, 0xe0013d43, 0x7b385914, 0x2ad34c97, 0x11108f4b, 0xc9969223, 0x9c9fad59, 0x0183f639},
{0x06b4dc38, 0xaca9dfbc, 0x962d5774, 0x85596bbc, 0x22f1cd7d, 0xd7023923, 0x2067b180, 0x04d3c939},
{0xe4004173, 0x6d13e6ab, 0xaafe8726, 0x3495d095, 0x33dc3303, 0xa22d3e4a, 0x776d2e14, 0x0276dbb2},
{0x68c539b6, 0xa03f83cb, 0x7b42a06e, 0xfd3fa839, 0xe8d45ac3, 0xea0f1f15, 0xa414b012, 0x061adb94},
{0xb33fb188, 0xd22fc6e3, 0xf723dc18, 0xbebc7978, 0xf6c99f34, 0xa874b584, 0xf67ff454, 0x049beb53},
{0x754bed16, 0x7c247948, 0xe50eac10, 0x4a84bcfb, 0xade97580, 0xc00d65df, 0xca79c5ae, 0x0763d73c},
{0x7aadbe1a, 0x696e27af, 0x9d8e2a1f, 0x113535e0, 0x4c011766, 0x6953003f, 0xbb52558c, 0x0498a75f},
{0x6e09cee7, 0xcf26e897, 0x299b63c7, 0x813a76f2, 0x0939904c, 0x67c02fa7, 0x7e0b9483, 0x045c41a9},
{0x4af5adcc, 0xad979914, 0xc2c7c068, 0x7d9267f9, 0x21b4a0a7, 0xda4fa3f8, 0x3386c423, 0x03f4bcc9},
{0xd1228595, 0xe5fcd634, 0x12fc8b7c, 0x5571b994, 0x244857f8, 0xd50dcd33, 0x263b93f0, 0x060dc1d6},
{0xfee59c89, 0x7040a236, 0x78ceb168, 0x91a4301b, 0x19cdb36a, 0x973b55bd, 0x71008400, 0x06a1c58e},
{0x6af1f351, 0x1d3c7ad7, 0xe8ad24dc, 0x8493c0c1, 0x48d5ffd9, 0x076f9dea, 0x5931555f, 0x00b9b2bf},
{0xeaa5731c, 0xa3d54d89, 0xba84ee02, 0xfcc41a45, 0xcc1cdac8, 0x7c828f73, 0x5bfe9d23, 0x009c426b},
{0x3f1f352c, 0x36fb314c, 0x9feb1120, 0x750a2a5f, 0xd7b06171, 0x3a2f19e8, 0x3b550cd9, 0x06de1885},
{0xb69183f6, 0xefc03237, 0x979ee075, 0xb5a14fc3, 0x2dcb1d51, 0xbf114125, 0xb8eca2d3, 0x062364f7},
{0x95375861, 0x575f1ea7, 0x80cc8dba, 0x30608586, 0xcf7a8f9f, 0x2beca9f5, 0x5fe60da4, 0x00dfc078},
{0x0f86ded5, 0x312928eb, 0xb9c4f0cc, 0x646f5d3e, 0x2fbf14dd, 0x23c69382, 0xc44caa0e, 0x023aae90},
{0x13e16243, 0xa7c92faf, 0x92efd5fc, 0x035a3e75, 0x86a744ea, 0x32f44d08, 0x1ea28333, 0x05b45217},
{0xc41fdf22, 0xb557d203, 0x4bbc8f76, 0x9697570c, 0x81eaf742, 0x3a6a2cb5, 0xb0d03a0f, 0x07f2c08a},
{0x2a18b73a, 0xca806385, 0xdb6a953d, 0xf2015d6d, 0xba5f67b9, 0x51d21a8e, 0x14807dd6, 0x051439d5},
{0xf75051de, 0x7b6e0c13, 0x14dd1aa0, 0x114681fb, 0x0fd95a37, 0x72a1cccc, 0xa39e5bb8, 0x02f29d4c},
{0x116529cd, 0x4808a0de, 0x5b941d1c, 0x1cf38580, 0xd70796f7, 0xc96a451e, 0x3f24e64f, 0x016d083f},
{0x3cf155ee, 0xc71b78d0, 0x0c361b67, 0x0c04a134, 0x7756e4a9, 0xdb546edc, 0x2988eb2c, 0x03474404},
{0xf30cef17, 0x1a0b3585, 0x864abd80, 0x63c1de29, 0xc0687c8e, 0x0c171d6e, 0xc9763a97, 0x0353aec8},
{0x94192fb8, 0x0a2c9cff, 0x1a7f5bbf, 0x27320b93, 0xe5ceeb75, 0x465d2f9f, 0xd78f1cc3, 0x07ce6f99},
{0xe8d1b26d, 0x0f899233, 0xb87a2984, 0xed4b44d2, 0x0bd6354a, 0x0c0712c6, 0xc7032f5c, 0x01eb2a31},
{0x46b03b57, 0xc4c03fbd, 0x785ebbe8, 0x989b0ff3, 0x7f0bcb19, 0x5cada62a, 0xa97557c9, 0x01426410},
{0x96fb0a26, 0xf1d2e82b, 0x1edb9ce3, 0xe270bc10, 0xfc7aaed8, 0x9549cfd0, 0xd90d7c9c, 0x03e8256c},
{0x43ac9984, 0x14eef0ee, 0xa16d6770, 0x2903ff22, 0xa38fbfc0, 0xc66c2690, 0x8755440e, 0x0032a202},
{0xf3601782, 0x46a07cf2, 0xaa71d137, 0x79f410f9, 0x8bcabc59, 0xc320c6f1, 0xf8ab64d8, 0x00a706cf},
{0x8dbd8d4f, 0x8848a9f0, 0x0085061d, 0xeff89e69, 0xfee62fbe, 0x90e634a7, 0x2ffb456b, 0x03983046},
{0xb272ed5c, 0x91ec28a8, 0xdc0cbb77, 0xf8529918, 0x3648d2c5, 0x8f896ddb, 0x74edaf19, 0x0668a86c},
{0x128c9bd9, 0x341d5fc8, 0x6b3241c5, 0x592f87d8, 0xb2cc3c97, 0xf8cba6f2, 0x03f396ed, 0x03463bf1},
{0xafd9d239, 0xcf3ae525, 0xea20b753, 0x06b8b7b9, 0x3408a993, 0xb2be1e49, 0x9f47063f, 0x02bcb200},
{0xa0bd0bc8, 0x7ca02722, 0xb862774d, 0xce8b32ee, 0x5f8da059, 0x424ba5f0, 0x3bb422a0, 0x05c81961},
{0x32fd8907, 0x137dad8c, 0xc95a3a5d, 0x301d5119, 0x8937ac08, 0x144b38c3, 0x39338de7, 0x00e66f0e},
{0xcfc10885, 0xe68b8875, 0x96147e68, 0x4f24d49a, 0x43032c15, 0x5da9e6fd, 0x9bf25e12, 0x061ab0e6},
{0x455c65ad, 0xeab29bbd, 0x2448be64, 0x1c7da0e7, 0x8eedfa1f, 0x8c2c1bcd, 0x698c1197, 0x0400e2d2},
{0x04549c13, 0x335d3e9e, 0xd31585cc, 0x546f0d82, 0xe16dbbac, 0x350d5ed5, 0x113c53fd, 0x05f77544},
{0x7d8f3b7e, 0x6aa75c04, 0x10a641ae, 0xc70851dd, 0x9a0750fe, 0x4d33edd4, 0xcd1b230f, 0x022802cf},
{0xef8170e3, 0x59fa1903, 0x62995788, 0x464a73ef, 0x13369717, 0x338be7fd, 0x52d21278, 0x02e97589},
{0x4856ddd5, 0x3f2deca8, 0xfced10e2, 0x969b10e2, 0x52860ee7, 0x09620dde, 0xb620fa3f, 0x04a169bf},
{0xa03b49f1, 0xd9beb712, 0xe9af606e, 0x0798af09, 0x63e70b9a, 0xe37f9aea, 0xb35abd7c, 0x02542a44},
{0xf6e78973, 0x335d4000, 0x76f1bb23, 0x7bc28fde, 0x1b30e9ca, 0x6cfdc907, 0x0400b651, 0x03ff88aa},
{0x36433eaf, 0xfb862981, 0x4111cfa3, 0x15fdc659, 0xeab2909d, 0x569574b9, 0x3cd80f84, 0x01442360},
{0xe85c4af3, 0xa8ed8f31, 0xe6aaf3da, 0xf7680fee, 0xc5c1772c, 0x2240e931, 0xaebeeb70, 0x04f44f6f},
{0x8846e0af, 0x29de323f, 0x42c25319, 0x33f91593, 0x6cbadd58, 0x863099c1, 0xfd83e5b3, 0x06a603cf},
{0x86c77703, 0x1bdd17f3, 0xe02db671, 0x8cee8e78, 0x0b6dffce, 0xed1627af, 0xa0d9b3cc, 0x04491984},
{0xcb583661, 0x177f8f9c, 0x73d05bfc, 0x54122d0c, 0xebe37b4a, 0xa9231660, 0xd4826038, 0x06e885db},
{0x13c253b9, 0x64cde875, 0x2fbc98a9, 0x8484bccb, 0x4885a9af, 0xbad877c5, 0x0cbc33b6, 0x03007c90},
{0x47cfa357, 0x41eb9173, 0x325309ad, 0xb3f06289, 0xaa85421b, 0x029da7c1, 0x84de4bd4, 0x07b7eb0d},
{0x56b831e2, 0x2c459a80, 0x321aba19, 0x2b99d098, 0xea73c0e1, 0x96237364, 0xe25ed0ed, 0x02f2c638},
{0x9b388bf4, 0xfc8c3228, 0x82cd081d, 0xa4c371e4, 0xc85f75df, 0x11239026, 0x8892896e, 0x01f01c5e},
{0x73457917, 0xce1dde59, 0x16dd8b49, 0xdfdaeb19, 0xbfd17b1e, 0x4289a976, 0xc842870a, 0x05e2cf7e},
{0xc7705532, 0x72faa825, 0x8f7fe8c2, 0xd24bf942, 0xb695e31b, 0xb7403e13, 0xfc85a0c6, 0x02eac9e7},
{0x1ddb2dff, 0xc47638e3, 0x799bb649, 0x78b91a13, 0x552588ed, 0x001800de, 0x9cd9425c, 0x01d0640c},
{0xfb431e10, 0x159891e7, 0xa012b461, 0x2f2fb29a, 0xb3333e5d, 0xc1dca804, 0x9a47200d, 0x05b918ec},
{0x2d5ce760, 0x379119b5, 0xda2ccdab, 0xf9911f75, 0x47b5c054, 0x92b09490, 0x7298d065, 0x0742a31e},
{0x4a73d1f1, 0xe2a1046b, 0xc6ab4d9c, 0xbc85a747, 0xba0701f8, 0x79b0e699, 0xeebc6762, 0x05e5c2cb},
{0xe0c0db50, 0xdc644b37, 0x2b8444d2, 0x26f7f083, 0x63479a84, 0x90acf2e7, 0x90ffe372, 0x0590d880},
{0x83c0fc9c, 0x3dd1aba4, 0xcfb43020, 0x30a1051f, 0xaf5be716, 0x7d1ca380, 0x1ed8aed9, 0x01d56947},
{0x0fa23690, 0x657df8c4, 0x32111be3, 0x61a12fe4, 0xe78236c9, 0xd6cc9942, 0x85e66191, 0x01709635},
{0xc6a054f0, 0x96bf35ed, 0x004113cc, 0x9d1e411a, 0x1ac7a3ec, 0xccdb9bc3, 0xd08016b8, 0x07362425},
{0x9721b035, 0x72744cce, 0x0beb72e3, 0xb87eb606, 0x60870c2e, 0x00c5e70c, 0x685d7c14, 0x029fa4d3},
{0x86e52af4, 0x06d3a7a3, 0x70020878, 0x7b1c814a, 0x52e68007, 0x44373cb7, 0xe403540f, 0x041cf8c0},
{0x76a27949, 0xd5dbc8bf, 0x27d9cd12, 0xb41449bc, 0xa7a667a1, 0x93740020, 0x0fbb4e77, 0x000bf807},
{0x9969cfe9, 0x274ce281, 0x259ec27c, 0x3234d283, 0xe0b44f04, 0x9ff85b71, 0xffcc1006, 0x0298d060},
{0x68ab54f8, 0x5cd8b289, 0x437eaab8, 0x42e3877f, 0x9318bd3e, 0x6490dc61, 0x4e54d968, 0x075b01f3},
{0x7b64243c, 0x73100d65, 0x5c802f82, 0x692378be, 0x88184c0c, 0x00283dbb, 0xab6f4f0e, 0x0442efad},
{0x72015722, 0xbe83b708, 0xe1cdcf0e, 0x2035319f, 0x398347da, 0x2b1b3351, 0x1a14b8dc, 0x061823d8},
{0x378d9803, 0x1090948c, 0x4725c64b, 0x61a558cc, 0x7d7fcd91, 0x9e5bd3b5, 0x57ebda25, 0x061e02a0},
{0xf8324dc8, 0x166b4a3c, 0x38133fda, 0xa25b9d11, 0x917171a5, 0x9d602950, 0x417d104e, 0x0632e48b},
{0x6a61d5e0, 0x03b9f1b9, 0xe59cfbb7, 0xd906b740, 0x7892fbe4, 0x99a93267, 0xad1b8171, 0x06ddc2a6},
{0x67fc3874, 0x6ae4355d, 0xb1ada695, 0x4fa456d8, 0x9f91ac43, 0x4e234065, 0x829d173e, 0x028da309},
{0xfc695c2c, 0x1e08dd18, 0xfa687112, 0x1c0a2fad, 0xffd6302a, 0xeb5ebf01, 0xfd1d10f5, 0x012fd387},
{0x236e65c9, 0x0b907f2e, 0xb1281d54, 0x92ba7a15, 0xc13f1d75, 0x07f0a6ad, 0xcd6d1e9c, 0x05dfe4e3},
{0xc45f33f8, 0xd99cc41a, 0xd373165c, 0xc1c10a71, 0x2ce2936a, 0x6c809230, 0xa0498cf5, 0x018dc832},
{0x7b222ad8, 0x8e881eab, 0xb6194efb, 0xc8b48774, 0x963c6b6b, 0x38452dfd, 0xe4c4e0f8, 0x02847f5a},
{0x2bf4ad95, 0x2950bb4a, 0xdc39ffb0, 0x37f42c9b, 0x101253a8, 0x3814fa42, 0xb67f2ca5, 0x04d4a34c},
{0xa9684ba0, 0x6c40fece, 0x3b13bca4, 0xc7108aad, 0xe7bff9be, 0x98ccc7ea, 0xe9b3b316, 0x048b3a6a},
{0x08390a2b, 0x4d908260, 0x74b070bc, 0xd5a641d0, 0x910015c5, 0xc3b19274, 0xd5a998a7, 0x02ac8e74},
{0x9698d605, 0x8de03acc, 0xa4c9137f, 0x3b8b720c, 0x354faf46, 0x5bbad6e4, 0xfd9e842f, 0x0054c120},
{0xd65aead5, 0x305fa33f, 0x0fe296f9, 0xba02b164, 0x708efc94, 0x64cba43c, 0x8ad7f0ef, 0x034b9ffe},
{0x13c2e8f4, 0x59e1179e, 0xc572f8a8, 0x5d823d59, 0x74003bce, 0x0cfdb6ee, 0x011c179e, 0x00763941},
{0xa47999a8, 0x29b692ee, 0xbfcd80d8, 0x6436c3f1, 0x959768d7, 0x553444f3, 0x583896d4, 0x01d45a26},
{0xc150b3f8, 0x0ce0791d, 0xf493c135, 0x7d3a0c1f, 0x5ede0712, 0x4d37cc23, 0x34fbae9c, 0x036a6a38},
{0x2ca1eb78, 0xa8ee8204, 0x66d8b759, 0xc713a1dc, 0xac061800, 0x1813508d, 0x3b1f0da2, 0x05725ca0},
{0xf2f391c1, 0xbe6826df, 0x232878f0, 0xeb85b046, 0xf7e1d662, 0xf5a96510, 0xe38c2b64, 0x0419a43b},
{0xe69e791b, 0x4b54889b, 0xb5c95ea5, 0xb371eeb0, 0x0b2f26a3, 0x9f53ccca, 0x66f45f71, 0x0040592d},
{0xad2e5d5b, 0x4ced12db, 0x0987b849, 0x5f57b16d, 0xd9ec045b, 0xcab0e2e9, 0x6cfbf4df, 0x03e4e405},
{0x3ecb72a4, 0xd71a1eee, 0x03a13fb7, 0x6bd9f7ec, 0x5877c6c7, 0xb74a54c8, 0xa28236a5, 0x0377689b},
{0x74b3354c, 0x6f558a20, 0x3f776b18, 0xb67f6d10, 0x01165ed8, 0x8c447df2, 0xf3889308, 0x056b8991},
{0x0d306b7a, 0x9482eb10, 0xd441cd03, 0xdd738e0f, 0x2de5dfd7, 0x6d186de5, 0x75fd1833, 0x00781b3e},
{0x77ec28e5, 0xdbc14748, 0xd26e050c, 0x02ceee41, 0x18457c96, 0x8e5aef74, 0x1823c60f, 0x0461a6e2},
{0x2be17c8b, 0x172e551d, 0x49c6a7b8, 0x90e25fa2, 0xa1b3478f, 0x6219e63e, 0xd063a517, 0x00c412f8},
{0x65a9b68e, 0xb136b848, 0x673c6cbc, 0x9a9b7169, 0xf8ec7473, 0x15fa1875, 0x3033a5d6, 0x022d72f6}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000, 0x00000008, 0x04000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xc0000000, 0x0000000c, 0x06000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xe0000000, 0x0000000e, 0x07000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xf0000000, 0x0000000f, 0x07800000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x78000000, 0x00000010, 0x07c00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xbc000000, 0x00000010, 0x07e00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xde000000, 0x00000010, 0x07f00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xef000000, 0x00000010, 0x07f80000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xf7800000, 0x00000010, 0x07fc0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfbc00000, 0x00000010, 0x07fe0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfde00000, 0x00000010, 0x07ff0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfef00000, 0x00000010, 0x07ff8000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff780000, 0x00000010, 0x07ffc000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffbc0000, 0x00000010, 0x07ffe000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffde0000, 0x00000010, 0x07fff000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffef0000, 0x00000010, 0x07fff800},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff78000, 0x00000010, 0x07fffc00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffbc000, 0x00000010, 0x07fffe00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffde000, 0x00000010, 0x07ffff00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffef000, 0x00000010, 0x07ffff80},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffff7800, 0x00000010, 0x07ffffc0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffbc00, 0x00000010, 0x07ffffe0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffde00, 0x00000010, 0x07fffff0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffef00, 0x00000010, 0x07fffff8},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffff780, 0x00000010, 0x07fffffc},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffbc0, 0x00000010, 0x07fffffe},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffde0, 0x00000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffef0, 0x80000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffff78, 0xc0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffbc, 0xe0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffde, 0xf0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffef, 0xf8000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x80000000, 0xfffffff7, 0xfc000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xc0000000, 0xfffffffb, 0xfe000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xe0000000, 0xfffffffd, 0xff000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xf0000000, 0xfffffffe, 0xff800010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x78000000, 0xffffffff, 0xffc00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xbc000000, 0xffffffff, 0xffe00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xde000000, 0xffffffff, 0xfff00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xef000000, 0xffffffff, 0xfff80010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xf7800000, 0xffffffff, 0xfffc0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffe0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfde00000, 0xffffffff, 0xffff0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfef00000, 0xffffffff, 0xffff8010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xff780000, 0xffffffff, 0xffffc010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffe010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffde0000, 0xffffffff, 0xfffff010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff810, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffc10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffe10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffde000, 0xffffffff, 0xffffff10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff90, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffd0, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffbc00, 0xffffffff, 0xfffffff0, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffde00, 0xffffffff, 0x00000000, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffef00, 0xffffffff, 0x00000008, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffff780, 0xffffffff, 0x0000000c, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffbc0, 0xffffffff, 0x0000000e, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffde0, 0xffffffff, 0x0000000f, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffef0, 0x7fffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffff78, 0xbfffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffbc, 0xdfffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffde, 0xefffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffef, 0xf7ffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x80000000, 0xfffffff7, 0xfbffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xc0000000, 0xfffffffb, 0xfdffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xe0000000, 0xfffffffd, 0xfeffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xf0000000, 0xfffffffe, 0xff7fffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x78000000, 0xffffffff, 0xffbfffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xbc000000, 0xffffffff, 0xffdfffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xde000000, 0xffffffff, 0xffefffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xef000000, 0xffffffff, 0xfff7ffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xf7800000, 0xffffffff, 0xfffbffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffdffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfde00000, 0xffffffff, 0xfffeffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfef00000, 0xffffffff, 0xffff7fff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xff780000, 0xffffffff, 0xffffbfff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffdfff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffde0000, 0xffffffff, 0xffffefff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff7ff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffbff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffdff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffde000, 0xffffffff, 0xfffffeff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff7f, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffbf, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffbc00, 0xffffffff, 0xffffffdf, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffde00, 0xffffffff, 0xffffffef, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffef00, 0xffffffff, 0xfffffff7, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffff780, 0xffffffff, 0xfffffffb, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffde0, 0xffffffff, 0xfffffffe, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffef0, 0x7fffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffff78, 0xbfffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffbc, 0xdfffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffde, 0xefffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffef, 0xf7ffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x80000000, 0xfffffff7, 0xfbffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xc0000000, 0xfffffffb, 0xfdffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xe0000000, 0xfffffffd, 0xfeffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xf0000000, 0xfffffffe, 0xff7fffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x78000000, 0xffffffff, 0xffbfffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xbc000000, 0xffffffff, 0xffdfffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xde000000, 0xffffffff, 0xffefffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xef000000, 0xffffffff, 0xfff7ffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xf7800000, 0xffffffff, 0xfffbffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffdffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfde00000, 0xffffffff, 0xfffeffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfef00000, 0xffffffff, 0xffff7fff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xff780000, 0xffffffff, 0xffffbfff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffdfff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffde0000, 0xffffffff, 0xffffefff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff7ff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffbff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffdff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffde000, 0xffffffff, 0xfffffeff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff7f, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffbf, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffbc00, 0xffffffff, 0xffffffdf, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffde00, 0xffffffff, 0xffffffef, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffef00, 0xffffffff, 0xfffffff7, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffff780, 0xffffffff, 0xfffffffb, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffde0, 0xffffffff, 0xfffffffe, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffef0, 0x7fffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffff78, 0xbfffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffbc, 0xdfffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffde, 0xefffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffef, 0xf7ffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x80000000, 0xfffffff7, 0xfbffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xc0000000, 0xfffffffb, 0xfdffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xe0000000, 0xfffffffd, 0xfeffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xf0000000, 0xfffffffe, 0xff7fffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x78000000, 0xffffffff, 0xffbfffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xbc000000, 0xffffffff, 0xffdfffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xde000000, 0xffffffff, 0xffefffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xef000000, 0xffffffff, 0xfff7ffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xf7800000, 0xffffffff, 0xfffbffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfbc00000, 0xffffffff, 0xfffdffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfde00000, 0xffffffff, 0xfffeffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfef00000, 0xffffffff, 0xffff7fff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xff780000, 0xffffffff, 0xffffbfff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffbc0000, 0xffffffff, 0xffffdfff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffde0000, 0xffffffff, 0xffffefff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffef0000, 0xffffffff, 0xfffff7ff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfff78000, 0xffffffff, 0xfffffbff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffbc000, 0xffffffff, 0xfffffdff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffde000, 0xffffffff, 0xfffffeff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffef000, 0xffffffff, 0xffffff7f, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffff7800, 0xffffffff, 0xffffffbf, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffbc00, 0xffffffff, 0xffffffdf, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffde00, 0xffffffff, 0xffffffef, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffef00, 0xffffffff, 0xfffffff7, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffff780, 0xffffffff, 0xfffffffb, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffde0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffef0, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffff78, 0xbfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffbc, 0xdfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffde, 0xefffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffef, 0xf7ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x80000001, 0xfffffff7, 0xfbffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xc0000001, 0xfffffffb, 0xfdffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xe0000001, 0xfffffffd, 0xfeffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xf0000001, 0xfffffffe, 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x78000001, 0xffffffff, 0xffbfffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xbc000001, 0xffffffff, 0xffdfffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xde000001, 0xffffffff, 0xffefffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xef000001, 0xffffffff, 0xfff7ffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xf7800001, 0xffffffff, 0xfffbffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfbc00001, 0xffffffff, 0xfffdffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfde00001, 0xffffffff, 0xfffeffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfef00001, 0xffffffff, 0xffff7fff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xff780001, 0xffffffff, 0xffffbfff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffbc0001, 0xffffffff, 0xffffdfff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffde0001, 0xffffffff, 0xffffefff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffef0001, 0xffffffff, 0xfffff7ff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfff78001, 0xffffffff, 0xfffffbff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffbc001, 0xffffffff, 0xfffffdff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffde001, 0xffffffff, 0xfffffeff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffef001, 0xffffffff, 0xffffff7f, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffff7801, 0xffffffff, 0xffffffbf, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffbc01, 0xffffffff, 0xffffffdf, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffde01, 0xffffffff, 0xffffffef, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffef01, 0xffffffff, 0xfffffff7, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffff781, 0xffffffff, 0xfffffffb, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffbc1, 0xffffffff, 0xfffffffd, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffde1, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffef1, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffff79, 0xbfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffffbd, 0xdfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffffdf, 0xefffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffff0, 0xf7ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000}}};
};
/**
 * Scalar field. Always a prime field.
 */
typedef Field<fp_config> scalar_t;
} // namespace stark252

@@ -50,7 +50,7 @@ namespace keccak {
*/
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config);
} // namespace keccak
#endif

@@ -43,7 +43,7 @@ namespace msm {
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [precompute_msm_bases](@ref precompute_msm_bases) function, `precompute_factor` passed
* [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
* there needs to be equal to the one used here. Larger values decrease the
* number of computations and the on-line memory footprint, but increase the
* static memory footprint. Default value: 1 (i.e. don't pre-compute). */
@@ -52,7 +52,7 @@ namespace msm {
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
* [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical.
* [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
@@ -127,6 +127,26 @@ namespace msm {
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
/**
* A function that precomputes MSM points by extending them with their shifted copies.
* e.g.:
* Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
* Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
* 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
* @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
* the size of 1 MSM.
* @param config [MSMConfig](@ref MSMConfig) used in this MSM.
* @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended
* points.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points);
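The extension scheme described in the docstring above can be sketched in a few lines of Python. This is an illustrative model only, not the library API: group elements are modeled as plain integers so that "scalar multiplication by 2^(kl)" is ordinary multiplication, and the function name and layout merely mirror the documented output order.

```python
# Toy model of MSM point precomputation: each "point" is an integer standing in
# for a group element, so multiplying by 2^(k*l) models repeated point doubling.
def precompute_points(points, precompute_factor, l):
    # Output layout follows the doc: all points, then 2^l * each point,
    # then 2^(2l) * each point, and so on.
    out = []
    for k in range(precompute_factor):
        shift = 1 << (k * l)
        out.extend(shift * p for p in points)
    return out

table = precompute_points([3, 5], precompute_factor=3, l=4)
# -> [3, 5, 48, 80, 768, 1280]
```

During the MSM itself, each scalar is split into `precompute_factor` chunks of `l` bits, and each chunk is paired with the corresponding shifted copy — trading static memory for fewer on-line doublings.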
/**
* A function that precomputes MSM bases by extending them with their shifted copies.
* e.g.:
@@ -148,7 +168,7 @@ namespace msm {
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_bases(
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,
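The window-size parameter \f$ c \f$ documented in `MSMConfig` above is the digit width of the bucket method: each scalar is split into \f$ \lceil \mathrm{bitsize}/c \rceil \f$ digits of \f$ c \f$ bits. A minimal Python sketch of that decomposition (illustrative only, not the library's kernel logic):

```python
# Split a scalar into ceil(bitsize / c) digits of c bits each, least
# significant first; k*P is then sum_j digit_j * 2^(j*c) * P.
def window_digits(scalar, c, bitsize):
    num_windows = (bitsize + c - 1) // c
    mask = (1 << c) - 1
    return [(scalar >> (j * c)) & mask for j in range(num_windows)]

digits = window_digits(0b110101, c=2, bitsize=6)
# -> [1, 1, 3]
assert sum(d << (2 * j) for j, d in enumerate(digits)) == 0b110101
```

Larger \f$ c \f$ means fewer windows but \f$ 2^c - 1 \f$ buckets per window, which is the memory/parallelism trade-off the docstring describes.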

@@ -32,6 +32,7 @@ namespace mxntt {
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
S* linear_twiddle, // twiddles organized as [1,w,w^2,...] for coset-eval in fast-tw mode
int ntt_size,
int max_logn,
int batch_size,
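The `linear_twiddle` parameter added in this hunk carries the powers `[1, w, w^2, ...]` of a root of unity, per its comment. A short Python sketch of generating such a table over a toy field (the field F_17 and root w = 4 are illustrative choices, not values the library uses):

```python
# Build the linear twiddle table [w^0, w^1, ..., w^(n-1)] mod p.
def linear_twiddles(w, n, p):
    tw = [1] * n
    for i in range(1, n):
        tw[i] = tw[i - 1] * w % p
    return tw

# In F_17, w = 4 is a primitive 4th root of unity (4^4 = 256 = 1 mod 17).
tw = linear_twiddles(4, 4, 17)
# -> [1, 4, 16, 13]
```

In fast-twiddle mode, coset evaluation multiplies coefficient i by the i-th entry of this table before the NTT, which is why the twiddles are laid out linearly.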

@@ -56,6 +56,7 @@ namespace polynomials {
// Evaluation methods
virtual void evaluate(PolyContext op, const D* domain_x, I* eval /*OUT*/) = 0;
virtual void evaluate_on_domain(PolyContext op, const D* domain, uint64_t size, I* evaluations /*OUT*/) = 0;
virtual void evaluate_on_rou_domain(PolyContext op, uint64_t domain_log_size, I* evals /*OUT*/) = 0;
// Methods to copy coefficients to host memory
virtual C get_coeff(PolyContext op, uint64_t coeff_idx) = 0;
@@ -64,8 +65,6 @@ namespace polynomials {
// Methods to get views of coefficients and evaluations, including device id
virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
get_coefficients_view(PolyContext p) = 0;
virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
};
} // namespace polynomials


@@ -71,10 +71,8 @@ namespace polynomials {
virtual std::pair<const C*, uint64_t> get_coefficients() = 0;
virtual std::pair<const I*, uint64_t> get_rou_evaluations() = 0;
// Methods to get views of coefficients and evaluations, including device id.
// Methods to get views of coefficients
virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view() = 0;
virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
// Method for printing the context state to an output stream.
virtual void print(std::ostream& os) = 0;


@@ -68,6 +68,7 @@ namespace polynomials {
Image operator()(const Domain& x) const;
void evaluate(const Domain* x, Image* eval /*OUT*/) const;
void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const; // caller allocate memory
// Method to obtain the degree of the polynomial
int64_t degree();
@@ -77,10 +78,8 @@ namespace polynomials {
// caller is allocating output memory. If coeff==nullptr, returning nof_coeff only
uint64_t copy_coeffs(Coeff* host_coeffs, uint64_t start_idx, uint64_t end_idx) const;
// Methods for obtaining a view of the coefficients or evaluations
// Methods for obtaining a view of the coefficients
std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
// Overload stream insertion operator for printing.
friend std::ostream& operator<<(std::ostream& os, Polynomial& poly)

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,705 @@
# Remark: This script contains functionality for GF(2^n), but currently works only over GF(p)! A few small adaptations are needed for GF(2^n).
from sage.rings.polynomial.polynomial_gf2x import GF2X_BuildIrred_list
from math import *
import itertools
CURVE_NAME = "bn254"
###########################################################################
# p = 18446744069414584321 # GoldiLocks
# p = 2013265921 # BabyBear
# p = 52435875175126190479447740508185965837690552500527637822603658699938581184513 # BLS12-381
p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
# p = 28948022309329048855892746252171976963363056481941560715954676764349967630337 # Pasta (Pallas)
# p = 28948022309329048855892746252171976963363056481941647379679742748393362948097 # Pasta (Vesta)
n = len(p.bits()) # bit
# t = 12 # GoldiLocks (t = 12 for sponge, t = 8 for compression)
# t = 16 # BabyBear (t = 24 for sponge, t = 16 for compression)
# t = 3 # BN254/BN256, BLS12-381, Pallas, Vesta (t = 3 for sponge, t = 2 for compression)
TS = [2, 3, 4, 8, 12, 16, 20, 24]
FIELD = 1
SBOX = 0
FIELD_SIZE = n
def get_alpha(p):
    for alpha in range(3, p):
        if gcd(alpha, p-1) == 1:
            break
    return alpha
alpha = get_alpha(p)
def get_sbox_cost(R_F, R_P, N, t):
    return int(t * R_F + R_P)
def get_size_cost(R_F, R_P, N, t):
    n = ceil(float(N) / t)
    return int((N * R_F) + (n * R_P))
def poseidon_calc_final_numbers_fixed(p, t, alpha, M, security_margin):
    # [Min. S-boxes] Find best possible for t and N
    n = ceil(log(p, 2))
    N = int(n * t)
    cost_function = get_sbox_cost
    ret_list = []
    (R_F, R_P) = find_FD_round_numbers(p, t, alpha, M, cost_function, security_margin)
    min_sbox_cost = cost_function(R_F, R_P, N, t)
    ret_list.append(R_F)
    ret_list.append(R_P)
    ret_list.append(min_sbox_cost)
    # [Min. Size] Find best possible for t and N
    # Minimum number of S-boxes for fixed n results in minimum size also (round numbers are the same)!
    min_size_cost = get_size_cost(R_F, R_P, N, t)
    ret_list.append(min_size_cost)
    return ret_list # [R_F, R_P, min_sbox_cost, min_size_cost]
def find_FD_round_numbers(p, t, alpha, M, cost_function, security_margin):
    n = ceil(log(p, 2))
    N = int(n * t)
    sat_inequiv = sat_inequiv_alpha
    R_P = 0
    R_F = 0
    min_cost = float("inf")
    max_cost_rf = 0
    # Brute-force approach
    for R_P_t in range(1, 500):
        for R_F_t in range(4, 100):
            if R_F_t % 2 == 0:
                if (sat_inequiv(p, t, R_F_t, R_P_t, alpha, M) == True):
                    if security_margin == True:
                        R_F_t += 2
                        R_P_t = int(ceil(float(R_P_t) * 1.075))
                    cost = cost_function(R_F_t, R_P_t, N, t)
                    if (cost < min_cost) or ((cost == min_cost) and (R_F_t < max_cost_rf)):
                        R_P = ceil(R_P_t)
                        R_F = ceil(R_F_t)
                        min_cost = cost
                        max_cost_rf = R_F
    return (int(R_F), int(R_P))
def sat_inequiv_alpha(p, t, R_F, R_P, alpha, M):
    N = int(FIELD_SIZE * NUM_CELLS)
    if alpha > 0:
        R_F_1 = 6 if M <= ((floor(log(p, 2) - ((alpha-1)/2.0))) * (t + 1)) else 10 # Statistical
        R_F_2 = 1 + ceil(log(2, alpha) * min(M, FIELD_SIZE)) + ceil(log(t, alpha)) - R_P # Interpolation
        R_F_3 = (log(2, alpha) * min(M, log(p, 2))) - R_P # Groebner 1
        R_F_4 = t - 1 + log(2, alpha) * min(M / float(t + 1), log(p, 2) / float(2)) - R_P # Groebner 2
        R_F_5 = (t - 2 + (M / float(2 * log(alpha, 2))) - R_P) / float(t - 1) # Groebner 3
        R_F_max = max(ceil(R_F_1), ceil(R_F_2), ceil(R_F_3), ceil(R_F_4), ceil(R_F_5))
        # Addition due to https://eprint.iacr.org/2023/537.pdf
        r_temp = floor(t / 3.0)
        over = (R_F - 1) * t + R_P + r_temp + r_temp * (R_F / 2.0) + R_P + alpha
        under = r_temp * (R_F / 2.0) + R_P + alpha
        binom_log = log(binomial(over, under), 2)
        if binom_log == inf:
            binom_log = M + 1
        cost_gb4 = ceil(2 * binom_log) # Paper uses 2.3727, we are more conservative here
        return ((R_F >= R_F_max) and (cost_gb4 >= M))
    else:
        print("Invalid value for alpha!")
        exit(1)
def grain_sr_generator():
    bit_sequence = INIT_SEQUENCE
    for _ in range(0, 160):
        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
        bit_sequence.pop(0)
        bit_sequence.append(new_bit)
    while True:
        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
        bit_sequence.pop(0)
        bit_sequence.append(new_bit)
        while new_bit == 0:
            new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
            bit_sequence.pop(0)
            bit_sequence.append(new_bit)
            new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
            bit_sequence.pop(0)
            bit_sequence.append(new_bit)
        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
        bit_sequence.pop(0)
        bit_sequence.append(new_bit)
        yield new_bit
def grain_random_bits(num_bits):
    random_bits = [next(grain_gen) for i in range(0, num_bits)]
    # random_bits.reverse() ## Remove comment to start from least significant bit
    random_int = int("".join(str(i) for i in random_bits), 2)
    return random_int
def init_generator(field, sbox, n, t, R_F, R_P):
    # Generate initial sequence based on parameters
    bit_list_field = [_ for _ in (bin(FIELD)[2:].zfill(2))]
    bit_list_sbox = [_ for _ in (bin(SBOX)[2:].zfill(4))]
    bit_list_n = [_ for _ in (bin(FIELD_SIZE)[2:].zfill(12))]
    bit_list_t = [_ for _ in (bin(NUM_CELLS)[2:].zfill(12))]
    bit_list_R_F = [_ for _ in (bin(R_F)[2:].zfill(10))]
    bit_list_R_P = [_ for _ in (bin(R_P)[2:].zfill(10))]
    bit_list_1 = [1] * 30
    global INIT_SEQUENCE
    INIT_SEQUENCE = bit_list_field + bit_list_sbox + bit_list_n + bit_list_t + bit_list_R_F + bit_list_R_P + bit_list_1
    INIT_SEQUENCE = [int(_) for _ in INIT_SEQUENCE]
def generate_constants(field, n, t, R_F, R_P, prime_number):
    round_constants = []
    # num_constants = (R_F + R_P) * t # Poseidon
    num_constants = (R_F * t) + R_P # Poseidon2
    if field == 0:
        for i in range(0, num_constants):
            random_int = grain_random_bits(n)
            round_constants.append(random_int)
    elif field == 1:
        for i in range(0, num_constants):
            random_int = grain_random_bits(n)
            while random_int >= prime_number:
                # print("[Info] Round constant is not in prime field! Taking next one.")
                random_int = grain_random_bits(n)
            round_constants.append(random_int)
            # Add (t-1) zeroes for Poseidon2 if partial round
            if i >= ((R_F/2) * t) and i < (((R_F/2) * t) + R_P):
                round_constants.extend([0] * (t-1))
    return round_constants
def print_round_constants(round_constants, n, field):
    print("Number of round constants:", len(round_constants))
    if field == 0:
        print("Round constants for GF(2^n):")
    elif field == 1:
        print("Round constants for GF(p):")
    hex_length = int(ceil(float(n) / 4)) + 2 # +2 for "0x"
    print(["{0:#0{1}x}".format(entry, hex_length) for entry in round_constants])
def create_mds_p(n, t):
    M = matrix(F, t, t)
    # Sample random distinct indices and assign to xs and ys
    while True:
        flag = True
        rand_list = [F(grain_random_bits(n)) for _ in range(0, 2*t)]
        while len(rand_list) != len(set(rand_list)): # Check for duplicates
            rand_list = [F(grain_random_bits(n)) for _ in range(0, 2*t)]
        xs = rand_list[:t]
        ys = rand_list[t:]
        # xs = [F(ele) for ele in range(0, t)]
        # ys = [F(ele) for ele in range(t, 2*t)]
        for i in range(0, t):
            for j in range(0, t):
                if (flag == False) or ((xs[i] + ys[j]) == 0):
                    flag = False
                else:
                    entry = (xs[i] + ys[j])^(-1)
                    M[i, j] = entry
        if flag == False:
            continue
        return M
def generate_vectorspace(round_num, M, M_round, NUM_CELLS):
    t = NUM_CELLS
    s = 1
    V = VectorSpace(F, t)
    if round_num == 0:
        return V
    elif round_num == 1:
        return V.subspace(V.basis()[s:])
    else:
        mat_temp = matrix(F)
        for i in range(0, round_num-1):
            add_rows = []
            for j in range(0, s):
                add_rows.append(M_round[i].rows()[j][s:])
            mat_temp = matrix(mat_temp.rows() + add_rows)
        r_k = mat_temp.right_kernel()
        extended_basis_vectors = []
        for vec in r_k.basis():
            extended_basis_vectors.append(vector([0]*s + list(vec)))
        S = V.subspace(extended_basis_vectors)
        return S
def subspace_times_matrix(subspace, M, NUM_CELLS):
    t = NUM_CELLS
    V = VectorSpace(F, t)
    subspace_basis = subspace.basis()
    new_basis = []
    for vec in subspace_basis:
        new_basis.append(M * vec)
    new_subspace = V.subspace(new_basis)
    return new_subspace
# Returns True if the matrix is considered secure, False otherwise
def algorithm_1(M, NUM_CELLS):
    t = NUM_CELLS
    s = 1
    r = floor((t - s) / float(s))
    # Generate round matrices
    M_round = []
    for j in range(0, t+1):
        M_round.append(M^(j+1))
    for i in range(1, r+1):
        mat_test = M^i
        entry = mat_test[0, 0]
        mat_target = matrix.circulant(vector([entry] + ([F(0)] * (t-1))))
        if (mat_test - mat_target) == matrix.circulant(vector([F(0)] * (t))):
            return [False, 1]
        S = generate_vectorspace(i, M, M_round, t)
        V = VectorSpace(F, t)
        basis_vectors = []
        for eigenspace in mat_test.eigenspaces_right(format='galois'):
            if (eigenspace[0] not in F):
                continue
            vector_subspace = eigenspace[1]
            intersection = S.intersection(vector_subspace)
            basis_vectors += intersection.basis()
        IS = V.subspace(basis_vectors)
        if IS.dimension() >= 1 and IS != V:
            return [False, 2]
        for j in range(1, i+1):
            S_mat_mul = subspace_times_matrix(S, M^j, t)
            if S == S_mat_mul:
                print("S.basis():\n", S.basis())
                return [False, 3]
    return [True, 0]
# Returns True if the matrix is considered secure, False otherwise
def algorithm_2(M, NUM_CELLS):
    t = NUM_CELLS
    s = 1
    V = VectorSpace(F, t)
    trail = [None, None]
    test_next = False
    I = range(0, s)
    I_powerset = list(sage.misc.misc.powerset(I))[1:]
    for I_s in I_powerset:
        test_next = False
        new_basis = []
        for l in I_s:
            new_basis.append(V.basis()[l])
        IS = V.subspace(new_basis)
        for i in range(s, t):
            new_basis.append(V.basis()[i])
        full_iota_space = V.subspace(new_basis)
        for l in I_s:
            v = V.basis()[l]
            while True:
                delta = IS.dimension()
                v = M * v
                IS = V.subspace(IS.basis() + [v])
                if IS.dimension() == t or IS.intersection(full_iota_space) != IS:
                    test_next = True
                    break
                if IS.dimension() <= delta:
                    break
            if test_next == True:
                break
        if test_next == True:
            continue
        return [False, [IS, I_s]]
    return [True, None]
# Returns True if the matrix is considered secure, False otherwise
def algorithm_3(M, NUM_CELLS):
    t = NUM_CELLS
    s = 1
    V = VectorSpace(F, t)
    l = 4*t
    for r in range(2, l+1):
        next_r = False
        res_alg_2 = algorithm_2(M^r, t)
        if res_alg_2[0] == False:
            return [False, None]
        # if res_alg_2[1] == None:
        #     continue
        # IS = res_alg_2[1][0]
        # I_s = res_alg_2[1][1]
        # for j in range(1, r):
        #     IS = subspace_times_matrix(IS, M, t)
        #     I_j = []
        #     for i in range(0, s):
        #         new_basis = []
        #         for k in range(0, t):
        #             if k != i:
        #                 new_basis.append(V.basis()[k])
        #         iota_space = V.subspace(new_basis)
        #         if IS.intersection(iota_space) != iota_space:
        #             single_iota_space = V.subspace([V.basis()[i]])
        #             if IS.intersection(single_iota_space) == single_iota_space:
        #                 I_j.append(i)
        #             else:
        #                 next_r = True
        #                 break
        #     if next_r == True:
        #         break
        # if next_r == True:
        #     continue
        # return [False, [IS, I_j, r]]
    return [True, None]
def check_minpoly_condition(M, NUM_CELLS):
    max_period = 2*NUM_CELLS
    all_fulfilled = True
    M_temp = M
    for i in range(1, max_period + 1):
        if not ((M_temp.minimal_polynomial().degree() == NUM_CELLS) and (M_temp.minimal_polynomial().is_irreducible() == True)):
            all_fulfilled = False
            break
        M_temp = M * M_temp
    return all_fulfilled
def generate_matrix(FIELD, FIELD_SIZE, NUM_CELLS):
    if FIELD == 0:
        print("Matrix generation not implemented for GF(2^n).")
        exit(1)
    elif FIELD == 1:
        mds_matrix = create_mds_p(FIELD_SIZE, NUM_CELLS)
        result_1 = algorithm_1(mds_matrix, NUM_CELLS)
        result_2 = algorithm_2(mds_matrix, NUM_CELLS)
        result_3 = algorithm_3(mds_matrix, NUM_CELLS)
        while result_1[0] == False or result_2[0] == False or result_3[0] == False:
            mds_matrix = create_mds_p(FIELD_SIZE, NUM_CELLS)
            result_1 = algorithm_1(mds_matrix, NUM_CELLS)
            result_2 = algorithm_2(mds_matrix, NUM_CELLS)
            result_3 = algorithm_3(mds_matrix, NUM_CELLS)
        return mds_matrix
def generate_matrix_full(NUM_CELLS):
    M = None
    if t == 2:
        M = matrix.circulant(vector([F(2), F(1)]))
    elif t == 3:
        M = matrix.circulant(vector([F(2), F(1), F(1)]))
    elif t == 4:
        M = matrix(F, [[F(5), F(7), F(1), F(3)], [F(4), F(6), F(1), F(1)], [F(1), F(3), F(5), F(7)], [F(1), F(1), F(4), F(6)]])
    elif (t % 4) == 0:
        M = matrix(F, t, t)
        # M_small = matrix.circulant(vector([F(3), F(2), F(1), F(1)]))
        M_small = matrix(F, [[F(5), F(7), F(1), F(3)], [F(4), F(6), F(1), F(1)], [F(1), F(3), F(5), F(7)], [F(1), F(1), F(4), F(6)]])
        small_num = t // 4
        for i in range(0, small_num):
            for j in range(0, small_num):
                if i == j:
                    M[i*4:(i+1)*4, j*4:(j+1)*4] = 2 * M_small
                else:
                    M[i*4:(i+1)*4, j*4:(j+1)*4] = M_small
    else:
        print("Error: No matrix for these parameters.")
        exit()
    return M
def generate_matrix_partial(FIELD, FIELD_SIZE, NUM_CELLS): ## TODO: Prioritize small entries
    entry_max_bit_size = FIELD_SIZE
    if FIELD == 0:
        print("Matrix generation not implemented for GF(2^n).")
        exit(1)
    elif FIELD == 1:
        M = None
        if t == 2:
            M = matrix(F, [[F(2), F(1)], [F(1), F(3)]])
        elif t == 3:
            M = matrix(F, [[F(2), F(1), F(1)], [F(1), F(2), F(1)], [F(1), F(1), F(3)]])
        else:
            M_circulant = matrix.circulant(vector([F(0)] + [F(1) for _ in range(0, NUM_CELLS - 1)]))
            M_diagonal = matrix.diagonal([F(grain_random_bits(entry_max_bit_size)) for _ in range(0, NUM_CELLS)])
            M = M_circulant + M_diagonal
            # while algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False:
            while check_minpoly_condition(M, NUM_CELLS) == False:
                M_diagonal = matrix.diagonal([F(grain_random_bits(entry_max_bit_size)) for _ in range(0, NUM_CELLS)])
                M = M_circulant + M_diagonal
        if (algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False):
            print("Error: Generated partial matrix is not secure w.r.t. subspace trails.")
            exit()
        return M
def generate_matrix_partial_small_entries(FIELD, FIELD_SIZE, NUM_CELLS):
    if FIELD == 0:
        print("Matrix generation not implemented for GF(2^n).")
        exit(1)
    elif FIELD == 1:
        M_circulant = matrix.circulant(vector([F(0)] + [F(1) for _ in range(0, NUM_CELLS - 1)]))
        combinations = list(itertools.product(range(2, 6), repeat=NUM_CELLS))
        for entry in combinations:
            M = M_circulant + matrix.diagonal(vector(F, list(entry)))
            print(M)
            # if M.is_invertible() == False or algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False:
            if M.is_invertible() == False or check_minpoly_condition(M, NUM_CELLS) == False:
                continue
            return M
def matrix_partial_m_1(matrix_partial, NUM_CELLS):
    M_circulant = matrix.identity(F, NUM_CELLS)
    return matrix_partial - M_circulant
def print_linear_layer(M, n, t):
    print("n:", n)
    print("t:", t)
    print("N:", (n * t))
    print("Result Algorithm 1:\n", algorithm_1(M, NUM_CELLS))
    print("Result Algorithm 2:\n", algorithm_2(M, NUM_CELLS))
    print("Result Algorithm 3:\n", algorithm_3(M, NUM_CELLS))
    hex_length = int(ceil(float(n) / 4)) + 2 # +2 for "0x"
    print("Prime number:", "0x" + hex(PRIME_NUMBER))
    matrix_string = "["
    for i in range(0, t):
        matrix_string += str(["{0:#0{1}x}".format(int(entry), hex_length) for entry in M[i]])
        if i < (t-1):
            matrix_string += ","
    matrix_string += "]"
    print("MDS matrix:\n", matrix_string)
def calc_equivalent_matrices(MDS_matrix_field):
    # Following idea: Split M into M' * M'', where M'' is "cheap" and M' can move before the partial nonlinear layer
    # The "previous" matrix layer is then M * M'. Due to the construction of M', the M[0,0] and v values will be the same for the new M' (and I also, obviously)
    # Thus: Compute the matrices, store the w_hat and v_hat values
    MDS_matrix_field_transpose = MDS_matrix_field.transpose()
    w_hat_collection = []
    v_collection = []
    v = MDS_matrix_field_transpose[[0], list(range(1,t))]
    M_mul = MDS_matrix_field_transpose
    M_i = matrix(F, t, t)
    for i in range(R_P_FIXED - 1, -1, -1):
        M_hat = M_mul[list(range(1,t)), list(range(1,t))]
        w = M_mul[list(range(1,t)), [0]]
        v = M_mul[[0], list(range(1,t))]
        v_collection.append(v.list())
        w_hat = M_hat.inverse() * w
        w_hat_collection.append(w_hat.list())
        # Generate new M_i, and multiplication M * M_i for "previous" round
        M_i = matrix.identity(t)
        M_i[list(range(1,t)), list(range(1,t))] = M_hat
        M_mul = MDS_matrix_field_transpose * M_i
    return M_i, v_collection, w_hat_collection, MDS_matrix_field_transpose[0, 0]
def calc_equivalent_constants(constants, MDS_matrix_field):
    constants_temp = [constants[index:index+t] for index in range(0, len(constants), t)]
    MDS_matrix_field_transpose = MDS_matrix_field.transpose()
    # Start moving round constants up
    # Calculate c_i' = M^(-1) * c_(i+1)
    # Split c_i': Add c_i'[0] AFTER the S-box, add the rest to c_i
    # I.e.: Store c_i'[0] for each of the partial rounds, and make c_i = c_i + c_i' (where now c_i'[0] = 0)
    num_rounds = R_F_FIXED + R_P_FIXED
    R_f = R_F_FIXED / 2
    for i in range(num_rounds - 2 - R_f, R_f - 1, -1):
        inv_cip1 = list(vector(constants_temp[i+1]) * MDS_matrix_field_transpose.inverse())
        constants_temp[i] = list(vector(constants_temp[i]) + vector([0] + inv_cip1[1:]))
        constants_temp[i+1] = [inv_cip1[0]] + [0] * (t-1)
    return constants_temp
def poseidon(input_words, matrix, round_constants):
    R_f = int(R_F_FIXED / 2)
    round_constants_counter = 0
    state_words = list(input_words)
    # First full rounds
    for r in range(0, R_f):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        for i in range(0, t):
            state_words[i] = (state_words[i])^alpha
        state_words = list(matrix * vector(state_words))
    # Middle partial rounds
    for r in range(0, R_P_FIXED):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        state_words[0] = (state_words[0])^alpha
        state_words = list(matrix * vector(state_words))
    # Last full rounds
    for r in range(0, R_f):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        for i in range(0, t):
            state_words[i] = (state_words[i])^alpha
        state_words = list(matrix * vector(state_words))
    return state_words
def poseidon2(input_words, matrix_full, matrix_partial, round_constants):
    R_f = int(R_F_FIXED / 2)
    round_constants_counter = 0
    state_words = list(input_words)
    # First matrix mul
    state_words = list(matrix_full * vector(state_words))
    # First full rounds
    for r in range(0, R_f):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        for i in range(0, t):
            state_words[i] = (state_words[i])^alpha
        state_words = list(matrix_full * vector(state_words))
    # Middle partial rounds
    for r in range(0, R_P_FIXED):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        state_words[0] = (state_words[0])^alpha
        state_words = list(matrix_partial * vector(state_words))
    # Last full rounds
    for r in range(0, R_f):
        # Round constants, nonlinear layer, matrix multiplication
        for i in range(0, t):
            state_words[i] = state_words[i] + round_constants[round_constants_counter]
            round_constants_counter += 1
        for i in range(0, t):
            state_words[i] = (state_words[i])^alpha
        state_words = list(matrix_full * vector(state_words))
    return state_words
def to_bytes(value, indent):
    l = len(hex(p - 1))
    if l % 2 == 1:
        l = l + 1
    value = hex(int(value))[2:]
    value = value.zfill(l - 2)
    value_bytes = reversed(["0x" + value[i:i + 2] for i in range(0, len(value), 2)])
    print(" " * indent + ", ".join(value_bytes) + ",")
print("#pragma once")
print(f"#ifndef {CURVE_NAME.upper()}_POSEIDON2_H")
print(f"#define {CURVE_NAME.upper()}_POSEIDON2_H")
print()
print(f"namespace poseidon2_constants_{CURVE_NAME} {{")
for t in TS:
    NUM_CELLS = t
    R_F_FIXED, R_P_FIXED, _, _ = poseidon_calc_final_numbers_fixed(p, t, alpha, 128, True)
    INIT_SEQUENCE = []
    PRIME_NUMBER = p
    F = GF(PRIME_NUMBER)
    grain_gen = grain_sr_generator()
    # Init
    init_generator(FIELD, SBOX, FIELD_SIZE, NUM_CELLS, R_F_FIXED, R_P_FIXED)
    # Round constants
    round_constants = generate_constants(FIELD, FIELD_SIZE, NUM_CELLS, R_F_FIXED, R_P_FIXED, PRIME_NUMBER)
    # print_round_constants(round_constants, FIELD_SIZE, FIELD)
    # Matrix
    # MDS = generate_matrix(FIELD, FIELD_SIZE, NUM_CELLS)
    MATRIX_FULL = generate_matrix_full(NUM_CELLS)
    MATRIX_PARTIAL = generate_matrix_partial(FIELD, FIELD_SIZE, NUM_CELLS)
    MATRIX_PARTIAL_DIAGONAL_M_1 = [matrix_partial_m_1(MATRIX_PARTIAL, NUM_CELLS)[i,i] for i in range(0, NUM_CELLS)]
    print()
    print(f" namespace t{t} {{")
    print(f" int internal_rounds = {R_P_FIXED};")
    print()
    print(f" int alpha = {alpha};")
    print()
    # # MDS
    # print("pub static ref MDS{}: Vec<Vec<Scalar>> = vec![".format(t))
    # for vec in MDS:
    #     print("vec![", end="")
    #     for val in vec:
    #         to_hex(val)
    #     print("],")
    # print("];")
    # print()
    # Efficient partial matrix (diagonal - 1)
    print(" unsigned char mat_diag_m_1[] = {")
    for val in MATRIX_PARTIAL_DIAGONAL_M_1:
        to_bytes(val, 6)
    print(" };")
    print()
    # # Efficient partial matrix (full)
    # print(" unsigned char mat_internal[] = {")
    # for vec in MATRIX_PARTIAL:
    #     for val in vec:
    #         to_bytes(val, 6)
    #     print()
    # print(" };")
    # print()
    # Round constants
    print(" unsigned char round_constants[] = {")
    for (i, val) in enumerate(round_constants):
        if (i % t == 0 or (i < (R_F_FIXED / 2) * t) or (i > (R_F_FIXED / 2 + R_P_FIXED) * t)):
            to_bytes(val, 6)
    print(" };")
    print(" }")
    print()
    # state_in = vector([F(i) for i in range(t)])
    # # state_out = poseidon(state_in, MDS, round_constants)
    # state_out = poseidon2(state_in, MATRIX_FULL, MATRIX_PARTIAL, round_constants)
    # for (i,val) in enumerate(state_in):
    #     if i % t == 0:
    #         print("vec![", end="")
    #     to_bytes(val)
    #     if i % t == t - 1:
    #         print("],")
    # print("];")
    # for (i,val) in enumerate(state_out):
    #     if i % t == 0:
    #         print("vec![", end="")
    #     to_bytes(val)
    #     if i % t == t - 1:
    #         print("],")
    # print("];")
print("}")
print("#endif")
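The round structure the script's `poseidon2` routine implements can be condensed into a minimal standalone sketch over a toy field. All parameters below (prime, width, round counts, constants) are illustrative stand-ins, not the generated production constants; the full and partial matrices mirror the script's `t = 3` choices, `circulant([2, 1, 1])` and `[[2, 1, 1], [1, 2, 1], [1, 1, 3]]`.

```python
# Minimal Poseidon2-style permutation over a toy prime field.
# Illustrative parameters only -- not the constants the script above generates.
P, T, ALPHA, R_F, R_P = 101, 3, 3, 4, 2

def mat_full(state):     # circulant([2, 1, 1]) external matrix for t = 3
    s = sum(state) % P
    return [(x + s) % P for x in state]

def mat_partial(state):  # internal matrix [[2,1,1],[1,2,1],[1,1,3]] for t = 3
    s = sum(state) % P
    return [(s + state[0]) % P, (s + state[1]) % P, (s + 2 * state[2]) % P]

def permute(state, rc):
    state = mat_full(state)            # initial linear layer
    it = iter(rc)
    for _ in range(R_F // 2):          # first full rounds: S-box on every cell
        state = [(x + next(it)) % P for x in state]
        state = mat_full([pow(x, ALPHA, P) for x in state])
    for _ in range(R_P):               # partial rounds: S-box on cell 0 only
        state[0] = pow((state[0] + next(it)) % P, ALPHA, P)
        state = mat_partial(state)
    for _ in range(R_F // 2):          # last full rounds
        state = [(x + next(it)) % P for x in state]
        state = mat_full([pow(x, ALPHA, P) for x in state])
    return state

rc = [(7 * i + 3) % P for i in range(R_F * T + R_P)]  # arbitrary demo constants
out = permute([1, 2, 3], rc)
assert out == permute([1, 2, 3], rc) and all(0 <= x < P for x in out)
```

Note the constant count `R_F * T + R_P` matching `generate_constants` above: partial rounds consume a single constant each because the other `t - 1` are zero.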


@@ -0,0 +1,131 @@
#pragma once
#ifndef POSEIDON2_H
#define POSEIDON2_H
#include <cstdint>
#include <stdexcept>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
/**
* @namespace poseidon2
* Implementation of the [Poseidon2 hash function](https://eprint.iacr.org/2019/458.pdf)
* Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
*/
namespace poseidon2 {
/**
* The default number of external (full) rounds; most Poseidon2 configurations use this value
*/
const int EXTERNAL_ROUNDS_DEFAULT = 8;
enum DiffusionStrategy {
DEFAULT_DIFFUSION,
MONTGOMERY,
};
enum MdsType { DEFAULT_MDS, PLONKY };
enum PoseidonMode {
COMPRESSION,
PERMUTATION,
};
/**
* @struct Poseidon2Constants
* These constants are enough to define a Poseidon2 instance
* @param round_constants A pointer to round constants allocated on the device
* @param internal_matrix_diag A pointer to the diagonal of the internal (partial-round) matrix allocated on the device
*/
template <typename S>
struct Poseidon2Constants {
int width;
int alpha;
int internal_rounds;
int external_rounds;
S* round_constants = nullptr;
S* internal_matrix_diag = nullptr;
MdsType mds_type;
DiffusionStrategy diffusion;
};
/**
* @struct Poseidon2Config
* Struct that encodes various Poseidon2 parameters.
*/
struct Poseidon2Config {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_states_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
PoseidonMode mode;
int output_index;
bool
is_async; /**< Whether to run the Poseidon2 asynchronously. If set to `true`, the poseidon_hash function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the poseidon_hash
* function will block the current CPU thread. */
};
static Poseidon2Config default_poseidon2_config(
int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
Poseidon2Config config = {
ctx, // ctx
false, // are_states_on_device
false, // are_outputs_on_device
PoseidonMode::COMPRESSION,
1, // output_index
false, // is_async
};
return config;
}
template <typename S>
cudaError_t create_poseidon2_constants(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const S* round_constants,
const S* internal_matrix_diag,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon_constants);
/**
* Loads pre-calculated optimized constants, moves them to the device
*/
template <typename S>
cudaError_t init_poseidon2_constants(
int width,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* constants);
template <typename S>
cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx);
/**
* Compute the Poseidon2 hash over a sequence of preimages.
* Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
* @param T size of the poseidon state, should be equal to {arity + 1}
* @param states a pointer to the input data. May be allocated on device or on host, regulated
* by the config. May point to a sequence of preimages or a sequence of states filled with preimages.
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be at least of size [number_of_states](@ref number_of_states)
* @param number_of_states number of input blocks of size T-1 (arity)
*/
template <typename S, int T>
cudaError_t poseidon2_hash(
const S* states,
S* output,
size_t number_of_states,
const Poseidon2Constants<S>& constants,
const Poseidon2Config& config);
} // namespace poseidon2
#endif
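The input/output shape documented for `poseidon2_hash` (number_of_states blocks of `T-1` elements in, number_of_states digests out) can be sketched as a host-side layout check. The helper below is hypothetical, not part of the API; the capacity element is shown as zero purely for illustration.

```python
# Hypothetical host-side sketch of the poseidon2_hash input layout:
# number_of_states blocks of T-1 preimage elements each, one digest per block.
def layout_states(flat_input, T):
    arity = T - 1
    assert len(flat_input) % arity == 0, "input must be whole blocks of T-1 elements"
    number_of_states = len(flat_input) // arity
    # Each block is padded with a capacity element (illustratively 0) to width T.
    states = [flat_input[i * arity:(i + 1) * arity] + [0]
              for i in range(number_of_states)]
    return states, number_of_states

states, n = layout_states(list(range(8)), T=3)
assert n == 4 and all(len(s) == 3 for s in states)
```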


@@ -113,6 +113,27 @@ namespace vec_ops {
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
struct BitReverseConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
bool is_input_on_device; /**< True if `input` is on device and false if it is not. Default value: false. */
bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */
bool is_async; /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
* function will block the current CPU thread. */
};
static BitReverseConfig
DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
BitReverseConfig config = {
ctx, // ctx
false, // is_input_on_device
false, // is_output_on_device
false, // is_async
};
return config;
}
} // namespace vec_ops
#endif
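`BitReverseConfig` parameterizes a bit-reversal permutation of a power-of-two-sized vector, the reordering commonly applied to NTT inputs or outputs. The operation itself can be sketched as:

```python
# Sketch of the bit-reversal permutation that BitReverseConfig parameterizes:
# element i of a 2^logn-sized vector swaps with the index whose bits are reversed.
def bit_reverse(vec):
    n = len(vec)
    logn = n.bit_length() - 1
    assert 1 << logn == n, "size must be a power of two"
    rev = lambda i: int(format(i, f"0{logn}b")[::-1], 2)
    return [vec[rev(i)] for i in range(n)]

assert bit_reverse([0, 1, 2, 3, 4, 5, 6, 7]) == [0, 4, 2, 6, 1, 5, 3, 7]
assert bit_reverse(bit_reverse(list(range(16)))) == list(range(16))  # involution
```

Since the permutation is an involution, the same call converts between natural and bit-reversed order in either direction.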


@@ -3,6 +3,7 @@ if (EXT_FIELD)
endif ()
SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252)
set(TARGET icicle_field)
@@ -21,17 +22,21 @@ set(POLYNOMIAL_SOURCE_FILES
${SRC}/polynomials/cuda_backend/polynomial_cuda_backend.cu
${SRC}/polynomials/polynomials_c_api.cu)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES})
# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
if (DEFINED CURVE)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
endif()
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_POSEIDON2)
list(APPEND FIELD_SOURCE ${SRC}/poseidon2/extern.cu)
endif()
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
endif()
add_library(${TARGET} STATIC ${FIELD_SOURCE})


@@ -224,7 +224,7 @@ namespace keccak {
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
@@ -245,7 +245,7 @@ namespace keccak {
CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_blocks * (D / 8), stream));
}
int number_of_threads = 1024;
int number_of_threads = 512;
int number_of_gpu_blocks = (number_of_blocks - 1) / number_of_threads + 1;
keccak_hash_blocks<C, D><<<number_of_gpu_blocks, number_of_threads, 0, stream>>>(
input_device, input_block_size, number_of_blocks, output_device);
@@ -262,13 +262,13 @@ namespace keccak {
 }
 extern "C" cudaError_t
-keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
+keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
 {
 return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
 }
 extern "C" cudaError_t
-keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
+keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
 {
 return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
 }

@@ -1,4 +1,4 @@
#include "utils/device_context.cuh"
#include "gpu-utils/device_context.cuh"
#include "keccak.cu"
// #define DEBUG
@@ -51,7 +51,7 @@ int main(int argc, char* argv[])
 START_TIMER(keccak_timer);
 KeccakConfig config = default_keccak_config();
-keccak256(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
+keccak256_cuda(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
 END_TIMER(keccak_timer, "Keccak")
 for (int i = 0; i < number_of_blocks; i++) {

@@ -1,4 +1,8 @@
+build_msm:
+	mkdir -p work
+	nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
 test_msm:
 	mkdir -p work
-	nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu
-	work/test_msm
+	nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
+	work/test_msm

@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"
namespace msm {
/**
* Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):
* - `A` is the [affine representation](@ref affine_t) of curve points;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)(
affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points)
{
return precompute_msm_points<affine_t, projective_t>(points, msm_size, config, output_points);
}
/**
* Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
* template parameters (where the curve is given by `-DCURVE` env variable during build):

Some files were not shown because too many files have changed in this diff