Compare commits

..

61 Commits

Author SHA1 Message Date
release-bot
aacec3f72f Bump rust crates' version
icicle-babybear@2.8.0
icicle-bls12-377@2.8.0
icicle-bls12-381@2.8.0
icicle-bn254@2.8.0
icicle-bw6-761@2.8.0
icicle-core@2.8.0
icicle-cuda-runtime@2.8.0
icicle-grumpkin@2.8.0
icicle-hash@2.8.0
icicle-m31@2.8.0
icicle-stark252@2.8.0

Generated by cargo-workspaces
2024-07-16 13:57:56 +00:00
ChickenLover
a8fa05d0e3 Feat/roman/hash docs (#556)
## Describe the changes

This PR...

## Linked Issues

Resolves #

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-16 16:39:35 +03:00
ChickenLover
ea71faf1fa add keccak tree builder (#555) 2024-07-15 15:31:12 +07:00
ChickenLover
7fd9ed1b49 Feat/roman/tree builder (#525)
# Updates:

## Hashing

 - Added SpongeHasher class
   - Can be used to accept any hash function as an argument
 - Absorb and squeeze are now separated (see the sketch below)
 - Memory management is now mostly done by the SpongeHasher class; each hash function only describes permutation kernels
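
For illustration only: a minimal Go sketch of the absorb/squeeze split described above. The types and names are hypothetical and not ICICLE's actual API; the hash function only supplies a permutation, while state handling lives in the sponge itself.

```go
// Hypothetical sponge sketch: a hash function only provides a permutation,
// while absorb and squeeze are handled generically. Not ICICLE's actual API.
package main

import "fmt"

type permutation func(state []byte)

type spongeHasher struct {
	state   []byte
	rate    int
	permute permutation
}

// Absorb XORs the input into the rate portion of the state, one block at a
// time, running the permutation after each block.
func (s *spongeHasher) Absorb(input []byte) {
	for len(input) > 0 {
		n := len(input)
		if n > s.rate {
			n = s.rate
		}
		for i := 0; i < n; i++ {
			s.state[i] ^= input[i]
		}
		s.permute(s.state)
		input = input[n:]
	}
}

// Squeeze reads output from the rate portion, permuting between blocks.
func (s *spongeHasher) Squeeze(n int) []byte {
	out := make([]byte, 0, n)
	for len(out) < n {
		take := s.rate
		if n-len(out) < take {
			take = n - len(out)
		}
		out = append(out, s.state[:take]...)
		s.permute(s.state)
	}
	return out
}

func main() {
	toyPermute := func(state []byte) { // toy byte rotation, illustration only
		for i := range state {
			state[i] = state[i]<<1 | state[i]>>7
		}
	}
	h := &spongeHasher{state: make([]byte, 200), rate: 136, permute: toyPermute}
	h.Absorb([]byte("separate absorb"))
	fmt.Printf("%x\n", h.Squeeze(32))
}
```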

## Tree builder

 - Tree builder is now hash-agnostic
 - Tree builder now supports 2D input (matrices)
 - Tree builder can now use two different hash functions for layer 0 and compression layers (see the sketch below)
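
A minimal Go sketch of a hash-agnostic binary Merkle builder where the layer-0 hash and the compression hash are independent parameters. Names and structure are hypothetical, not ICICLE's tree builder API.

```go
// Hash-agnostic Merkle builder sketch: one hash for layer 0 (leaves), another
// for the compression layers. Hypothetical; not ICICLE's actual API.
package main

import (
	"crypto/sha256"
	"fmt"
)

type hashFn func(data []byte) []byte

// buildTree hashes each leaf with leafHash, then repeatedly compresses pairs
// of digests with compressHash until a single root remains.
// Assumes a power-of-two number of leaves, for brevity.
func buildTree(leaves [][]byte, leafHash, compressHash hashFn) []byte {
	layer := make([][]byte, len(leaves))
	for i, leaf := range leaves {
		layer[i] = leafHash(leaf)
	}
	for len(layer) > 1 {
		next := make([][]byte, 0, len(layer)/2)
		for i := 0; i < len(layer); i += 2 {
			pair := append(append([]byte{}, layer[i]...), layer[i+1]...)
			next = append(next, compressHash(pair))
		}
		layer = next
	}
	return layer[0]
}

func main() {
	sha := func(b []byte) []byte { d := sha256.Sum256(b); return d[:] }
	leaves := [][]byte{[]byte("a"), []byte("b"), []byte("c"), []byte("d")}
	// The same hash is used for both roles here; the point is that the two
	// roles are independent parameters and could differ.
	fmt.Printf("root: %x\n", buildTree(leaves, sha, sha))
}
```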

## Poseidon1

 - Interface changed to classes
 - Now allows for any alpha
 - Now allows passing constants not in a single vector
 - Now allows for any domain tag
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon struct
 
 ## Poseidon2
 
 - Interface changed to classes
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon2 struct
 
## Keccak

 - Added Keccak class which inherits SpongeHasher
 - No longer uses GPU registers for storing states
 
 To do:
- [x] Update poseidon1 golang bindings
- [x] Update poseidon1 examples
- [x] Fix poseidon2 cuda test
- [x] Fix poseidon2 merkle tree builder test
- [x] Update keccak class with new design
- [x] Update keccak test
- [x] Check keccak correctness
- [x] Update tree builder rust wrappers
- [x] Leave doc comments

Future work:  
- [ ] Add keccak merkle tree builder externs
- [ ] Add keccak rust tree builder wrappers
- [ ] Write docs
- [ ] Add example
- [ ] Fix device output for tree builder

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com>
2024-07-11 13:46:25 +07:00
DmytroTym
2d4059c61f Field creation automated through macros (#551)
Params files for fields now only require the modulus specified by the user
(plus a twiddle generator and/or non-residue in case either or both are
needed). Everything else gets generated by a macro.
2024-07-08 10:39:50 +03:00
release-bot
73cd4c0a99 Bump rust crates' version
icicle-babybear@2.7.1
icicle-bls12-377@2.7.1
icicle-bls12-381@2.7.1
icicle-bn254@2.7.1
icicle-bw6-761@2.7.1
icicle-core@2.7.1
icicle-cuda-runtime@2.7.1
icicle-grumpkin@2.7.1
icicle-hash@2.7.1
icicle-m31@2.7.1
icicle-stark252@2.7.1

Generated by cargo-workspaces
2024-07-04 12:34:26 +00:00
yshekel
5516320ad7 fix large (>512 elements) ecntt issue (#553)
This PR solves an issue for large ECNTT where CUDA blocks are too large
and cannot be assigned to SMs. The fix is to reduce the thread count per
block and increase the block count in that case.
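
For context, a tiny Go sketch of the launch-geometry arithmetic the fix describes: cap the threads per block and grow the block count so total coverage stays the same. Illustrative only; not the actual kernel launch code.

```go
package main

import "fmt"

// launchDims caps threads per block and computes the block count needed to
// still cover totalThreads work items (ceil division).
func launchDims(totalThreads, maxThreadsPerBlock int) (blocks, threadsPerBlock int) {
	threadsPerBlock = maxThreadsPerBlock
	if totalThreads < threadsPerBlock {
		threadsPerBlock = totalThreads
	}
	blocks = (totalThreads + threadsPerBlock - 1) / threadsPerBlock
	return blocks, threadsPerBlock
}

func main() {
	// For a large ECNTT, use fewer threads per block and more blocks.
	fmt.Println(launchDims(1<<20, 256)) // 4096 256
}
```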
2024-07-04 15:33:49 +03:00
Vlad
a4b1eb3de9 Fix affine to projective zero point bug (#552)
## Describe the changes

This PR fixes the affine-to-projective conversion functions in the bindings by adding a
condition: if the point in affine form is zero, return the projective zero.
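
A minimal Go sketch of the added condition, using hypothetical point types (field elements reduced to ints for brevity): a zero affine point maps to the projective zero instead of (x, y, 1).

```go
package main

import "fmt"

// Hypothetical point types for illustration only.
type affine struct{ x, y int }
type projective struct{ x, y, z int }

// toProjective returns the projective zero when the affine input is the zero
// point, and (x, y, 1) otherwise - the condition this fix adds.
func toProjective(p affine) projective {
	if p.x == 0 && p.y == 0 {
		return projective{x: 0, y: 1, z: 0} // point at infinity, one common convention
	}
	return projective{x: p.x, y: p.y, z: 1}
}

func main() {
	fmt.Println(toProjective(affine{}))           // zero point in, projective zero out
	fmt.Println(toProjective(affine{x: 3, y: 5})) // regular point
}
```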

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-04 09:31:59 +03:00
release-bot
31083463be Bump rust crates' version
icicle-babybear@2.7.0
icicle-bls12-377@2.7.0
icicle-bls12-381@2.7.0
icicle-bn254@2.7.0
icicle-bw6-761@2.7.0
icicle-core@2.7.0
icicle-cuda-runtime@2.7.0
icicle-grumpkin@2.7.0
icicle-hash@2.7.0
icicle-m31@2.7.0
icicle-stark252@2.7.0

Generated by cargo-workspaces
2024-07-03 19:06:35 +00:00
nonam3e
b908053c0c Feat/m31 (#547)
This PR adds support for the m31 field

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-07-03 20:48:28 +07:00
Stas
29da36d7be RISC0 example using Polynomial API (#548)
## New Example

This new C++ example shows the basics of the RISC0 protocol using our
Polynomial API
2024-07-02 08:00:03 -06:00
HadarIngonyama
4fef542346 MSM - fixed bug in reduction phase (#549)
This PR fixes a bug in the iterative reduction algorithm.
Unsynchronized threads were reading and writing to the same
addresses, which caused MSM to fail a small percentage of the time - this is now fixed.
2024-06-30 12:05:55 +03:00
release-bot
f812f071fa Bump rust crates' version
icicle-babybear@2.6.0
icicle-bls12-377@2.6.0
icicle-bls12-381@2.6.0
icicle-bn254@2.6.0
icicle-bw6-761@2.6.0
icicle-core@2.6.0
icicle-cuda-runtime@2.6.0
icicle-grumpkin@2.6.0
icicle-hash@2.6.0
icicle-stark252@2.6.0

Generated by cargo-workspaces
2024-06-24 11:56:28 +00:00
Jeremy Felder
2b07513310 [FEAT]: Golang Bindings for pinned host memory (#519)
## Describe the changes

This PR adds the capability to pin host memory in the golang bindings,
making data transfers quicker. Memory can be pinned once for
multiple devices by passing the flag
`cuda_runtime.CudaHostRegisterPortable` (when registering existing host memory) or
`cuda_runtime.CudaHostAllocPortable` (when allocating new pinned memory).
2024-06-24 14:03:44 +03:00
HadarIngonyama
7831f7bd0f Msm/update docs (#545)
Updates MSM documentation

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-06-19 11:38:24 +03:00
Otsar
de25b6e203 Added v2 paper (#544) 2024-06-18 15:19:49 +03:00
Otsar
69383e6c73 Update docusaurus.config.js
bold, added emoji
2024-06-18 15:04:26 +03:00
Otsar
c305aade5d Update overview.md 2024-06-18 15:00:24 +03:00
Otsar
87bdf04a19 Update docusaurus.config.js 2024-06-18 13:05:14 +03:00
Otsar
e152977843 Update overview.md
Added v2 paper
2024-06-18 12:23:03 +03:00
release-bot
3d01c09c82 Bump rust crates' version
icicle-babybear@2.5.0
icicle-bls12-377@2.5.0
icicle-bls12-381@2.5.0
icicle-bn254@2.5.0
icicle-bw6-761@2.5.0
icicle-core@2.5.0
icicle-cuda-runtime@2.5.0
icicle-grumpkin@2.5.0
icicle-hash@2.5.0
icicle-stark252@2.5.0

Generated by cargo-workspaces
2024-06-17 13:17:24 +00:00
HadarIngonyama
8936d9c800 MSM - supporting all window sizes (#534)
This PR enables using MSM with any value of c.

Note: the default c isn't necessarily optimal; the user is expected to
choose the c and precomputation factor that give the best results for
the relevant case.
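
The MSM documentation updated in this changeset gives a rule of thumb for picking `c`: roughly log2(msmSize) - 4 when precomputation cannot be used, and roughly log2(msmSize) - 1 with maximum precomputation. A small illustrative Go helper for that heuristic (not part of the library):

```go
package main

import (
	"fmt"
	"math/bits"
)

// suggestedC applies the rule of thumb from the MSM docs; actual tuning
// should still be done per scenario.
func suggestedC(msmSize int, usesPrecomputation bool) int {
	log2 := bits.Len(uint(msmSize)) - 1 // floor(log2(msmSize))
	if usesPrecomputation {
		return log2 - 1 // points known in advance, maximum precomputation
	}
	return log2 - 4 // points not known in advance
}

func main() {
	fmt.Println(suggestedC(1<<20, false)) // 16
	fmt.Println(suggestedC(1<<20, true))  // 19
}
```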

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-06-17 15:57:24 +03:00
Jeremy Felder
af9ec76506 Fix link and correct path for running test deploy workflow (#542)
## Describe the changes

Fixes a link issue in docs preventing deployment
2024-06-17 15:44:15 +03:00
Otsar
cdd99d2a46 recreated images for poseidon.md (#541)
Fixed 3 images shown in low quality - I have recreated the 3 images -
please review to check that I have not made a mistake
2024-06-17 12:16:26 +03:00
Jeremy Felder
3e551762c0 Updated alt text for images and fixed broken link 2024-06-16 18:35:42 +03:00
Otsar
37c22e81e7 Update poseidon.md
fixed - added arrows
2024-06-16 15:01:12 +03:00
Otsar
69e73ffa3e Update poseidon.md
Fixed image quality
2024-06-16 11:42:46 +03:00
cangqiaoyuzhuo
512e1ca372 chore: remove repeat word (#540)
## Describe the changes

remove repeat word

## Linked Issues

Resolves #

Signed-off-by: cangqiaoyuzhuo <850072022@qq.com>
2024-06-13 11:53:22 +03:00
VitaliiH
e19a869691 accumulate stwo (#535)
Adds in-place vector addition and exposes it in the API as `accumulate`
2024-06-10 12:24:58 +02:00
yshekel
9c55d888ae workflow curve fix (#536) 2024-06-09 11:18:23 +03:00
release-bot
18f51de56c Bump rust crates' version
icicle-babybear@2.4.0
icicle-bls12-377@2.4.0
icicle-bls12-381@2.4.0
icicle-bn254@2.4.0
icicle-bw6-761@2.4.0
icicle-core@2.4.0
icicle-cuda-runtime@2.4.0
icicle-grumpkin@2.4.0
icicle-hash@2.4.0
icicle-stark252@2.4.0

Generated by cargo-workspaces
2024-06-06 14:42:36 +00:00
yshekel
33b1f3c794 perf: projective scalar multiplication use dbl() rather than + (#530) 2024-06-05 20:35:21 +03:00
Karthik Inbasekar
3a276ef23c added example cpp: example_commit_with_device_memory_view() (#532)
## Describe the changes

This PR adds an example of a simple commit that makes use of polynomial views.
Output attached:

```
Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1
Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1
Setup: Generating mock SRS
Setup: SRS of length 1025 generated and loaded to device. Took: 19557 milliseconds
Setup: Generating polys (on device) f1,f2 of log degree 10
Setup: Gen poly done. Took: 7 milliseconds
Computing constraints..start 
Computing constraints..done. Took: 0 milliseconds
Computing Commitments with poly view
Commitments done. Took: 29 milliseconds
commitment [(f1+f2)^2 + (f1-f2)^2]_1:
[x: 0x1e35d81da10e5026dacdd907d6ed0dde673de449ff8c0137ec6acbfd6b1dfe1b, y: 0x21fc051415af35a781f84ebcf999313d489ae38ebefa561c9de2fb0b11091502]
commitment [[2 (f_1^2+f_2^2]_1:
[x: 0x1e35d81da10e5026dacdd907d6ed0dde673de449ff8c0137ec6acbfd6b1dfe1b, y: 0x21fc051415af35a781f84ebcf999313d489ae38ebefa561c9de2fb0b11091502]
commitment [(f1+f2)^2 - (f1-f2)^2]_1:
[x: 0x21e9dc012aef8d95107fbfe63f455d4345b9b21e37bcb0a49043b1066e211ffa, y: 0x2d6a3b2f1be1042a17c58ff595134b9cceb71d1af4f1c67a5696859cd4bafae3]
commitment [4 f_1*f_2]_1:
[x: 0x21e9dc012aef8d95107fbfe63f455d4345b9b21e37bcb0a49043b1066e211ffa, y: 0x2d6a3b2f1be1042a17c58ff595134b9cceb71d1af4f1c67a5696859cd4bafae3]
```

## Linked Issues

Resolves #
2024-06-05 18:25:12 +03:00
nonam3e
8e62bde16d bit reverse (#528)
This PR adds bit reverse operation support to icicle
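
For context, a bit-reverse permutation reorders an array of 2^k elements by reversing the k-bit binary representation of each index (commonly used to reorder NTT inputs or outputs). A small self-contained Go sketch of the operation, not the ICICLE implementation:

```go
package main

import (
	"fmt"
	"math/bits"
)

// bitReverse returns a copy of data (length must be a power of two) with each
// element moved to the index obtained by reversing its k-bit representation.
func bitReverse(data []int) []int {
	n := len(data)
	k := bits.Len(uint(n)) - 1 // log2(n)
	out := make([]int, n)
	for i, v := range data {
		rev := bits.Reverse(uint(i)) >> (bits.UintSize - k)
		out[rev] = v
	}
	return out
}

func main() {
	fmt.Println(bitReverse([]int{0, 1, 2, 3, 4, 5, 6, 7})) // [0 4 2 6 1 5 3 7]
}
```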
2024-06-02 16:37:58 +07:00
Jeremy Felder
417ca77f61 precompute bug fix (#529)
This PR fixes 2 things:

1. Removes the assertion regarding the precompute factor needing to be a
power of 2. There is no such requirement and it works just fine for
other values too.
2. Fixes the average bucket size for the large buckets threshold - it
depends on the precompute factor.
2024-05-29 13:59:48 +03:00
hadaringonyama
8911a32135 precompute bug fix 2024-05-28 12:48:48 +03:00
release-bot
c6f6e61d60 Bump rust crates' version
icicle-babybear@2.3.1
icicle-bls12-377@2.3.1
icicle-bls12-381@2.3.1
icicle-bn254@2.3.1
icicle-bw6-761@2.3.1
icicle-core@2.3.1
icicle-cuda-runtime@2.3.1
icicle-grumpkin@2.3.1
icicle-hash@2.3.1
icicle-stark252@2.3.1

Generated by cargo-workspaces
2024-05-20 13:43:32 +00:00
yshekel
4e3aa63d2f fix: ntt mixed-radix bug for large ntts (>4G elements) (#523)
In some cases 32-bit values would wrap around and cause invalid accesses to
wrong elements and memory addresses.
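
The wraparound described here is ordinary unsigned 32-bit overflow; a tiny Go illustration (not the actual kernel code) of how a 32-bit index wraps past 2^32 elements while a 64-bit index does not:

```go
package main

import "fmt"

func main() {
	var block, size uint32 = 1 << 20, 1 << 13 // block*size is 2^33 elements
	idx32 := block * size                     // wraps around to 0 in 32 bits
	idx64 := uint64(block) * uint64(size)     // stays correct in 64 bits
	fmt.Println(idx32, idx64)                 // prints: 0 8589934592
}
```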
2024-05-20 16:42:44 +03:00
Leon Hibnik
db298aefc1 [HOTFIX] rust msm benchmarks (#521)
## Describe the changes

Removes an unused host-to-device copy and adds a minimum size limit for running MSM
benchmarks.
2024-05-20 13:51:53 +03:00
yshekel
19a9b76d64 fix: cmake set_gpu_env() and windows build (#520) 2024-05-20 13:05:45 +03:00
Jeremy Felder
1e343f17a3 Allow overriding compiler's chosen GPU arch via cmake (#518)
## Describe the changes

This PR modifies icicle/cmake/Common.cmake to set
CMAKE_CUDA_ARCHITECTURES to ${CUDA_ARCH} if the user defines the arch,
and otherwise to set CMAKE_CUDA_ARCHITECTURES to native if the cmake version is
greater than or equal to 3.24.0. This change has been successfully
tested with cmake 3.22.0 and 3.25.2.

## Linked Issues

Resolves #167.
2024-05-19 16:03:15 +03:00
liuhao230
cfea6ebb3b Merge branch 'ingonyama-zk:main' into main 2024-05-17 14:24:02 +08:00
release-bot
76a82bf88e Bump rust crates' version
icicle-babybear@2.3.0
icicle-bls12-377@2.3.0
icicle-bls12-381@2.3.0
icicle-bn254@2.3.0
icicle-bw6-761@2.3.0
icicle-core@2.3.0
icicle-cuda-runtime@2.3.0
icicle-grumpkin@2.3.0
icicle-hash@2.3.0
icicle-stark252@2.3.0

Generated by cargo-workspaces
2024-05-17 04:42:17 +00:00
Vlad
b8310d577e Feat/vlad/poseidon go binding (#513) 2024-05-17 07:20:15 +03:00
liu
49c7fa4b28 fix: add the PARENT_SCOPE
Signed-off-by: liu <liuhao2206@buaa.edu.cn>
2024-05-17 10:45:09 +08:00
Stas
02059fcfaa Stas/best-practice-ntt (#517)
## Describe the changes

Icicle examples:  Concurrent Data Transfer and NTT Computation

This PR introduces a Best Practice series of examples in C++.
Specifically, the example shows how to concurrently transfer data
to/from the device and execute NTT.

## Linked Issues

Resolves #
2024-05-16 23:51:49 +03:00
nonam3e
4496520a10 golang examples init (#516)
## Describe the changes

This PR adds golang examples

---------

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-05-16 19:40:13 +03:00
liu
88a6966a4b Allow overriding compiler's chosen GPU arch via cmake 2024-05-15 22:40:51 +08:00
yshekel
9c1afe8a44 Polynomial API views replaced by evaluation on rou domain (#514)
- Removed the poly API for accessing a view of evaluations. This was a problematic API since it cannot handle small domains, and for large domains it requires the polynomial to use more memory than it needs to.
- Added an evaluate_on_rou_domain() API instead that supports any domain size (power-of-two sizes).
- The new API can compute to HOST or DEVICE memory.
- Rust wrapper for evaluate_on_rou_domain().
- Updated documentation: overview and Rust wrappers.
- Faster division by a vanishing poly for the common case where the numerator is of degree 2N and the vanishing poly is of degree N.
- Allow division a/b where deg(a)<deg(b) instead of throwing an error.
2024-05-15 14:06:23 +03:00
Jeremy Felder
972b924bc0 Update CI to run on some non-code changes (#515)
## Describe the changes

This PR:
- Updates the CI to run on CI workflow file changes
- Updates examples CI to run on examples file changes
2024-05-15 13:17:13 +03:00
sukrucildirr
230a1da512 Fix broken link (#512)
## Describe the changes

There was a broken link attached to the word ZKContainer.

## Linked Issues

Resolves #
2024-05-14 08:36:39 +07:00
release-bot
940b283c47 Bump rust crates' version
icicle-babybear@2.2.0
icicle-bls12-377@2.2.0
icicle-bls12-381@2.2.0
icicle-bn254@2.2.0
icicle-bw6-761@2.2.0
icicle-core@2.2.0
icicle-cuda-runtime@2.2.0
icicle-grumpkin@2.2.0
icicle-hash@2.2.0
icicle-stark252@2.2.0

Generated by cargo-workspaces
2024-05-09 12:27:17 +00:00
Leon Hibnik
e0412183fd syntax highlight (#511)
## Describe the changes
adds syntax highlighting to `rust` and `go`
2024-05-09 15:23:20 +03:00
ChickenLover
9da52bc09f Feat/roman/poseidon2 (#510)
# This PR

1. Adds C++ API
2. Renames a lot of API functions
3. Adds inplace poseidon2
4. Makes the input const in all poseidon functions
5. Adds benchmark for poseidon2
2024-05-09 19:19:55 +07:00
VitaliiH
49079d0d2a rust ecntt hotfix (#509)
## Describe the changes

This PR fixes Rust ECNTT benches and tests


---------

Co-authored-by: VitaliiH <Vitaliy@ingo>
2024-05-09 11:21:21 +03:00
ChickenLover
094683d291 Feat/roman/poseidon2 (#507)
This PR adds support for poseidon2 permutation function as described in
https://eprint.iacr.org/2023/323.pdf

Reference implementations used (and compared against):
https://github.com/HorizenLabs/poseidon2/tree/main
https://github.com/Plonky3/Plonky3/tree/main

Tasks:

- [x] Remove commented code and prints
- [ ] Add doc-comments to functions and structs
- [x] Fix possible issue with Plonky3 imports
- [x] Update NTT/Plonky3 test
- [x] Add Plonky3-bn254 test (impossible)
2024-05-09 15:13:43 +07:00
nonam3e
c30e333819 keccak docs (#508)
This PR adds keccak docs

---------

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-05-08 23:18:59 +03:00
yshekel
2905d2a469 fix: bug regarding polynomial evaluations view in CUDA backend (#506)
Fixes:
(1) not building polynomials and tests for the grumpkin curve (no NTT)
(2) polynomial API C++ example compilation and (once compilation is
fixed) memory corruption
(3) a bug in the poly CUDA backend regarding transformation to evaluations
in some cases
2024-05-08 21:02:18 +03:00
Jeremy Felder
732ee51552 [CI]: Update Cpp CI to include build args (#503)
## Describe the changes

This PR adds build args to the Cpp CI and adds grumpkin curve and
stark252 field
2024-05-08 14:35:02 +03:00
Jeremy Felder
14997566ff [FIX]: Fix releasing device set on host thread during multigpu call (#501)
## Describe the changes

This PR fixes an issue when `RunOnDevice` is called for multi-gpu while
other goroutines calling device operations are run outside of
`RunOnDevice`. The issue comes from setting a device other than the
default device (device 0) on a host thread within `RunOnDevice` and not
unsetting that host thread's device when `RunOnDevice` finishes.

When `RunOnDevice` locks a host thread to ensure that all other calls in
the go routine are on the same device, it never unsets that thread’s
device. Once the thread is unlocked, other go routines can get scheduled
to it but it still has the device set to whatever it was before while it
was locked, so it's possible that the following sequence happens:

1. NTT domain is initialized on thread 2 via a goroutine on device 0
2. MSM multiGPU test runs and is locked on thread 3 setting its device
to 1
3. Other tests run concurrently on threads other than 3 (since it is
locked)
4. MSM multiGPU test finishes and releases thread 3 back to the pool but
its device is still 1
5. NTT test runs and is assigned to thread 3 --> this will fail because
the thread’s device wasn’t released back

We really only want to set a thread's device while the thread is locked.
But once we unlock a thread, its device should return to whatever it
was set at originally. In theory, it should always be 0 if `SetDevice`
is never used outside of `RunOnDevice` - which it shouldn’t be in most 
situations
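
A simplified Go sketch of the pattern the fix implies: set the device only while the goroutine is locked to its OS thread, and restore the default device before the thread is unlocked and returned to the scheduler pool. The setDevice stub is a placeholder, not the actual CUDA runtime binding.

```go
package main

import (
	"fmt"
	"runtime"
)

func setDevice(id int) { fmt.Println("device set to", id) } // placeholder stub

// runOnDevice pins the work to one OS thread, sets the device for the
// duration of the locked section, and restores device 0 before unlocking.
func runOnDevice(deviceID int, fn func()) {
	go func() {
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		setDevice(deviceID) // only valid while this thread stays locked
		defer setDevice(0)  // restore the default device before unlocking

		fn()
	}()
}

func main() {
	done := make(chan struct{})
	runOnDevice(1, func() { fmt.Println("work on device 1"); close(done) })
	<-done
}
```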
2024-05-08 14:07:29 +03:00
Otsar
a56435d2e8 Updated hall of fame (#505)
## Describe the changes

Adds Patrick to Hall of fame
2024-05-07 14:41:38 +03:00
404 changed files with 20510 additions and 7729 deletions

View File

@@ -3,8 +3,11 @@ golang:
- wrappers/golang/**/*.h
- wrappers/golang/**/*.tmpl
- go.mod
- .github/workflows/golang.yml
rust:
- wrappers/rust/**/*
- '!wrappers/rust/README.md'
- .github/workflows/rust.yml
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh
@@ -12,4 +15,11 @@ cpp:
- icicle/**/*.hpp
- icicle/**/*.c
- icicle/**/*.h
- icicle/CMakeLists.txt
- icicle/CMakeLists.txt
- .github/workflows/cpp_cuda.yml
- icicle/cmake/Common.cmake
- icicle/cmake/CurvesCommon.cmake
- icicle/cmake/FieldsCommon.cmake
examples:
- examples/**/*
- .github/workflows/examples.yml

View File

@@ -12,6 +12,9 @@ on:
cpp_cuda:
description: "Flag for if C++/CUDA files changed"
value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
examples:
description: "Flag for if example files changed"
value: ${{ jobs.check-changed-files.outputs.examples }}
jobs:
check-changed-files:
@@ -21,6 +24,7 @@ jobs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
examples: ${{ steps.changed_files.outputs.examples }}
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -37,3 +41,4 @@ jobs:
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
echo "examples=${{ steps.changed-files-yaml.outputs.examples_any_modified }}" >> "$GITHUB_OUTPUT"

View File

@@ -35,7 +35,18 @@ jobs:
needs: [check-changed-files, check-format]
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
curve:
- name: bn254
build_args: -DG2=ON -DECNTT=ON
- name: bls12_381
build_args: -DG2=ON -DECNTT=ON
- name: bls12_377
build_args: -DG2=ON -DECNTT=ON
- name: bw6_761
build_args: -DG2=ON -DECNTT=ON
- name: grumpkin
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -44,7 +55,7 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -S . -B build
cmake --build build -j
- name: Run C++ curve Tests
working-directory: ./icicle/build/tests
@@ -57,7 +68,13 @@ jobs:
needs: [check-changed-files, check-format]
strategy:
matrix:
field: [babybear]
field:
- name: babybear
build_args: -DEXT_FIELD=ON
- name: stark252
build_args: -DEXT_FIELD=OFF
- name: m31
build_args: -DEXT_FIELD=ON
steps:
- name: Checkout Repo
uses: actions/checkout@v4
@@ -66,7 +83,7 @@ jobs:
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field.name }} ${{ matrix.field.build_args }} -S . -B build
cmake --build build -j
- name: Run C++ field Tests
working-directory: ./icicle/build/tests

View File

@@ -33,7 +33,7 @@ jobs:
uses: actions/checkout@v4
- name: c++ examples
working-directory: ./examples/c++
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
if: needs.check-changed-files.outputs.cpp_cuda == 'true' || needs.check-changed-files.outputs.examples == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +47,7 @@ jobs:
done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true'
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.examples == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do

View File

@@ -34,7 +34,7 @@ jobs:
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
build-curves-linux:
name: Build curves on Linux
name: Build and test curves on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -60,19 +60,18 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single curve with the curve's specified build args
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }}
- name: Test
working-directory: ./wrappers/golang/curves
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
retention-days: 1
run: |
CURVE=$(echo ${{ matrix.curve.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$CURVE/tests -count=1 -failfast -p 2 -timeout 60m -v
build-fields-linux:
name: Build fields on Linux
name: Build and test fields on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -90,18 +89,18 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single field with the fields specified build args
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }}
- name: Test
working-directory: ./wrappers/golang/fields
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
retention-days: 1
run: |
FIELD=$(echo ${{ matrix.field.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$FIELD/tests -count=1 -failfast -p 2 -timeout 60m -v
build-hashes-linux:
name: Build hashes on Linux
name: Build and test hashes on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -119,41 +118,15 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }} # builds a single hash algorithm
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single hash algorithm with the hash's specified build args
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }}
- name: Test
working-directory: ./wrappers/golang/hash
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.hash.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_hash.a
retention-days: 1
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-curves-linux, build-fields-linux, build-hashes-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Download ICICLE lib artifacts
uses: actions/download-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
path: ./icicle/build/lib
merge-multiple: true
- name: Run Tests
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# -count ensures the test results are not cached
# -p controls the number of programs that can be run in parallel
run: |
HASH=$(echo ${{ matrix.hash.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./... -count=1 -failfast -p 2 -timeout 60m
go test ./$HASH/tests -count=1 -failfast -p 2 -timeout 60m -v
# TODO: bw6 on windows requires more memory than the standard runner has
# Add a large runner and then enable this job

View File

@@ -62,8 +62,8 @@ jobs:
# We need to limit the number of threads to avoid running out of memory on weaker machines
# ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
run: |
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --exclude icicle-m31 --release --verbose --features=g2 -- --test-threads=2
- name: Run baby bear tests
working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
@@ -79,26 +79,34 @@ jobs:
cargo test --release --verbose -- --ignored
cargo test --release --verbose
build-windows:
name: Build on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Download and Install Cuda
- name: Run m31 tests
working-directory: ./wrappers/rust/icicle-fields/icicle-m31
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
run: |
cargo test --release --verbose -- --ignored
cargo test --release --verbose
# build-windows:
# name: Build on Windows
# runs-on: windows-2022
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v4
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# id: cuda-toolkit
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
# - name: Build targets
# working-directory: ./wrappers/rust
# if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# env:
# CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
# # Building from the root workspace will build all members of the workspace by default
# run: cargo build --release --verbose

View File

@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- 'docs/*'
- 'docs/**'
jobs:
test-deploy:

3
.gitignore vendored
View File

@@ -8,6 +8,7 @@
*.so
*.nsys-rep
*.ncu-rep
*.sage.py
**/target
**/.vscode
**/.*lock*csv#
@@ -17,5 +18,3 @@
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build*
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt

View File

@@ -0,0 +1,94 @@
# Keccak
## Keccak Example
```go
package main
import (
"encoding/hex"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/hash/keccak"
)
func createHostSliceFromHexString(hexString string) core.HostSlice[uint8] {
byteArray, err := hex.DecodeString(hexString)
if err != nil {
panic("Not a hex string")
}
return core.HostSliceFromElements([]uint8(byteArray))
}
func main() {
input := createHostSliceFromHexString("1725b6")
outHost256 := make(core.HostSlice[uint8], 32)
cfg := keccak.GetDefaultHashConfig()
e := keccak.Keccak256(input, int32(input.Len()), 1, outHost256, &cfg)
if e.CudaErrorCode != cr.CudaSuccess {
panic("Keccak256 hashing failed")
}
outHost512 := make(core.HostSlice[uint8], 64)
e = keccak.Keccak512(input, int32(input.Len()), 1, outHost512, &cfg)
if e.CudaErrorCode != cr.CudaSuccess {
panic("Keccak512 hashing failed")
}
numberOfBlocks := 3
outHostBatch256 := make(core.HostSlice[uint8], 32*numberOfBlocks)
e = keccak.Keccak256(input, int32(input.Len()/numberOfBlocks), int32(numberOfBlocks), outHostBatch256, &cfg)
if e.CudaErrorCode != cr.CudaSuccess {
panic("Keccak256 batch hashing failed")
}
}
```
## Keccak Methods
```go
func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
```
### Parameters
- **`input`**: A slice containing the input data for the Keccak256 hash function. It can reside in either host memory or device memory.
- **`inputBlockSize`**: An integer specifying the size of the input data for a single hash.
- **`numberOfBlocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.
### Return Value
- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## HashConfig
The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```go
type HashConfig struct {
Ctx cr.DeviceContext
areInputsOnDevice bool
areOutputsOnDevice bool
IsAsync bool
}
```
### Fields
- **`Ctx`**: Device context containing details like device id and stream.
- **`areInputsOnDevice`**: Indicates if input data is located on the device.
- **`areOutputsOnDevice`**: Indicates if output hash is stored on the device.
- **`IsAsync`**: If true, runs the Keccak256/Keccak512 operation asynchronously.
### Default Configuration
Use `GetDefaultHashConfig` to obtain a default configuration, which can then be customized as needed.
```go
func GetDefaultHashConfig() HashConfig
```

View File

@@ -4,9 +4,9 @@ To understand the theory behind MSM pre computation technique refer to Niall Emm
## Core package
### MSM PrecomputeBases
### MSM PrecomputePoints
`PrecomputeBases` and `G2PrecomputeBases` exists for all supported curves.
`PrecomputePoints` and `G2PrecomputePoints` exists for all supported curves.
#### Description
@@ -14,21 +14,20 @@ This function extends each provided base point $(P)$ with its multiples $(2^lP,
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
#### `PrecomputeBases`
#### `PrecomputePoints`
Precomputes bases for MSM by extending each base point with its multiples.
Precomputes points for MSM by extending each base point with its multiples.
```go
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of the original affine points to be extended with their multiples.
- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
- **`c`**: Currently unused; reserved for future compatibility.
- **`ctx`**: CUDA device context specifying the execution environment.
- **`outputBases`**: The device slice allocated for storing the extended bases.
- **`msmSize`**: The size of a single MSM, used to determine optimal parameters.
- **`cfg`**: The MSM configuration parameters.
- **`outputBases`**: The device slice allocated for storing the extended points.
##### Example
@@ -50,28 +49,27 @@ func main() {
var precomputeOut core.DeviceSlice
precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := bn254.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
err := bn254.PrecomputePoints(points, 1024, &cfg, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
```
#### `G2PrecomputeBases`
#### `G2PrecomputePoints`
This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
This method is the same as `PrecomputePoints` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
```go
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of G2 curve points to be extended.
- **`precomputeFactor`**: The total number of points to precompute for each base.
- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
- **`ctx`**: Specifies the CUDA device context for execution.
- **`outputBases`**: Allocated device slice for the extended bases.
- **`points`**: A slice of the original affine points to be extended with their multiples.
- **`msmSize`**: The size of a single MSM, used to determine optimal parameters.
- **`cfg`**: The MSM configuration parameters.
- **`outputBases`**: The device slice allocated for storing the extended points.
##### Example
@@ -93,20 +91,9 @@ func main() {
var precomputeOut core.DeviceSlice
precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
err := g2.G2PrecomputePoints(points, 1024, &cfg, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
```
### Benchmarks
Benchmarks were performed on an Nvidia RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -6,52 +6,53 @@
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)
func main() {
// Obtain the default MSM configuration.
cfg := bn254.GetDefaultMSMConfig()
// Obtain the default MSM configuration.
cfg := core.GetDefaultMSMConfig()
// Define the size of the problem, here 2^18.
size := 1 << 18
// Define the size of the problem, here 2^18.
size := 1 << 18
// Generate scalars and points for the MSM operation.
scalars := bn254.GenerateScalars(size)
points := bn254.GenerateAffinePoints(size)
// Generate scalars and points for the MSM operation.
scalars := bn254.GenerateScalars(size)
points := bn254.GenerateAffinePoints(size)
// Create a CUDA stream for asynchronous operations.
stream, _ := cr.CreateStream()
var p bn254.Projective
// Create a CUDA stream for asynchronous operations.
stream, _ := cr.CreateStream()
var p bn254.Projective
// Allocate memory on the device for the result of the MSM operation.
var out core.DeviceSlice
_, e := out.MallocAsync(p.Size(), p.Size(), stream)
// Allocate memory on the device for the result of the MSM operation.
var out core.DeviceSlice
_, e := out.MallocAsync(p.Size(), p.Size(), stream)
if e != cr.CudaSuccess {
panic(e)
}
if e != cr.CudaSuccess {
panic(e)
}
// Set the CUDA stream in the MSM configuration.
cfg.Ctx.Stream = &stream
cfg.IsAsync = true
// Set the CUDA stream in the MSM configuration.
cfg.Ctx.Stream = &stream
cfg.IsAsync = true
// Perform the MSM operation.
e = bn254.Msm(scalars, points, &cfg, out)
// Perform the MSM operation.
e = bn254_msm.Msm(scalars, points, &cfg, out)
if e != cr.CudaSuccess {
panic(e)
}
if e != cr.CudaSuccess {
panic(e)
}
// Allocate host memory for the results and copy the results from the device.
outHost := make(core.HostSlice[bn254.Projective], 1)
cr.SynchronizeStream(&stream)
outHost.CopyFromDevice(&out)
// Allocate host memory for the results and copy the results from the device.
outHost := make(core.HostSlice[bn254.Projective], 1)
cr.SynchronizeStream(&stream)
outHost.CopyFromDevice(&out)
// Free the device memory allocated for the results.
out.Free()
// Free the device memory allocated for the results.
out.Free()
}
```
@@ -121,7 +122,7 @@ func GetDefaultMSMConfig() MSMConfig
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.Ctx.IsBigTriangle = false` will activate Bucket accumulation.
When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle reduction and `cfg.Ctx.IsBigTriangle = false` will activate iterative reduction.
```go
...
@@ -151,6 +152,10 @@ out.Malloc(batchSize*p.Size(), p.Size())
...
```
## Parameters for optimal performance
Please refer to the [primitive description](../primitives/msm#choosing-optimal-parameters)
## Support for G2 group
To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled as described in the [Golang building instructions](../golang-bindings.md#using-icicle-golang-bindings-in-your-project).
@@ -169,23 +174,23 @@ This package include `G2Projective` and `G2Affine` points as well as a `G2Msm` m
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)
func main() {
cfg := bn254.GetDefaultMSMConfig()
size := 1 << 12
batchSize := 3
totalSize := size * batchSize
scalars := bn254.GenerateScalars(totalSize)
points := g2.G2GenerateAffinePoints(totalSize)
cfg := core.GetDefaultMSMConfig()
size := 1 << 12
batchSize := 3
totalSize := size * batchSize
scalars := bn254.GenerateScalars(totalSize)
points := g2.G2GenerateAffinePoints(totalSize)
var p g2.G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
g2.G2Msm(scalars, points, &cfg, out)
var p g2.G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
g2.G2Msm(scalars, points, &cfg, out)
}
```

View File

@@ -1,5 +1,9 @@
# Polynomial API Overview
:::note
Read our paper on the Polynomials API in ICICLE v2 by clicking [here](https://eprint.iacr.org/2024/973).
:::
## Introduction
The Polynomial API offers a robust framework for polynomial operations within a computational environment. It's designed for flexibility and efficiency, supporting a broad range of operations like arithmetic, evaluation, and manipulation, all while abstracting from the computation and storage specifics. This enables adaptability to various backend technologies, employing modern C++ practices.
@@ -128,12 +132,13 @@ auto H = (A*B-C).divide_by_vanishing_polynomial(N);
### Evaluation
Evaluate polynomials at arbitrary domain points or across a domain.
Evaluate polynomials at arbitrary domain points, across a domain or on a roots-of-unity domain.
```cpp
Image operator()(const Domain& x) const; // evaluate f(x)
void evaluate(const Domain* x, Image* evals /*OUT*/) const;
void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const; // caller allocates memory
```
Example:
@@ -147,18 +152,13 @@ uint64_t domain_size = ...;
auto domain = /*build domain*/; // host or device memory
auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
f.evaluate_on_domain(domain, domain_size, evaluations);
// evaluate f(x) on roots of unity domain
uint64_t domain_log_size = ...;
auto evaluations_rou_domain = std::make_unique<scalar_t[]>(1 << domain_log_size); // can be device memory too
f.evaluate_on_rou_domain(domain_log_size, evaluations_rou_domain);
```
:::note
For special domains such as roots of unity, this method is not the most efficient for two reasons:
- Need to build the domain of size N.
- The implementation is not trying to identify this special domain.
Therefore the computation is typically $O(n^2)$ rather than $O(nlogn)$.
See the 'device views' section for more details.
:::
### Manipulations
Beyond arithmetic, the API supports efficient polynomial manipulations:
@@ -255,7 +255,7 @@ auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);
#### Views
The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state, such as coefficients or evaluations without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
##### What is a Memory View?
@@ -265,7 +265,7 @@ A memory view is essentially a pointer to data stored in device memory. By provi
Memory views are extremely versatile and can be employed in various computational contexts such as:
- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM), or for constructing Merkle trees without duplicating the underlying data.
- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM).
- **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.
##### Obtaining and Using Views
@@ -275,9 +275,6 @@ To create and use views within the Polynomial API, functions are provided to obt
```cpp
// Obtain a view of the polynomial's coefficients
std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
```
Example usage:
@@ -328,22 +325,7 @@ if (coeff_view.isValid()) {
}
```
#### Evaluations View: Accessing Polynomial Evaluations Efficiently
The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
:::warning
Invalid request: requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
:::
```cpp
// Assume a polynomial `p` of degree N
auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
// Use the evaluations view to perform further computations or visualizations
process_polynomial_evaluations(evals_view.get(), size, device_id);
```
## Multi-GPU Support with CUDA Backend

View File

@@ -0,0 +1,75 @@
# Keccak
[Keccak](https://keccak.team/files/Keccak-implementation-3.2.pdf) is a cryptographic hash function designed by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van Assche. It was selected as the winner of the NIST hash function competition, becoming the basis for the [SHA-3 standard](https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf).
Keccak operates on a message input of any length and produces a fixed-size hash output. The hash function is built upon the sponge construction, which involves absorbing the input data followed by squeezing out the hash value.
At its core, Keccak consists of a permutation function operating on a state array. The permutation function employs a round function that operates iteratively on the state array. Each round consists of five main steps:
- **Theta:** This step introduces diffusion by performing a bitwise XOR operation between the state and a linear combination of its neighboring columns.
- **Rho:** This step performs bit rotation operations on each lane of the state array.
- **Pi:** This step rearranges the positions of the lanes in the state array.
- **Chi:** This step applies a nonlinear mixing operation to each lane of the state array.
- **Iota:** This step introduces a round constant to the state array.
## Using Keccak
ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree or running multiple hashes in parallel.
### Supported Bindings
- [Golang](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/hash/keccak)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
### Example usage
This is an example of running 1024 Keccak-256 hashes in parallel, where input strings are of size 136 bytes:
```rust
use icicle_core::hash::HashConfig;
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::keccak256;
let config = HashConfig::default();
let input_block_len = 136;
let number_of_hashes = 1024;
let preimages = vec![1u8; number_of_hashes * input_block_len];
let mut digests = vec![0u8; number_of_hashes * 64];
let preimages_slice = HostSlice::from_slice(&preimages);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
keccak256(
preimages_slice,
input_block_len as u32,
number_of_hashes as u32,
digests_slice,
&config,
)
.unwrap();
```
### Merkle Tree
You can build a keccak merkle tree using the corresponding functions:
```rust
use icicle_core::tree::{merkle_tree_digests_len, TreeBuilderConfig};
use icicle_cuda_runtime::memory::HostSlice;
use icicle_hash::keccak::build_keccak256_merkle_tree;
let mut config = TreeBuilderConfig::default();
config.arity = 2;
let height = 22;
let input_block_len = 136;
let leaves = vec![1u8; (1 << height) * input_block_len];
let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
build_keccak256_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
```
In the example above, a binary tree of height 22 is being built. Each leaf is considered to be a 136 byte long array. The leaves and digests are aligned in a flat array. You can also use keccak512 in `build_keccak512_merkle_tree` function.

View File

@@ -54,36 +54,142 @@ You can learn more about how MSMs work from this [video](https://www.youtube.com
- [Golang](../golang-bindings/msm.md)
- [Rust](../rust-bindings//msm.md)
## Supported algorithms
## Algorithm description
Our MSM implementation supports two algorithms `Bucket accumulation` and `Large triangle accumulation`.
We follow the bucket method algorithm. The GPU implementation consists of four phases:
### Bucket accumulation
1. Preparation phase - The scalars are split into smaller scalars of `c` bits each. These are the bucket indices (see the small windowing sketch after this list). The points are grouped according to their corresponding bucket index and the buckets are sorted by size.
2. Accumulation phase - Each bucket accumulates all of its points using a single thread. More than one thread is assigned to large buckets, in proportion to their size. A bucket is considered large if its size is above the large bucket threshold that is determined by the `large_bucket_factor` parameter. The large bucket threshold is the expected average bucket size times the `large_bucket_factor` parameter.
3. Buckets Reduction phase - bucket results are multiplied by their corresponding bucket number and each bucket module is reduced to a small number of final results. By default, this is done by an iterative algorithm which is highly parallel. Setting `is_big_triangle` to `true` will switch this phase to the running sum algorithm described in the above YouTube talk which is much less parallel.
4. Final accumulation phase - The final results from the last phase are accumulated using the double-and-add algorithm.
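
A small Go sketch of the windowing in the preparation phase: splitting a scalar into `c`-bit chunks whose values serve as bucket indices. Illustrative only, not the CUDA implementation.

```go
package main

import "fmt"

// bucketIndices splits a scalar into c-bit windows, least-significant first;
// each window value is the bucket index for that window.
func bucketIndices(scalar uint64, c, bitsize int) []uint64 {
	mask := uint64(1)<<c - 1
	var idx []uint64
	for shift := 0; shift < bitsize; shift += c {
		idx = append(idx, (scalar>>shift)&mask)
	}
	return idx
}

func main() {
	fmt.Println(bucketIndices(0b1011_0110_1101, 4, 12)) // [13 6 11]
}
```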
The Bucket Accumulation algorithm is a method of dividing the overall MSM task into smaller, more manageable sub-tasks. It involves partitioning scalars and their corresponding points into different "buckets" based on the scalar values.
## Batched MSM
Bucket Accumulation can be more parallel-friendly because it involves dividing the computation into smaller, independent tasks, distributing scalar-point pairs into buckets and summing points within each bucket. This division makes it well suited for parallel processing on GPUs.
The MSM supports batch mode - running multiple MSMs in parallel. It's always better to use the batch mode instead of running single msms in serial as long as there is enough memory available. We support running a batch of MSMs that share the same points as well as a batch of MSMs that use different points.
#### When should I use Bucket accumulation?
## MSM configuration
In scenarios involving large MSM computations with many scalar-point pairs, the ability to parallelize operations makes Bucket Accumulation more efficient. The larger the MSM task, the more significant the potential gains from parallelization.
```cpp
/**
* @struct MSMConfig
* Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
* is to create it using [default_msm_config](@ref default_msm_config) function and then you'll hopefully only need to
* change a small number of default values for each of your MSMs.
*/
struct MSMConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be
* a number of different points. So, if each MSM re-uses the same set of points, this
* variable is set equal to the MSM size. And if every MSM uses a distinct set of
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
* there needs to be equal to the one used here. Larger values decrease the
* number of computations to make, on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
* method" that we use to solve the MSM problem. As a rule of thumb, larger value
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
* [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
* variable. Default value: 0 (set to the bitsize of scalar field). */
int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur
* very frequently. Useful for efficient treatment of non-uniform distributions of
* scalars and "top windows" with few bits. Can be set to 0 to disable separate
* treatment of large buckets altogether. Default value: 10. */
int batch_size; /**< The number of MSMs to compute. Default value: 1. */
bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value:
* false. */
bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
* true. */
bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
* Default value: true. */
bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
* to false, `is_async` won't take effect because a synchronization is needed to
* transfer results to the host. Default value: false. */
bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
* but also greatly decreases parallelism, so only suitable for large batches of MSMs.
* Default value: false. */
bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
* function will block the current CPU thread. */
};
```
### Large triangle accumulation
Large Triangle Accumulation is a method for optimizing MSM that focuses on reducing the number of point doublings in the computation. It is based on the observation that the number of point doublings can be minimized by structuring the computation in a specific manner.
#### When should I use Large triangle accumulation?
The Large Triangle Accumulation algorithm is more sequential in nature, as each step builds on the previous one (accumulating sums and then performing doublings). This structure makes it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.
## MSM Modes
ICICLE MSM also supports two different modes: `Batch MSM` and `Single MSM`.
Batch MSM allows you to run many MSMs with a single API call, while Single MSM launches a single MSM computation. We support running a batch of MSMs that share the same points as well as a batch of MSMs that use different points.
### Which mode should I use?
This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs simultaneously, leveraging the parallel processing capabilities of GPUs, and as long as there is enough memory available it is better than running single MSMs serially.
Single MSM mode should be used when batching isn't possible or when you only need to run a single MSM.
## Choosing optimal parameters
`is_big_triangle` should be `false` in almost all cases. It might provide better results only for very small MSMs (smaller than $2^8$) with a large batch (larger than 100), but this should be tested per scenario.
The two most important parameters for performance are `c` and `precompute_factor`. They affect the number of EC additions as well as the memory size. When the points are not known in advance, precomputation cannot be used; in this case the best `c` value is usually around $log_2(msmSize) - 4$. However, in most protocols the points are known in advance and precomputation can be used unless limited by memory. It is usually best to use maximum precomputation (such that we end up with only a single bucket module) combined with a `c` value of around $log_2(msmSize) - 1$.
A `large_bucket_factor` of 10 yields good results in most cases, but it's best to fine-tune this parameter per `c` and per scalar distribution. Large buckets exist in two cases:
1. When the scalar distribution isn't uniform.
2. When `c` does not divide the scalar bit-size.
## Memory usage estimation
The main memory requirements of the MSM are the following:
- Scalars - `sizeof(scalar_t) * msm_size * batch_size`
- Scalar indices - `~6 * sizeof(unsigned) * nof_bucket_modules * msm_size * batch_size`
- Points - `sizeof(affine_t) * msm_size * precomp_factor * batch_size`
- Buckets - `sizeof(projective_t) * nof_bucket_modules * 2^c * batch_size`
where `nof_bucket_modules = ceil(ceil(bitsize / c) / precompute_factor)`
During the MSM computation, the memory for scalars and scalar indices is allocated first; the indices are then freed, and the points and buckets are allocated. A good estimate of the required memory is therefore:
$max(scalars + scalarIndices, scalars + points + buckets)$
This gives a good approximation, within 10% of the actual required memory, for most cases.
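To make the formula concrete, here is a stand-alone sketch (not part of the ICICLE API) that computes the estimate above; the element sizes are passed in explicitly since they depend on the curve, and the values in `main` reproduce the 0.94 GB estimate of the `2^10` MSM, batch 1000, `c = 7` row in the table below.
```rust
/// Stand-alone sketch of the memory estimate above (not part of the ICICLE API).
/// Element sizes depend on the curve, so they are passed in explicitly.
fn estimate_msm_memory_bytes(
    msm_size: usize,
    batch_size: usize,
    bitsize: usize,
    c: usize,
    precompute_factor: usize,
    scalar_bytes: usize,     // sizeof(scalar_t)
    affine_bytes: usize,     // sizeof(affine_t)
    projective_bytes: usize, // sizeof(projective_t)
) -> usize {
    let div_ceil = |a: usize, b: usize| (a + b - 1) / b;
    let nof_bucket_modules = div_ceil(div_ceil(bitsize, c), precompute_factor);

    let scalars = scalar_bytes * msm_size * batch_size;
    let scalar_indices = 6 * 4 * nof_bucket_modules * msm_size * batch_size; // ~6 * sizeof(unsigned)
    let points = affine_bytes * msm_size * precompute_factor * batch_size;
    let buckets = projective_bytes * nof_bucket_modules * (1usize << c) * batch_size;

    // Scalars and indices are allocated first; the indices are freed before
    // points and buckets are allocated, hence the max over the two phases.
    std::cmp::max(scalars + scalar_indices, scalars + points + buckets)
}

fn main() {
    // BLS12-377: ~253-bit scalars (32 bytes), 96-byte affine points, 144-byte projective points.
    // MSM size 2^10, batch 1000, no precomputation, c = 7.
    let bytes = estimate_msm_memory_bytes(1 << 10, 1000, 253, 7, 1, 32, 96, 144);
    println!("estimated MSM memory: {:.2} GB", bytes as f64 / 1e9);
}
```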
## Example parameters
Here is a useful table showing optimal parameters for different MSMs. They are optimal for the BLS12-377 curve when running on an NVIDIA GeForce RTX 3090 Ti. This is the configuration used:
```cpp
msm::MSMConfig config = {
ctx, // DeviceContext
N, // points_size
precomp_factor, // precompute_factor
user_c, // c
0, // bitsize
10, // large_bucket_factor
batch_size, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
true, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true // is_async
};
```
Here are the parameters and the results for the different cases:
| MSM size (log₂) | Batch size | Precompute factor | c | Memory estimation (GB) | Actual memory (GB) | Single MSM time (ms) |
| --- | --- | --- | --- | --- | --- | --- |
| 10 | 1 | 1 | 9 | 0.00227 | 0.00277 | 9.2 |
| 10 | 1 | 23 | 11 | 0.00259 | 0.00272 | 1.76 |
| 10 | 1000 | 1 | 7 | 0.94 | 1.09 | 0.051 |
| 10 | 1000 | 23 | 11 | 2.59 | 2.74 | 0.025 |
| 15 | 1 | 1 | 11 | 0.011 | 0.019 | 9.9 |
| 15 | 1 | 16 | 16 | 0.061 | 0.065 | 2.4 |
| 15 | 100 | 1 | 11 | 1.91 | 1.92 | 0.84 |
| 15 | 100 | 19 | 14 | 6.32 | 6.61 | 0.56 |
| 18 | 1 | 1 | 14 | 0.128 | 0.128 | 14.4 |
| 18 | 1 | 15 | 17 | 0.40 | 0.42 | 5.9 |
| 22 | 1 | 1 | 17 | 1.64 | 1.65 | 68 |
| 22 | 1 | 13 | 21 | 5.67 | 5.94 | 54 |
| 24 | 1 | 1 | 18 | 6.58 | 6.61 | 232 |
| 24 | 1 | 7 | 21 | 12.4 | 13.4 | 199 |
The optimal values can vary per GPU and per curve. It is best to try a few combinations until you get the best results for your specific case.

View File

@@ -8,4 +8,5 @@ This section of the documentation is dedicated to the ICICLE primitives, we will
- [MSM](./msm.md)
- [NTT](./ntt.md)
- [Keccak Hash](./keccak.md)
- [Poseidon Hash](./poseidon.md)

View File

@@ -16,7 +16,7 @@ Poseidon starts with the initialization of its internal state, which is composed
This is done to prevent collisions and to prevent certain cryptographic attacks by ensuring that the internal state is sufficiently mixed and unpredictable.
![Alt text](image.png)
![Poseidon initialization of internal state added with pre-generated round constants](https://github.com/ingonyama-zk/icicle/assets/122266060/52257f5d-6097-47c4-8f17-7b6449b9d162)
## Applying full and partial rounds
@@ -26,9 +26,9 @@ To generate a secure hash output, the algorithm goes through a series of "full r
### Full rounds
![Alt text](image-1.png)
![Full round iterations consisting of S box operations, adding round constants, and a Full MDS matrix multiplication](https://github.com/ingonyama-zk/icicle/assets/122266060/e4ce0e98-b90b-4261-b83e-3cd8cce069cb)
**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The functions S-box involves raising each element of the state to a certain power denoted by `α` a member of the finite field defined by the prime `p`; `α` can be different depending on the the implementation and user configuration.
**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The function's S-box involves raising each element of the state to a certain power denoted by `α`, a member of the finite field defined by the prime `p`; `α` can differ depending on the implementation and user configuration.
**Linear Transformation:** After applying the S-box, a linear transformation is performed on the state. This involves multiplying the state by an MDS (Maximum Distance Separable) matrix, which further diffuses the transformations applied by the S-box across the entire state.
@@ -36,14 +36,14 @@ To generate a secure hash output, the algorithm goes through a series of "full r
### Partial Rounds
![Partial round iterations consisting of selective S box operation, adding a round constant and performing an MDS multiplication with a sparse matrix](https://github.com/ingonyama-zk/icicle/assets/122266060/e8c198b4-7aa4-4b4d-9ec4-604e39e07692)
**Selective Application of S-Box:** Partial rounds apply the S-box transformation to only one element of the internal state per round, rather than to all elements. This selective application significantly reduces the computational complexity of the hash function without compromising its security. The choice of which element to apply the S-box to can follow a specific pattern or be fixed, depending on the design of the hash function.
**Linear Transformation and Round Constants:** A linear transformation is performed and round constants are added. The linear transformation in partial rounds can be designed to be less computationally intensive (this is done by using a sparse matrix) than in full rounds, further optimizing the function's efficiency.
The user of Poseidon can often choose how many partial and full rounds to apply; more full rounds increase security but degrade performance. The right balance is highly dependent on the use case.
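To make the difference between full and partial rounds concrete, here is a toy sketch of a single round. It is not the ICICLE implementation: the prime, constants and matrix are placeholders, and the real code operates over the curve's scalar field with the pre-generated constants discussed below.
```rust
// Toy sketch of a Poseidon-style round; NOT the ICICLE implementation.
// The prime, constants and matrix are placeholders chosen only for illustration.
const P: u128 = 0xffff_ffff_0000_0001; // a 64-bit prime used here as a toy field
const ALPHA: u32 = 5;                  // S-box power x^alpha

fn add(a: u128, b: u128) -> u128 { (a + b) % P }
fn mul(a: u128, b: u128) -> u128 { (a * b) % P }
fn sbox(x: u128) -> u128 { (0..ALPHA).fold(1, |acc, _| mul(acc, x)) } // x^alpha mod p

/// One round: add round constants, apply the S-box, then multiply by the matrix.
/// `full == true`  -> S-box on every state element (full round)
/// `full == false` -> S-box on a single state element (partial round)
fn round(state: &mut [u128], round_constants: &[u128], matrix: &[Vec<u128>], full: bool) {
    for (s, rc) in state.iter_mut().zip(round_constants) {
        *s = add(*s, *rc);
    }
    if full {
        for s in state.iter_mut() { *s = sbox(*s); }
    } else {
        state[0] = sbox(state[0]);
    }
    let old = state.to_vec();
    for (i, row) in matrix.iter().enumerate() {
        state[i] = row.iter().zip(&old).fold(0, |acc, (m, s)| add(acc, mul(*m, *s)));
    }
}

fn main() {
    let mut state = vec![1u128, 2, 3];
    let rcs = vec![7u128, 8, 9];          // made-up round constants
    let mds = vec![                        // made-up matrix
        vec![2u128, 1, 1],
        vec![1, 2, 1],
        vec![1, 1, 2],
    ];
    round(&mut state, &rcs, &mds, true);   // full round: S-box on all elements
    round(&mut state, &rcs, &mds, false);  // partial round: S-box on state[0] only
    println!("{state:?}");
}
```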
![Alt text](image-2.png)
## Using Poseidon
ICICLE Poseidon is implemented for GPU and parallelization is performed for each element of the state rather than for each state.
@@ -53,13 +53,14 @@ So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elem
### Supported Bindings
[`Go`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/curves/bn254/poseidon/poseidon.go)
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)
### Constants
Poseidon is extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated and optimized constants for each of the [supported curves](#supported-curves).The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
We support pre-calculated and optimized constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
If you wish to generate your own constants you can use our python script which can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants/generate_parameters.py).
@@ -91,8 +92,6 @@ primitive_element = 7 # bls12-381
# primitive_element = 15 # bw6-761
```
We only support `alpha = 5` so if you want to use another alpha for S-box please reach out on discord or open a github issue.
### Rust API
This is the most basic way to use the Poseidon API.
@@ -101,71 +100,58 @@ This is the most basic way to use the Poseidon API.
let test_size = 1 << 10;
let arity = 2u32;
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon_hash_many::<F>(
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
1, // Output length
&config,
)
.unwrap();
```
The `PoseidonConfig::default()` can be modified, by default the inputs and outputs are set to be on `Host` for example.
The `HashConfig` can be modified; for example, by default the inputs and outputs are set to be on the `Host`.
```rust
impl<'a> Default for PoseidonConfig<'a> {
impl<'a> Default for HashConfig<'a> {
fn default() -> Self {
let ctx = get_default_device_context();
Self {
ctx,
are_inputs_on_device: false,
are_outputs_on_device: false,
input_is_a_state: false,
aligned: false,
loop_state: false,
is_async: false,
}
}
}
```
In the example above `load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
In the example above `Poseidon::load(arity, &ctx).unwrap();` is used, which will load the correct constants based on arity and curve. It's possible to [generate](#constants) your own constants and load them.
```rust
let ctx = get_default_device_context();
let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
let constants_file = PathBuf::from(cargo_manifest_dir)
.join("tests")
.join(format!("{}_constants.bin", field_prefix));
let mut constants_buf = vec![];
File::open(constants_file)
.unwrap()
.read_to_end(&mut constants_buf)
.unwrap();
let mut custom_constants = vec![];
for chunk in constants_buf.chunks(field_bytes) {
custom_constants.push(F::from_bytes_le(chunk));
}
let custom_constants = create_optimized_poseidon_constants::<F>(
arity as u32,
&ctx,
full_rounds_half,
partial_rounds,
&mut custom_constants,
)
.unwrap();
let custom_poseidon = Poseidon::new(
arity, // The arity of poseidon hash. The width will be equal to arity + 1
alpha, // The S-box power
full_rounds_half,
partial_rounds,
round_constants,
mds_matrix,
non_sparse_matrix,
sparse_matrices,
domain_tag,
ctx,
)
.unwrap();
```
## The Tree Builder
@@ -175,21 +161,34 @@ The tree builder allows you to build Merkle trees using Poseidon.
You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.
```rust
let height = 20;
let arity = 2;
let leaves = vec![F::one(); 1 << (height - 1)];
let mut digests = vec![F::zero(); merkle_tree_digests_len(height, arity)];
let mut leaves_slice = HostOrDeviceSlice::on_host(leaves);
let ctx = get_default_device_context();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon::Poseidon;
let mut config = TreeBuilderConfig::default();
config.keep_rows = 1;
build_poseidon_merkle_tree::<F>(&mut leaves_slice, &mut digests, height, arity, &constants, &config).unwrap();
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
println!("Root: {:?}", digests[0..1][0]);
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon::load(2, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```
Similar to Poseidon, you can also configure the Tree Builder via `TreeBuilderConfig::default()`

View File

@@ -0,0 +1,88 @@
# Poseidon2
[Poseidon2](https://eprint.iacr.org/2023/323) is a recently released optimized version of Poseidon1. The two versions differ in two crucial points. First, Poseidon is a sponge hash function, while Poseidon2 can be either a sponge or a compression function depending on the use case. Secondly, Poseidon2 is instantiated by new and more efficient linear layers with respect to Poseidon. These changes decrease the number of multiplications in the linear layer by up to 90% and the number of constraints in Plonk circuits by up to 70%. This makes Poseidon2 currently the fastest arithmetization-oriented hash function without lookups.
## Using Poseidon2
ICICLE Poseidon2 is implemented for GPU and parallelization is performed for each state.
We calculate multiple hash-sums over multiple pre-images in parallel, rather than going block by block over the input vector.
For example, for Poseidon2 of width 16, input rate 8, output elements 8 and input of size 1024 * 8, we would expect 1024 * 8 elements of output. Which means each input block would be of size 8, resulting in 1024 Poseidon2 hashes being performed.
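As a quick sanity check of these sizes, the batching arithmetic from the example above looks like this:
```rust
// Sizing sketch for batched Poseidon2 hashing, using the numbers from the example above.
fn main() {
    let rate = 8usize;            // input elements absorbed per hash
    let output_elements = 8usize; // output elements produced per hash
    let input_len = 1024 * rate;  // total number of input elements

    let number_of_hashes = input_len / rate;              // 1024 independent hashes
    let output_len = number_of_hashes * output_elements;  // 1024 * 8 output elements
    println!("{number_of_hashes} hashes, {output_len} output elements");
}
```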
### Supported Bindings
[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon2)
### Constants
Poseidon2 is also extremely customizable and using different constants will produce different hashes, security levels and performance results.
We support pre-calculated constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon2/constants) and are labeled clearly per curve `<curve_name>_poseidon2.h`.
You can also use your own set of constants as shown [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs#L290)
### Rust API
This is the most basic way to use the Poseidon2 API.
```rust
let test_size = 1 << 10;
let width = 16;
let rate = 8;
let ctx = get_default_device_context();
let poseidon = Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let config = HashConfig::default();
let inputs = vec![F::one(); test_size * rate as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
poseidon.hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
rate as u32,
8, // Output length
&config,
)
.unwrap();
```
In the example above `Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();` is used to load the correct constants based on width and curve. Here, the default MDS matrices and diffusion are used. If you want to get a Plonky3 compliant version, set them to `MdsType::Plonky` and `DiffusionStrategy::Montgomery` respectively.
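For instance, based on the call shown above, loading a Plonky3-compatible instance would look like the following sketch (same `width`, `rate` and `ctx` as before):
```rust
// Sketch: load a Plonky3-compatible Poseidon2 instance (same width/rate/ctx as above).
let poseidon_plonky3 =
    Poseidon2::load(width, rate, MdsType::Plonky, DiffusionStrategy::Montgomery, &ctx).unwrap();
```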
## The Tree Builder
Similar to Poseidon1, you can use Poseidon2 in a tree builder.
```rust
use icicle_bn254::tree::Bn254TreeBuilder;
use icicle_bn254::poseidon2::Poseidon2;
let mut config = TreeBuilderConfig::default();
let height = 20; // tree height (assumed for this example)
let arity = 2;
config.arity = arity as u32;
let input_block_len = arity;
let leaves = vec![F::one(); (1 << height) * arity];
let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
let leaves_slice = HostSlice::from_slice(&leaves);
let digests_slice = HostSlice::from_mut_slice(&mut digests);
let ctx = device_context::DeviceContext::default();
let hash = Poseidon2::load(arity, arity, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
let mut config = TreeBuilderConfig::default();
config.keep_rows = 5;
Bn254TreeBuilder::build_merkle_tree(
leaves_slice,
digests_slice,
height,
input_block_len,
&hash,
&hash,
&config,
)
.unwrap();
```

View File

@@ -0,0 +1,96 @@
# Keccak
## Keccak Example
```rust
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use icicle_hash::keccak::{keccak256, HashConfig};
use rand::{self, Rng};
fn main() {
let mut rng = rand::thread_rng();
let initial_data: Vec<u8> = (0..120).map(|_| rng.gen::<u8>()).collect();
println!("initial data: {}", hex::encode(&initial_data));
let input = HostSlice::<u8>::from_slice(initial_data.as_slice());
let mut output = DeviceVec::<u8>::cuda_malloc(32).unwrap();
let mut config = HashConfig::default();
keccak256(input, initial_data.len() as i32, 1, &mut output[..], &mut config).expect("Failed to execute keccak256 hashing");
let mut output_host = vec![0_u8; 32];
output.copy_to_host(HostSlice::from_mut_slice(&mut output_host[..])).unwrap();
println!("keccak256 result: {}", hex::encode(&output_host));
}
```
## Keccak Methods
```rust
pub fn keccak256(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut HashConfig,
) -> IcicleResult<()>
pub fn keccak512(
input: &(impl HostOrDeviceSlice<u8> + ?Sized),
input_block_size: i32,
number_of_blocks: i32,
output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
config: &mut HashConfig,
) -> IcicleResult<()>
```
### Parameters
- **`input`**: A slice containing the input data for the Keccak256 hash function. It can reside in either host memory or device memory.
- **`input_block_size`**: An integer specifying the size of the input data for a single hash.
- **`number_of_blocks`**: An integer specifying the number of results in the hash batch.
- **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A reference to a `HashConfig` object, which contains various configuration options for the Keccak256/Keccak512 operation.
### Return Value
- **`IcicleResult`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.
## HashConfig
The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```rust
pub struct HashConfig<'a> {
pub ctx: DeviceContext<'a>,
pub are_inputs_on_device: bool,
pub are_outputs_on_device: bool,
pub is_async: bool,
}
```
### Fields
- **`ctx`**: Device context containing details like device id and stream.
- **`are_inputs_on_device`**: Indicates if input data is located on the device.
- **`are_outputs_on_device`**: Indicates if output hash is stored on the device.
- **`is_async`**: If true, runs the Keccak256/Keccak512 operation asynchronously.
### Usage
Example initialization with default settings:
```rust
let default_config = HashConfig::default();
```
Customizing the configuration:
```rust
let custom_config = HashConfig {
ctx: custom_device_context,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
};
```

View File

@@ -2,26 +2,24 @@
To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
## `precompute_bases`
## `precompute_points`
Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
```rust
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &HostOrDeviceSlice<Affine<C>>,
precompute_factor: i32,
_c: i32,
ctx: &DeviceContext,
output_bases: &mut HostOrDeviceSlice<Affine<C>>,
pub fn precompute_points<C: Curve + MSM<C>>(
points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
msm_size: i32,
cfg: &MSMConfig,
output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>
```
### Parameters
- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
- **`msm_size`**: The size of a single MSM, used to determine optimal parameters.
- **`cfg`**: The MSM configuration parameters.
- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
#### Returns
@@ -37,22 +35,11 @@ The precomputation process is crucial for optimizing MSM operations, especially
#### Example Usage
```rust
let device_context = DeviceContext::default_for_device(0); // Use the default device
let cfg = MSMConfig::default();
let precompute_factor = 4; // Number of points to precompute
let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
// Precompute the bases using the specified factor
precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
precompute_points(&points, msm_size, &cfg, &mut extended_bases)
.expect("Failed to precompute bases");
```
### Benchmarks
Benchmarks were performed on an Nvidia RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -100,7 +100,7 @@ When performing MSM operations, it's crucial to match the size of the `scalars`
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle reduction and `is_big_triangle=false` will activate iterative reduction.
```rust
...
@@ -144,6 +144,10 @@ msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Parameters for optimal performance
Please refer to the [primitive description](../primitives/msm#choosing-optimal-parameters)
## Support for G2 group
MSM also supports the G2 group.

View File

@@ -180,7 +180,7 @@ where
- **`IcicleResult<()>`**: Will return an error if the operation fails.
### Releaseing the domain
### Releasing the domain
The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.

View File

@@ -67,6 +67,9 @@ where
evals: &mut E,
);
// Method to evaluate the polynomial over the roots-of-unity domain for power-of-two sized domain
fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, domain_log_size: u64, evals: &mut E);
// Method to retrieve a coefficient at a specific index.
fn get_coeff(&self, idx: u64) -> Self::Field;
@@ -228,6 +231,11 @@ let f_x = f.eval(&x); // Evaluate f at x
let domain = [one, two, three];
let mut host_evals = vec![ScalarField::zero(); domain.len()];
f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));
// Evaluate on roots-of-unity-domain
let domain_log_size = 4;
let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
```
### Read coefficients

View File

@@ -163,8 +163,17 @@ const config = {
prism: {
theme: lightCodeTheme,
darkTheme: darkCodeTheme,
additionalLanguages: ['rust', 'go'],
},
image: 'img/logo.png',
announcementBar: {
id: 'announcement', // Any value that will identify this message.
content:
'<strong>🎉 Read our paper on the Polynomials API in ICICLE v2 by clicking <a target="_blank" rel="noopener noreferrer" href="https://eprint.iacr.org/2024/973">here</a>! 🎉</strong>',
backgroundColor: '#ADD8E6', // Light blue background color.
textColor: '#000000', // Black text color.
isCloseable: true, // Defaults to `true`.
},
}),
};

docs/package-lock.json generated
View File

@@ -3680,6 +3680,8 @@
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"dependencies": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -3694,7 +3696,9 @@
"node_modules/ajv-formats/node_modules/json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
},
"node_modules/ajv-keywords": {
"version": "3.5.2",
@@ -16340,14 +16344,13 @@
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz",
"integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==",
"requires": {
"ajv": "^8.0.0"
},
"requires": {},
"dependencies": {
"ajv": {
"version": "8.12.0",
"resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"version": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
"integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
"optional": true,
"peer": true,
"requires": {
"fast-deep-equal": "^3.1.1",
"json-schema-traverse": "^1.0.0",
@@ -16358,7 +16361,9 @@
"json-schema-traverse": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
"integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
"optional": true,
"peer": true
}
}
},

View File

@@ -43,11 +43,21 @@ module.exports = {
label: "NTT",
id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/primitives/keccak",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "Poseidon2 Hash",
id: "icicle/primitives/poseidon2",
},
],
},
{
@@ -100,6 +110,11 @@ module.exports = {
label: "Vector operations",
id: "icicle/golang-bindings/vec-ops",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/golang-bindings/keccak",
},
{
type: "doc",
label: "Multi GPU Support",
@@ -147,6 +162,11 @@ module.exports = {
label: "Vector operations",
id: "icicle/rust-bindings/vec-ops",
},
{
type: "doc",
label: "Keccak Hash",
id: "icicle/rust-bindings/keccak",
},
{
type: "doc",
label: "Multi GPU Support",

View File

@@ -1,6 +1,6 @@
# ZKContainer
We recommend using [ZKContainer](https://ingonyama.com/blog/Immanuel-ZKDC), where we have already preinstalled all the required dependencies, to run Icicle examples.
We recommend using [ZKContainer](https://www.ingonyama.com/blog/product-announcement-zk-containers), where we have already preinstalled all the required dependencies, to run Icicle examples.
To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`.

View File

@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # 'native' requires CMake 3.24+; on earlier versions it is ignored and no architecture is passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation
The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU). On a typical NVIDIA GPU this transfer dominates the total NTT execution time.
## Key-Takeaway
When you have to run several NTTs, consider Concurrent Data Download, Upload, and Computation to improve data bus (PCIe) and GPU utilization, and get better total execution time.
Typically, you concurrently
1. Download the output of the previous NTT back to the host
2. Upload the input for the next NTT to the device
3. Run the current NTT
> [!NOTE]
> This approach requires two on-device memory vectors, decreasing the maximum size of NTT by 2x.
## Best-Practices
1. Use three separate CUDA streams for Download, Upload, and Compute operations
2. Use pinned (page-locked) memory on host to speed data bus transfers. Calling `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save on device memory.
## Running the example
To change the default curve BN254, edit `compile.sh` and `CMakeLists.txt`
```sh
./compile.sh
./run.sh
```
To compare with ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,149 @@
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
using namespace ntt;
const std::string curve = "BN254";
typedef scalar_t S;
typedef scalar_t E;
const unsigned max_log_ntt_size = 27;
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
elements[i] = E::from(i + 1);
}
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
cudaDeviceReset();
cudaDeviceProp deviceProperties;
int deviceId = 0;
cudaGetDeviceProperties(&deviceProperties, deviceId);
std::string gpu_full_name = deviceProperties.name;
std::cout << gpu_full_name << std::endl;
std::string gpu_name = gpu_full_name;
std::cout << "Curve: " << curve << std::endl;
S basic_root = S::omega(max_log_ntt_size);
// change these parameters to match the desired NTT size and batch size
const unsigned log_ntt_size = 22;
const unsigned nof_ntts = 16;
std::cout << "log NTT size: " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "Batch size: " << nof_ntts << std::endl;
// Create separate CUDA streams for overlapping data transfers and kernel execution.
cudaStream_t stream_compute, stream_h2d, stream_d2h;
cudaStreamCreate(&stream_compute);
cudaStreamCreate(&stream_h2d);
cudaStreamCreate(&stream_d2h);
// Create device context for NTT computation
auto ctx_compute = device_context::DeviceContext{
stream_compute, // stream
0, // device_id
0, // mempool
};
// Initialize NTT domain and configuration
bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
config_compute.batch_size = nof_ntts;
config_compute.are_inputs_on_device = true;
config_compute.are_outputs_on_device = true;
config_compute.is_async = true;
std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
int nof_blocks = 32;
std::cout << "Number of blocks: " << nof_blocks << std::endl;
int block_size = ntt_size * nof_ntts / nof_blocks;
// on-host pinned data
E* h_inp[2];
E* h_out[2];
for (int i = 0; i < 2; i++) {
cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
}
// on-device in-place data
// we need two on-device vectors to overlap data transfers with NTT kernel execution
E* d_vec[2];
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
}
// initialize input data
initialize_input(ntt_size, nof_ntts, h_inp[0]);
initialize_input(ntt_size, nof_ntts, h_inp[1]);
cudaEvent_t compute_start, compute_stop;
cudaEventCreate(&compute_start);
cudaEventCreate(&compute_stop);
for (int run = 0; run < 10; run++) {
int vec_compute = run % 2;
int vec_transfer = (run + 1) % 2;
std::cout << "Run: " << run << std::endl;
std::cout << "Compute Vector: " << vec_compute << std::endl;
std::cout << "Transfer Vector: " << vec_transfer << std::endl;
START_TIMER(inplace);
cudaEventRecord(compute_start, stream_compute);
bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
cudaEventRecord(compute_stop, stream_compute);
// we have to delay upload to device relative to download from device by one block: preserve write after read
for (int i = 0; i <= nof_blocks; i++) {
if (i < nof_blocks) {
cudaMemcpyAsync(
&h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
cudaMemcpyDeviceToHost, stream_d2h);
}
if (i > 0) {
cudaMemcpyAsync(
&d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
}
// synchronize upload and download at the end of the block to ensure data integrity
cudaStreamSynchronize(stream_d2h);
cudaStreamSynchronize(stream_h2d);
}
// synchronize compute stream with the end of the computation
cudaEventSynchronize(compute_stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
END_TIMER(inplace, "Concurrent In-Place NTT");
std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
};
// Clean-up
for (int i = 0; i < 2; i++) {
cudaFree(d_vec[i]);
cudaFreeHost(h_inp[i]);
cudaFreeHost(h_out[i]);
}
cudaEventDestroy(compute_start);
cudaEventDestroy(compute_stop);
cudaStreamDestroy(stream_compute);
cudaStreamDestroy(stream_d2h);
cudaStreamDestroy(stream_h2d);
return 0;
}

View File

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example

View File

@@ -16,7 +16,7 @@ int main(int argc, char* argv[])
int N = batch_size * msm_size;
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
@@ -43,7 +43,7 @@ int main(int argc, char* argv[])
false, // is_async
};
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
cudaStream_t stream = config.ctx.stream;
// Execute the MSM kernel

View File

@@ -6,140 +6,147 @@
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
void checkCudaError(cudaError_t error)
{
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
}
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
checkCudaError(err);
void threadPoseidon(
device_context::DeviceContext ctx,
unsigned size_partition,
scalar_t* layers,
scalar_t* column_hashes,
Poseidon<scalar_t> * poseidon)
{
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
HashConfig column_config = default_hash_config(ctx);
cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t) size_partition, size_col, 1, column_config);
checkCudaError(err);
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define CHECK_ALLOC(ptr) \
if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
int main()
{
const unsigned size_row = (1 << 30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
<< "/" << memory.free / 1024 / 1024 << std::endl;
}
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id = 0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id = 1;
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col * size_partition; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return err_result;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
Poseidon<scalar_t> column_poseidon1(size_col, ctx1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel, "2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
thread3.join();
END_TIMER(sequential, "1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
nvmlShutdown();
return 0;
nvmlShutdown();
return 0;
}

View File

@@ -17,7 +17,7 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
@@ -100,7 +100,7 @@ int main(int argc, char** argv)
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
@@ -151,7 +151,7 @@ int main(int argc, char** argv)
// validate multiplication here...
// clean up and exit
free(host_in1);
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1);

View File

@@ -60,8 +60,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
@@ -89,16 +89,16 @@ int main(int argc, char* argv[])
bn254_initialize_domain(&basic_root, ctx, true);
// Create an NTTConfig instance
NTTConfig<S> config = default_ntt_config<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
config.ntt_algorithm = NttAlgorithm::Radix2;
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");

View File

@@ -11,49 +11,47 @@ using namespace bn254;
typedef point_field_t T;
// modular power
T modPow(T base, T exp) {
T modPow(T base, T exp)
{
T r = T::one();
T b = base;
T e = exp;
while (e != T::zero()) {
// If exp is odd, multiply the base with result
if (T::is_odd(e)) {
r = r * b;
}
// Now exp must be even, divide it by 2
e =T::div2(e);
b = b * b;
// If exp is odd, multiply the base with result
if (T::is_odd(e)) { r = r * b; }
// Now exp must be even, divide it by 2
e = T::div2(e);
b = b * b;
}
return r;
}
// Check if y2 is a quadratic residue using Euler's Criterion
bool quadratic_residue(T y2) {
return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
}
bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); }
// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
bool mySQRT(T a, T *result) {
bool mySQRT(T a, T* result)
{
if (a == T::zero()) {
*result = T::zero();
return true;
}
if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
if (modPow(a, T::div2(T::zero() - T::one())) != T::one()) {
return false; // solution does not exist
}
// TODO: consider special cases
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); //2;
T r = T::zero();
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); // 2;
T r = T::zero();
T m;
while (T::is_even(s)) {
r = r + T::one();
s = T::div2(s); //s /= 2;
s = T::div2(s); // s /= 2;
}
// find a non-square mod p
while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
while (modPow(n, T::div2((T::zero() - T::one()))) != T::zero() - T::one()) {
n = n + T::one();
}
T x = modPow(a, T::div2(s + T::one()));
@@ -61,83 +59,86 @@ bool mySQRT(T a, T *result) {
T g = modPow(n, s);
for (;; r = m) {
T t = b;
for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t;
if (m == T::zero() ) {
for (m = T::zero(); T::lt(m, r) /* m < r*/ && t != T::one(); m = m + T::one())
t = t * t;
if (m == T::zero()) {
*result = x;
return true;
}
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
g = gs * gs ;
x = x * gs ;
b = b * g ;
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()));
g = gs * gs;
x = x * gs;
b = b * g;
}
}
void point_near_x(T x, affine_t *point) {
const T wb = T { weierstrass_b };
void point_near_x(T x, affine_t* point)
{
const T wb = T{weierstrass_b};
T y2;
while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
{
while (y2 = x * x * x + wb, quadratic_residue(y2) == false) {
x = x + T::one();
};
T y;
bool found = mySQRT(y2, &y);
assert(y*y == y2);
assert(y * y == y2);
point->x = x;
point->y = y;
}
static int seed = 0;
static T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
T value;
for (unsigned i = 0; i < T::TLC-1 ; i++)
static HOST_INLINE T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
T value;
for (unsigned i = 0; i < T::TLC - 1; i++)
// TODO: use the full range of limbs: for (unsigned i = 0; i < T::TLC ; i++)
value.limbs_storage.limbs[i] = distribution(generator);
// while (lt(Field{get_modulus()}, value))
// value = value - Field{get_modulus()};
return value;
}
value.limbs_storage.limbs[i] = distribution(generator);
// while (lt(Field{get_modulus()}, value))
// value = value - Field{get_modulus()};
return value;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
const unsigned N = pow(2, 10);
std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
T* xs = new T[N+1];
T* xs = new T[N + 1];
std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment"
<< std::endl;
seed = 1234;
std::cout << "Using seed: " << seed << std::endl;
std::cout << "Generating random field values" << std::endl;
START_TIMER(gen);
for (unsigned i = 0; i < N; i++) {
xs[i] = rand_host_seed();
}
END_TIMER(gen, "Time to generate field values");
std::cout << "xs[0]: " << xs[0] << std::endl;
std::cout << "xs[1]: " << xs[1] << std::endl;
std::cout << "xs[0]: " << xs[0] << std::endl;
std::cout << "xs[1]: " << xs[1] << std::endl;
// affine_t points[N];
affine_t* points = new affine_t[N+1];
affine_t* points = new affine_t[N + 1];
std::cout << "Generating point about random field values" << std::endl;
START_TIMER(points);
for (unsigned i = 0; i < N+1; i++) {
for (unsigned i = 0; i < N + 1; i++) {
point_near_x(xs[i], &points[i]);
}
END_TIMER(points, "Time to generate points");
std::cout << "Generating commitment vector" << std::endl;
projective_t result;
scalar_t* scalars = new scalar_t[N+1];
scalar_t* scalars = new scalar_t[N + 1];
scalar_t::rand_host_many(scalars, N);
std::cout << "Generating salt" << std::endl;
@@ -146,7 +147,7 @@ int main(int argc, char** argv)
std::cout << "Executing MSM" << std::endl;
auto config = msm::default_msm_config();
START_TIMER(msm);
bn254_msm_cuda(scalars, points, N+1, config, &result);
bn254_msm_cuda(scalars, points, N + 1, config, &result);
END_TIMER(msm, "Time to execute MSM");
std::cout << "Computed commitment: " << result << std::endl;

View File

@@ -23,5 +23,8 @@ set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(example PRIVATE "../../../icicle/include")
# can link to another curve/field by changing the following lib and FIELD_ID
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
target_link_libraries(example
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a
${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a
)
target_compile_definitions(example PUBLIC FIELD_ID BN254)

View File

@@ -1,13 +1,14 @@
#include <iostream>
#include <cassert>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
#include "poseidon/tree/merkle.cuh"
// using namespace field_config;
#include "api/bn254.h"
#include <chrono>
using namespace polynomials;
using namespace merkle;
using namespace bn254;
// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
@@ -21,6 +22,27 @@ const auto four = scalar_t::from(4);
const auto five = scalar_t::from(5);
const auto minus_one = zero - one;
static std::unique_ptr<scalar_t[]> generate_pows(scalar_t tau, uint32_t size){
auto vec = std::make_unique<scalar_t[]>(size);
vec[0] = scalar_t::one();
for (size_t i = 1; i < size; ++i) {
vec[i] = vec[i-1] * tau;
}
return std::move(vec);
}
static std::unique_ptr<affine_t[]> generate_SRS(uint32_t size) {
auto secret_scalar = scalar_t::rand_host();
auto gen = projective_t::generator();
auto pows_of_tau = generate_pows(secret_scalar,size);
auto SRS = std::make_unique<affine_t[]>(size);
for (size_t i = 0; i < size; ++i) {
SRS[i] = projective_t::to_affine(pows_of_tau[i] * gen);
}
return std::move(SRS);
}
void example_evaluate()
{
std::cout << std::endl << "Example: Polynomial evaluation on random value" << std::endl;
@@ -133,7 +155,7 @@ void example_multiplication(const int log0, const int log1)
std::cout << "multiply and evaluate: " << mx << std::endl;
}
void example_multiplicationScalar(const int log0)
void example_multiplication_scalar(const int log0)
{
std::cout << std::endl << "Example: Scalar by Polynomial multiplication" << std::endl;
const int size = 1 << log0;
@@ -163,7 +185,7 @@ void example_monomials()
std::cout << "Expected f'(x) = " << expected_addmonmon_f_x << std::endl;
}
void example_ReadCoeffsToHost()
void example_read_coeffs_to_host()
{
std::cout << std::endl << "Example: Read coefficients to host" << std::endl;
const scalar_t coeffs_f[3] = {zero, one, two}; // 0+1x+2x^2
@@ -190,7 +212,7 @@ void example_ReadCoeffsToHost()
}
}
void example_divisionSmall()
void example_division_small()
{
std::cout << std::endl << "Example: Polynomial division (small)" << std::endl;
const scalar_t coeffs_a[4] = {five, zero, four, three}; // 3x^3+4x^2+5
@@ -208,7 +230,7 @@ void example_divisionSmall()
std::cout << "Reminder: 1:" << r_coeffs[1] << " expected: " << scalar_t::from(3) << std::endl;
}
void example_divisionLarge(const int log0, const int log1)
void example_division_large(const int log0, const int log1)
{
std::cout << std::endl << "Example: Polynomial division (large)" << std::endl;
const int size0 = 1 << log0, size1 = 1 << log1;
@@ -226,7 +248,7 @@ void example_divisionLarge(const int log0, const int log1)
std::cout << "rhs = " << bx * qx + rx << std::endl;
}
void example_divideByVanishingPolynomial()
void example_divide_by_vanishing_polynomial()
{
std::cout << std::endl << "Example: Polynomial division by vanishing polynomial" << std::endl;
const scalar_t coeffs_v[5] = {minus_one, zero, zero, zero, one}; // x^4-1 vanishes on 4th roots of unity
@@ -254,7 +276,8 @@ void example_clone(const int log0)
std::cout << "h(x) = " << h(x) << " expected: " << g(x) << std::endl;
}
void example_EvenOdd() {
void example_even_odd()
{
std::cout << std::endl << "Example: Split into even and odd powers " << std::endl;
const scalar_t coeffs[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
auto f = Polynomial_t::from_coefficients(coeffs, 4);
@@ -270,38 +293,129 @@ void example_EvenOdd() {
std::cout << "Odd: 1:" << odd_coeffs[1] << " expected: " << four << std::endl;
}
void example_Slice() {
void example_slice()
{
std::cout << std::endl << "Example: Slice polynomial " << std::endl;
const scalar_t coeffs[4] = {one, two, three, four}; // 1+2x+3x^2+4x^3
auto f = Polynomial_t::from_coefficients(coeffs, 4);
auto f_slice = f.slice(0/*=offset*/, 3/*=stride*/, 2/*=size*/); // 1+4x
auto f_slice = f.slice(0 /*=offset*/, 3 /*=stride*/, 2 /*=size*/); // 1+4x
scalar_t slice_coeffs[2] = {0};
const auto slice_nof_coeffs = f_slice.copy_coeffs(slice_coeffs, 0, 1);
std::cout << "Slice: 0:" << slice_coeffs[0] << " expected: " << one << std::endl;
std::cout << "Slice: 1:" << slice_coeffs[1] << " expected: " << four << std::endl;
}
}
void example_DeviceMemoryView() {
void example_device_memory_view()
{
const int log_size = 6;
const int size = 1 << log_size;
auto f = randomize_polynomial(size);
auto [d_coeff, N, device_id] = f.get_coefficients_view();
// commit coefficients to Merkle tree
device_context::DeviceContext ctx = device_context::get_default_device_context();
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(2, ctx, &constants);
uint32_t tree_height = log_size + 1;
int keep_rows = 0; // keep all rows
size_t digests_len = log_size - 1;
scalar_t* digests = static_cast<scalar_t*>(malloc(sizeof(scalar_t) * digests_len));
TreeBuilderConfig config = default_merkle_config();
config.keep_rows = keep_rows;
config.are_inputs_on_device = true;
build_merkle_tree<scalar_t, (2+1)>(d_coeff.get(), digests, tree_height, constants, config);
std::cout << "Merkle tree root: " << digests[0] << std::endl;
free(digests);
auto [d_coeffs, N, device_id] = f.get_coefficients_view();
// compute coset evaluations
auto coset_evals = std::make_unique<scalar_t[]>(size);
auto ntt_config = ntt::default_ntt_config<scalar_t>();
ntt_config.are_inputs_on_device = true; // using the device data directly as a view
ntt_config.coset_gen = ntt::get_root_of_unity<scalar_t>(size * 2);
ntt::ntt(d_coeffs.get(), size, ntt::NTTDir::kForward, ntt_config, coset_evals.get());
}
void example_commit_with_device_memory_view()
{
//declare time vars
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
std::chrono::milliseconds duration;
std::cout << std::endl << "Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2 ]_1 = [4 (f1^2+ f_2^2)]_1" << std::endl;
std::cout<< "Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2 ]_1 = [4 f1 *f_2]_1" << std::endl;
int N = 1025;
//generate group elements string of length N: (1, beta,beta^2....,beta^{N-1}). g
std::cout << "Setup: Generating mock SRS" << std::endl;
start = std::chrono::high_resolution_clock::now();
auto SRS = generate_SRS(2*N);
//Allocate memory on device (points)
affine_t* points_d;
cudaMalloc(&points_d, sizeof(affine_t)* 2 * N);
// copy SRS to device (could have generated on device, but gives an indicator)
cudaMemcpy(points_d, SRS.get(), sizeof(affine_t)* 2 * N, cudaMemcpyHostToDevice);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: SRS of length "<< N << " generated and loaded to device. Took: " << duration.count() << " milliseconds" << std::endl;
//goal:
//test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1
//test commitment equality [(f1+f2)^2 - (f1-f2)^2]_1 = [4 f1 * f2]_1
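//since the MSM commitment is linear and deterministic in the coefficients, equal polynomials give equal commitments:
//  (f1+f2)^2 + (f1-f2)^2 = 2*(f1^2 + f2^2),   (f1+f2)^2 - (f1-f2)^2 = 4*f1*f2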
//note: using polyapi to gen scalars: already on device.
std::cout << "Setup: Generating polys (on device) f1,f2 of log degree " << log2(N-1) << std::endl;
start = std::chrono::high_resolution_clock::now();
auto f1 = randomize_polynomial(N);
auto f2 = randomize_polynomial(N);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Setup: Gen poly done. Took: " << duration.count() << " milliseconds" << std::endl;
//deg 2N constraints (f1+f2)^2 + (f1-f2)^2 = 2 (f1^2+ f_2^2)
std::cout << "Computing constraints..start "<< std::endl;
start = std::chrono::high_resolution_clock::now();
auto L1 = (f1+f2)*(f1+f2) + (f1-f2)*(f1-f2);
auto R1 = scalar_t::from(2) * (f1*f1 + f2*f2);
//deg 2N constraints (f1+f2)^2 - (f1-f2)^2 = 4 f1 *f_2
auto L2 = (f1+f2)*(f1+f2) - (f1-f2)*(f1-f2);
auto R2 = scalar_t::from(4) * f1 * f2;
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Computing constraints..done. Took: " << duration.count() << " milliseconds"<< std::endl;
// extract coeff using coeff view
auto [viewL1, sizeL1, device_idL1] = L1.get_coefficients_view();
auto [viewL2, sizeL2, device_idL2] = L2.get_coefficients_view();
auto [viewR1, sizeR1, device_idR1] = R1.get_coefficients_view();
auto [viewR2, sizeR2, device_idR2] = R2.get_coefficients_view();
std::cout << "Computing Commitments with poly view"<< std::endl;
start = std::chrono::high_resolution_clock::now();
msm::MSMConfig config = msm::default_msm_config();
config.are_points_on_device = true;
config.are_scalars_on_device = true;
//host vars (for result)
projective_t hL1{}, hL2{}, hR1{}, hR2{};
//straightforward msm bn254 api: no batching
bn254_msm_cuda(viewL1.get(),points_d,N,config,&hL1);
bn254_msm_cuda(viewL2.get(),points_d,N,config,&hL2);
bn254_msm_cuda(viewR1.get(),points_d,N,config,&hR1);
bn254_msm_cuda(viewR2.get(),points_d,N,config,&hR2);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Commitments done. Took: " << duration.count() << " milliseconds"<< std::endl;
//sanity checks
auto affL1 = projective_t::to_affine(hL1);
auto affR1 = projective_t::to_affine(hR1);
auto affL2 = projective_t::to_affine(hL2);
auto affR2 = projective_t::to_affine(hR2);
//test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1
assert(affL1.x==affR1.x && affL1.y==affR1.y);
std::cout << "commitment [(f1+f2)^2 + (f1-f2)^2]_1:" << std::endl;
std::cout << "[x: " << affL1.x << ", y: " << affL1.y << "]" << std::endl;
std::cout << "commitment [[2 (f_1^2+f_2^2]_1:" <<std::endl;
std::cout << "[x: " << affR1.x << ", y: " << affR1.y << "]" << std::endl;
assert(affL2.x==affR2.x && affL2.y==affR2.y);
std::cout << "commitment [(f1+f2)^2 - (f1-f2)^2]_1:"<< std::endl;
std::cout << "[x: " << affL2.x << ", y: " << affL2.y << "]" << std::endl;
std::cout << "commitment [4 f_1*f_2]_1:"<<std::endl;
std::cout << "[x: " << affR2.x << ", y: " << affR2.y << "]" << std::endl;
}
int main(int argc, char** argv)
{
// Initialize NTT. TODO: can we hide this in the library?
@@ -319,15 +433,16 @@ int main(int argc, char** argv)
example_addition(12, 17);
example_addition_inplace(2, 2);
example_multiplication(15, 12);
example_multiplicationScalar(15);
example_multiplication_scalar(15);
example_monomials();
example_ReadCoeffsToHost();
example_divisionSmall();
example_divisionLarge(12, 2);
example_divideByVanishingPolynomial();
example_EvenOdd();
example_Slice();
example_DeviceMemoryView();
example_read_coeffs_to_host();
example_division_small();
example_division_large(12, 2);
example_divide_by_vanishing_polynomial();
example_even_odd();
example_slice();
example_device_memory_view();
example_commit_with_device_memory_view();
return 0;
}

View File

@@ -82,10 +82,10 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig config{
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false // is_async
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false // is_async
};
CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));

View File

@@ -4,6 +4,8 @@
#include "api/bn254.h"
#include "curves/params/bn254.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
@@ -14,12 +16,12 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level
// We assume the tree has leaves already set, compute all other levels
void build_tree(
const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t> * constants, PoseidonConfig config)
const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, HashConfig &config)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
poseidon.hash_many(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, 1, config);
}
}
@@ -37,11 +39,7 @@ uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_he
}
void generate_proof(
uint32_t position,
scalar_t* tree,
const uint32_t tree_height,
uint32_t* proof_lr,
scalar_t* proof_hash)
uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash)
{
uint32_t level_index = position;
for (uint32_t level = tree_height - 1; level > 0; level--) {
@@ -68,8 +66,8 @@ uint32_t validate_proof(
const uint32_t tree_height,
const uint32_t* proof_lr,
const scalar_t* proof_hash,
PoseidonConstants<scalar_t> * constants,
PoseidonConfig config)
Poseidon<scalar_t> &poseidon,
HashConfig &config)
{
scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
@@ -82,7 +80,7 @@ uint32_t validate_proof(
hashes_in[1] = level_hash;
}
// next level hash
bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
poseidon.hash_many(hashes_in, hash_out, 1, 2, 1, config);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
@@ -112,16 +110,15 @@ int main(int argc, char* argv[])
d = d + scalar_t::one();
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config(data_arity+1);
bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
Poseidon<scalar_t> poseidon(data_arity, ctx);
HashConfig config = default_hash_config(ctx);
poseidon.hash_many(data, &tree[tree_index(leaf_level, 0)], tree_width, data_arity, 1, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config(tree_arity+1);
build_tree(tree_height, tree, &tree_constants, tree_config);
Poseidon<scalar_t> tree_poseidon(tree_arity, ctx);
HashConfig tree_config = default_hash_config(ctx);
build_tree(tree_height, tree, tree_poseidon, tree_config);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width - 1;
@@ -136,12 +133,12 @@ int main(int argc, char* argv[])
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const scalar_t tampered_hash = hash + scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;

View File

@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -DFIELD_ID=1001")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(example PRIVATE "../../../icicle/include")
# can link to another curve/field by changing the following lib and FIELD_ID
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_babybear.a)
# target_compile_definitions(example PUBLIC FIELD_ID babybear)

View File

@@ -0,0 +1,44 @@
# ICICLE example: RISC0's Fibonacci sequence proof using Polynomial API
## Why RISC0?
[RISC0 Protocol](https://www.risczero.com/) creates computational integrity proofs (a.k.a. Zero Knowledge Proofs) for programs executing on RISC-V architecture.
The proofs are created for sequences of values in RISC-V registers, called execution traces.
This approach is transparent to developers and enables the use of general purpose languages.
## Best-Practices
This example builds on the [ICICLE Polynomial API](../polynomial-api/README.md), so we recommend running that example first.
## Key-Takeaway
RISC0 encodes execution traces into very large polynomials and commits them using Merkle trees.
FRI speeds up validation of such commitments by recursively generating smaller polynomials (and trees) from larger ones.
The key enabler for *recursion* is the *redundancy* of polynomial commitments, hence the use of Reed-Solomon codes.
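As a sketch of the folding step (the same even/odd split the example code uses), each FRI round splits the current polynomial into its even- and odd-power parts and mixes them with a verifier challenge $r_i$, roughly halving the degree:

$$f_i(x) = f_i^{\mathrm{even}}(x^2) + x \cdot f_i^{\mathrm{odd}}(x^2), \qquad f_{i+1}(y) = f_i^{\mathrm{even}}(y) + r_i \cdot f_i^{\mathrm{odd}}(y).$$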
## Running the example
To run the example, from the project root directory:
```sh
cd examples/c++/risc0
./compile.sh
./run.sh
```
## What's in the example
The example follows [STARK by Hand](https://dev.risczero.com/proof-system/stark-by-hand), structured in the following Lessons:
1. The Execution Trace
2. Rule checks to validate a computation
3. Padding the Trace
4. Constructing Trace Polynomials
5. ZK Commitments of the Trace Data
6. Constraint Polynomials
7. Mixing Constraint Polynomials
8. The Core of the RISC Zero STARK
9. The DEEP Technique
10. Mixing (Batching) for FRI
11. FRI Protocol (Commit Phase)
12. FRI Protocol (Query Phase)

15
examples/c++/risc0/compile.sh Executable file
View File

@@ -0,0 +1,15 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DFIELD=babybear
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -0,0 +1,275 @@
#include <iostream>
#include <memory>
#include <vector>
#include <list>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
using namespace polynomials;
// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
// RISC-V register type
typedef int64_t rv_t;
// Convert RISC-V registers to Finite Fields
void to_ff(rv_t* rv, scalar_t* s, size_t n) {
for (int i = 0; i < n; ++i) {
s[i] = scalar_t::from(rv[i]);
}
}
void p_print(Polynomial_t * p, int logn, scalar_t shift, std::string header = "Print Vector") {
std::cout << header << std::endl;
auto n = 1 << logn;
auto omega = scalar_t::omega(logn);
auto x = shift;
for (int i = 0; i < n; ++i) {
std::cout << i << ": " << (*p)(x) << std::endl;
x = x*omega;
}
}
// value to polynomial
Polynomial_t p_value(scalar_t value) {
auto p_value = Polynomial_t::from_coefficients(&value , 1);
return p_value;
}
Polynomial_t p_rotate(Polynomial_t* p, int logn) {
// rotate polynomial coefficients right by one position
auto n = 1 << logn;
auto evaluations_rou_domain = std::make_unique<scalar_t[]>(n);
p->evaluate_on_rou_domain(logn, evaluations_rou_domain.get() );
scalar_t tmp = evaluations_rou_domain[n-1];
for (int i = n-1; i > 0; --i) {
evaluations_rou_domain[i] = evaluations_rou_domain[i-1];
}
evaluations_rou_domain[0] = tmp;
return Polynomial_t::from_rou_evaluations(evaluations_rou_domain.get(), n);
}
// mix polynomials (c.f. mix polynomial evaluations)
Polynomial_t p_mix(Polynomial_t* in[], size_t nmix, scalar_t mix_parameter) {
scalar_t factor = mix_parameter;
Polynomial_t out = in[0]->clone();
for (int i = 1; i < nmix; ++i) {
out += factor * (*in[i]);
factor = factor * mix_parameter;
}
return out;
}
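// solve_linear: coefficients (coeffs[0], coeffs[1]) of the line y = coeffs[0] + coeffs[1]*x through
// (xa, ya) and (xb, yb); used in Lesson 9 to build the "bar" interpolants for d2 and d3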
void solve_linear(scalar_t xa, scalar_t ya, scalar_t xb, scalar_t yb, scalar_t * coeffs) {
coeffs[1] = (ya - yb) * scalar_t::inverse(xa - xb);
coeffs[0] = ya - coeffs[1] * xa;
}
std::unique_ptr<scalar_t[]> InterpolateOnLargerDomain(Polynomial_t * p, int n, scalar_t shift = scalar_t::one()) {
const int deg = p->degree();
auto input = std::make_unique<scalar_t[]>(n);
// TBD: check if scalar_t constructor initializes to zero
for (int i = 0; i < n; ++i) {
input[i] = scalar_t::zero();
}
p->copy_coeffs(input.get(), 0/*start*/, deg);
auto ntt_config = ntt::default_ntt_config<scalar_t>();
ntt_config.coset_gen = shift;
auto evals_h = std::make_unique<scalar_t[]>(n);
auto err = ntt::ntt(input.get(), n, ntt::NTTDir::kForward, ntt_config, evals_h.get());
return evals_h;
}
int main(int argc, char** argv)
{
std::cout << "This is an ICICLE C++ implementation of the STARK by Hand Explainer." << std::endl;
std::cout << "https://dev.risczero.com/proof-system/stark-by-hand" << std::endl;
const int logn=3;
const int n = 1 << logn;
std::cout << "Initializing NTT" << std::endl;
static const int MAX_NTT_LOG_SIZE = 24;
auto ntt_config = ntt::default_ntt_config<scalar_t>();
const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx);
std::cout << "Initializing Polynomials" << std::endl;
// Virtual factory design pattern: initializing polynomimals factory for CUDA backend
Polynomial_t::initialize(std::make_unique<CUDAPolynomialFactory<>>());
std::cout << std::endl << "Lesson 1: The Execution Trace" << std::endl;
// Trace: Data Columns
rv_t rv_d1_trace[] = {24, 30, 54, 84, 78, 15, 29, 50};
rv_t rv_d2_trace[] = {30, 54, 84, 138, 2, 77, 21, 36};
rv_t rv_d3_trace[] = {54, 84, 138, 222, 71, 17, 92, 33};
auto d1_trace = std::make_unique<scalar_t[]>(n);
auto d2_trace = std::make_unique<scalar_t[]>(n);
auto d3_trace = std::make_unique<scalar_t[]>(n);
to_ff(rv_d1_trace, d1_trace.get(), n);
to_ff(rv_d2_trace, d2_trace.get(), n);
to_ff(rv_d3_trace, d3_trace.get(), n);
// Trace: Control Columns
// Init steps are flagged in c1_trace
// Computation steps are flagged in c2_trace
// Termination step is flagged in c3_trace
// 0s at the end of each control column correspond to the padding of the trace
rv_t rv_c1_trace[] = {1, 0, 0, 0, 0, 0, 0, 0};
rv_t rv_c2_trace[] = {0, 1, 1, 1, 0, 0, 0, 0};
rv_t rv_c3_trace[] = {0, 0, 0, 1, 0, 0, 0, 0};
auto c1_trace = std::make_unique<scalar_t[]>(n);
auto c2_trace = std::make_unique<scalar_t[]>(n);
auto c3_trace = std::make_unique<scalar_t[]>(n);
to_ff(rv_c1_trace, c1_trace.get(), n);
to_ff(rv_c2_trace, c2_trace.get(), n);
to_ff(rv_c3_trace, c3_trace.get(), n);
std::cout << "Lesson 2: Rule checks to validate a computation" << std::endl;
std::cout << "We use rule-checking polynomials." << std::endl;
std::cout << "Lesson 3: Padding the Trace" << std::endl;
// The trace is padded to a power of 2 size to allow for efficient NTT operations.
// we already did this in the initialization of the trace data
// We will construct a zero-knowledge proof that:
// this trace represents a program that satisfies these 6 rules:
// 1) d3_trace[i] == d1_trace[i] + d2_trace[i] wherever a control flag is set (Fibonacci constraint)
// 2) d1_trace[0] == 24 (init 1 constraint)
// 3) d2_trace[0] == 30 (init 2 constraint)
// 4) d3_trace[3] == 222 (termination constraint)
// 5) if c2_trace[i] == 1, then d2_trace[i] == d1_trace[i+1]
// 6) if c2_trace[i] == 1, then d3_trace[i] == d2_trace[i+1]
std::cout << "Lesson 4: Constructing Trace Polynomials" << std::endl;
auto p_d1 = Polynomial_t::from_rou_evaluations(d1_trace.get(), n);
auto p_d2 = Polynomial_t::from_rou_evaluations(d2_trace.get(), n);
auto p_d3 = Polynomial_t::from_rou_evaluations(d3_trace.get(), n);
auto p_c1 = Polynomial_t::from_rou_evaluations(c1_trace.get(), n);
auto p_c2 = Polynomial_t::from_rou_evaluations(c2_trace.get(), n);
auto p_c3 = Polynomial_t::from_rou_evaluations(c3_trace.get(), n);
std::cout << "Lesson 5: ZK Commitments of the Trace Data" << std::endl;
std::cout << "To maintain a zk protocol, the trace polynomials are evaluated over a zk commitment domain" << std::endl;
std::cout << "zk commitment domain is a coset of Reed Solomon domain shifted by a basic root of unity" << std::endl;
scalar_t xzk = basic_root;
p_print(&p_d1, logn, xzk, "ZK commitment for d1 polynomial");
std::cout << "Build Merkle Tree for ZK commitments (outside the scope of this example)" << std::endl;
std::cout << "Lesson 6: Constraint Polynomials" << std::endl;
std::cout << "The constraints are used to check the correctness of the trace. In this example, we check 6 rules to establish the validity of the trace." << std::endl;
auto p_fib_constraint = (p_d3 - p_d2 - p_d1) * (p_c1 + p_c2 + p_c3);
auto fib_constraint_zkcommitment = InterpolateOnLargerDomain(&p_fib_constraint, 4*n, xzk);
auto p_init1_constraint = (p_d1 - p_value(scalar_t::from(24))) * p_c1;
// sanity checks printing
p_print(&p_init1_constraint, logn+2, scalar_t::one(), "Reed-Solomon constraint polynomial gives 0s in every 4th row");
p_print(&p_init1_constraint, logn+2, xzk, "ZK Commitment constraint polynomial gives no 0s");
auto p_init2_constraint = (p_d2 - p_value(scalar_t::from(30))) * p_c1;
auto p_termination_constraint = (p_d3 - p_value(scalar_t::from(222))) * p_c3;
auto p_recursion_constraint1 = (p_d1 - p_rotate(&p_d2, logn)) * p_c2;
auto p_recursion_constraint2 = (p_d2 - p_rotate(&p_d3, logn)) * p_c2;
std::cout << std::endl << "Lesson 7: Mixing Constraint Polynomials" << std::endl;
Polynomial_t * p_all_constraints[] = {&p_fib_constraint, &p_init1_constraint, &p_init2_constraint, &p_termination_constraint, &p_recursion_constraint1, &p_recursion_constraint2};
const size_t nmix = sizeof(p_all_constraints) / sizeof(p_all_constraints[0]);
auto p_mixed_constraints = p_mix(p_all_constraints, nmix, scalar_t::from(5));
std::cout << "All constraint polynomials are low-degree:" << std::endl;
for( int i = 0; i < nmix; ++i) {
std::cout << i << ": " << p_all_constraints[i]->degree() << std::endl;
}
std::cout << "Lesson 8: The Core of the RISC Zero STARK" << std::endl;
std::cout << "Degree of the mixed constraints polynomial: " << p_mixed_constraints.degree() << std::endl;
auto p_validity = p_mixed_constraints.divide_by_vanishing_polynomial(n);
std::cout << "Degree of the validity polynomial: " << p_validity.degree() << std::endl;
std::cout << "The Verifier should provide the Merke commitment for the above" << std::endl;
std::cout << "Lesson 9: The DEEP Technique" << std::endl;
std::cout << "The DEEP technique improves the security of a single query by sampling outside of the commitment domain." << std::endl;
// In the original STARK protocol, the Verifier tests validity polynomial at a number of test points;
// the soundness of the protocol depends on the number of tests.
// The DEEP-ALI technique allows us to achieve a high degree of soundness with a single test.
// The details of DEEP are described in the following lesson.
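// DEEP quotient: for a claimed opening v = d1(DEEP_point), the polynomial d1(x) - v is divisible by
// (x - DEEP_point) exactly when the claim is true, so the prover sends the low-degree quotient instead of opening d1 directly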
auto DEEP_point = scalar_t::from(93);
std::cout << "The prover convinces the verifier that V=C/Z at the DEEP_test_point, " << DEEP_point << std::endl;
const scalar_t coeffs1[2] = {scalar_t::zero()-DEEP_point, scalar_t::one()};
auto denom_DEEP1 = Polynomial_t::from_coefficients(coeffs1, 2);
auto [p_d1_DEEP, r] = (p_d1 - p_value(DEEP_point)).divide(denom_DEEP1);
std::cout << "The DEEP d1 degree is: " << p_d1_DEEP.degree() << std::endl;
// d2, d3 use recursion constraints and need the point corresponding to the previous state (clock cycle)
auto omega = scalar_t::omega(logn);
auto DEEP_prev_point = DEEP_point*scalar_t::inverse(omega);
auto coeffs2 = std::make_unique<scalar_t[]>(2);
coeffs2[0] = scalar_t::zero() - DEEP_prev_point;
coeffs2[1] = scalar_t::one();
auto denom_DEEP2 = Polynomial_t::from_coefficients(coeffs2.get(), 2);
auto coeffs_d2bar = std::make_unique<scalar_t[]>(2);
solve_linear(DEEP_point, p_d2(DEEP_point), DEEP_prev_point, p_d2(DEEP_prev_point), coeffs_d2bar.get());
auto d2bar = Polynomial_t::from_coefficients(coeffs_d2bar.get(), 2);
auto [p_d2_DEEP, r2] = (p_d2 - d2bar).divide(denom_DEEP1*denom_DEEP2);
std::cout << "The DEEP d2 degree is: " << p_d2_DEEP.degree() << std::endl;
auto coeffs_d3bar = std::make_unique<scalar_t[]>(2);
solve_linear(DEEP_point, p_d3(DEEP_point), DEEP_prev_point, p_d3(DEEP_prev_point), coeffs_d3bar.get());
auto d3bar = Polynomial_t::from_coefficients(coeffs_d3bar.get(), 2);
auto [p_d3_DEEP, r3] = (p_d3 - d3bar).divide(denom_DEEP1*denom_DEEP2);
std::cout << "The DEEP d3 degree is: " << p_d3_DEEP.degree() << std::endl;
// DEEP c{1,2,3} polynomials
const scalar_t coeffs_c1bar[1] = {p_c1(DEEP_point)};
auto c1bar = Polynomial_t::from_coefficients(coeffs_c1bar, 1);
auto [p_c1_DEEP, r_c1] = (p_c1 - c1bar).divide(denom_DEEP1);
std::cout << "The DEEP c1 degree is: " << p_c1_DEEP.degree() << std::endl;
const scalar_t coeffs_c2bar[1] = {p_c2(DEEP_point)};
auto c2bar = Polynomial_t::from_coefficients(coeffs_c2bar, 1);
auto [p_c2_DEEP, r_c2] = (p_c2 - c2bar).divide(denom_DEEP1);
std::cout << "The DEEP c2 degree is: " << p_c2_DEEP.degree() << std::endl;
const scalar_t coeffs_c3bar[1] = {p_c3(DEEP_point)};
auto c3bar = Polynomial_t::from_coefficients(coeffs_c3bar, 1);
auto [p_c3_DEEP, r_c3] = (p_c3 - c3bar).divide(denom_DEEP1);
std::cout << "The DEEP c3 degree is: " << p_c3_DEEP.degree() << std::endl;
// DEEP validity polynomial
const scalar_t coeffs_vbar[1] = {p_validity(DEEP_point)};
auto vbar = Polynomial_t::from_coefficients(coeffs_vbar, 1);
auto [v_DEEP, r_v] = (p_validity - vbar).divide(denom_DEEP1);
std::cout << "The DEEP validity polynomial degree is: " << v_DEEP.degree() << std::endl;
std::cout << "The Prover sends DEEP polynomials to the Verifier" << std::endl;
std::cout << "Lesson 10: Mixing (Batching) for FRI" << std::endl;
std::cout << "The initial FRI polynomial is the mix of the 7 DEEP polynomials." << std::endl;
Polynomial_t* all_DEEP[] = {&p_d1_DEEP, &p_d2_DEEP, &p_d3_DEEP, &p_c1_DEEP, &p_c2_DEEP, &p_c3_DEEP, &v_DEEP};
Polynomial_t fri_input = p_mix(all_DEEP, 7, scalar_t::from(99));
std::cout << "The degree of the mixed DEEP polynomial is: " << fri_input.degree() << std::endl;
std::cout << "Lesson 11: FRI Protocol (Commit Phase)" << std::endl;
std::cout << "The prover provides information to convince the verifier that the DEEP polynomials are low-degree." << std::endl;
int nof_rounds = 3;
Polynomial_t feven[nof_rounds], fodd[nof_rounds], fri[nof_rounds+1];
scalar_t rfri[nof_rounds];
fri[0] = fri_input.clone();
for (int i = 0; i < nof_rounds; ++i) {
feven[i] = fri[i].even();
fodd[i] = fri[i].odd();
rfri[i] = scalar_t::rand_host();
fri[i+1] = feven[i] + rfri[i]*fodd[i];
std::cout << "The degree of the Round " << i << " polynomial is: " << fri[i+1].degree() << std::endl;
}
std::cout << "Lesson 12: FRI Protocol (Query Phase)" << std::endl;
// We use Polynomial API to evaluate the FRI polynomials
// In practice, verifier will use Merkle commitments
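// Round-i consistency check: writing fri[i](x) = e(x^2) + x*o(x^2), the next polynomial is
// fri[i+1](y) = e(y) + rfri[i]*o(y), so at y = x^2:
//   fri[i+1](x^2) = (rfri[i]+x)*fri[i](x)/(2x) + (rfri[i]-x)*fri[i](-x)/(-2x)
// which is exactly the rhs[] expression below, computed from the two openings fri[i](xp) and fri[i](xm)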
auto xp = scalar_t::rand_host();
auto xm = scalar_t::zero() - xp;
scalar_t lhs[nof_rounds], rhs[nof_rounds];
for (int i = 0; i < nof_rounds; ++i) {
rhs[i] = (rfri[i]+xp)*fri[i](xp)*scalar_t::inverse(scalar_t::from(2)*xp) + (rfri[i]+xm)*fri[i](xm)*scalar_t::inverse(scalar_t::from(2)*xm);
lhs[i] = fri[i+1](xp*xp);
std::cout << "Round " << i << std::endl << "rhs: " << rhs[i] << std::endl << "lhs: " << lhs[i] << std::endl;
}
return 0;
}

2
examples/c++/risc0/run.sh Executable file
View File

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example

View File

@@ -0,0 +1,34 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Golang
`ICICLE` provides Golang bindings to CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Usage
```go
err := Msm(
/* Scalars input vector */ scalars,
/* Points input vector */ points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ results)
```
In this example we use `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` uses affine coordinates, and `result` uses projective coordinates.
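For orientation, here is a minimal end-to-end sketch. It only reuses packages and calls that appear in the full `main.go` below (the default configuration, host-side input generation, and a device-resident result); G2 points, streams, and timing are omitted, and copying the result back follows the same pattern as `main.go`.
```go
package main

import (
	"fmt"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	bn254Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	size := 1 << 18

	// random inputs, generated on the host
	scalars := bn254.GenerateScalars(size)
	points := bn254.GenerateAffinePoints(size)

	// the result is a single projective point, allocated on the device
	var p bn254.Projective
	var result core.DeviceSlice
	if _, e := result.Malloc(p.Size(), p.Size()); e != cr.CudaSuccess {
		panic(fmt.Sprint("Malloc failed: ", e))
	}

	cfg := core.GetDefaultMSMConfig()
	if e := bn254Msm.Msm(scalars, points, &cfg, result); e != cr.CudaSuccess {
		panic(fmt.Sprint("Msm failed: ", e))
	}
	// copy the result back to the host and free the device slice as shown in main.go below
	// (CopyFromDeviceAsync + FreeAsync + SynchronizeStream)
}
```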
## What's in the example
1. Define the size of MSM.
2. Generate random inputs on-device
3. Configure MSM
4. Execute MSM on-device
5. Move the result to the host
Running the example:
```sh
go run main.go
```
> [!NOTE]
> The default sizes are 2^17 - 2^22. You can change this by passing the `-l <size> -u <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> go run main.go -l=21 -u=24
> ```

209
examples/golang/msm/main.go Normal file
View File

@@ -0,0 +1,209 @@
package main
import (
"flag"
"fmt"
"time"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
bls12377G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/g2"
bls12377Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/msm"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
bn254Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)
func main() {
var logSizeMin int
var logSizeMax int
flag.IntVar(&logSizeMin, "l", 17, "Minimum log size")
flag.IntVar(&logSizeMax, "u", 22, "Maximum log size")
flag.Parse()
sizeMax := 1 << logSizeMax
print("Generating BN254 scalars ... ")
startTime := time.Now()
scalarsBn254Max := bn254.GenerateScalars(sizeMax)
println(time.Since(startTime).String())
print("Generating BN254 points ... ")
startTime = time.Now()
pointsBn254Max := bn254.GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BN254 G2 points ... ")
startTime = time.Now()
pointsBn254G2Max := bn254G2.G2GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 scalars ... ")
startTime = time.Now()
scalarsBls12377Max := bls12377.GenerateScalars(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 points ... ")
startTime = time.Now()
pointsBls12377Max := bls12377.GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
print("Generating BLS12_377 G2 points ... ")
startTime = time.Now()
pointsBls12377G2Max := bls12377G2.G2GenerateAffinePoints(sizeMax)
println(time.Since(startTime).String())
for logSize := logSizeMin; logSize <= logSizeMax; logSize++ {
// Define the size of the problem for this iteration: 2^logSize.
size := 1 << logSize
fmt.Printf("---------------------- MSM size 2^%d=%d ------------------------\n", logSize, size)
// println(scalarsBls12377, pointsBls12377, pointsBn254G2)
// println(scalarsBn254, pointsBn254, pointsBls12377G2)
print("Configuring bn254 MSM ... ")
startTime = time.Now()
scalarsBn254 := scalarsBn254Max[:size]
pointsBn254 := pointsBn254Max[:size]
pointsBn254G2 := pointsBn254G2Max[:size]
cfgBn254 := core.GetDefaultMSMConfig()
cfgBn254G2 := core.GetDefaultMSMConfig()
cfgBn254.IsAsync = true
cfgBn254G2.IsAsync = true
streamBn254, _ := cr.CreateStream()
streamBn254G2, _ := cr.CreateStream()
cfgBn254.Ctx.Stream = &streamBn254
cfgBn254G2.Ctx.Stream = &streamBn254G2
var projectiveBn254 bn254.Projective
var projectiveBn254G2 bn254G2.G2Projective
var msmResultBn254 core.DeviceSlice
var msmResultBn254G2 core.DeviceSlice
_, e := msmResultBn254.MallocAsync(projectiveBn254.Size(), projectiveBn254.Size(), streamBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc failed: ", e)
panic(errorString)
}
_, e = msmResultBn254G2.MallocAsync(projectiveBn254G2.Size(), projectiveBn254G2.Size(), streamBn254G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc G2 failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Configuring Bls12377 MSM ... ")
startTime = time.Now()
scalarsBls12377 := scalarsBls12377Max[:size]
pointsBls12377 := pointsBls12377Max[:size]
pointsBls12377G2 := pointsBls12377G2Max[:size]
cfgBls12377 := core.GetDefaultMSMConfig()
cfgBls12377G2 := core.GetDefaultMSMConfig()
cfgBls12377.IsAsync = true
cfgBls12377G2.IsAsync = true
streamBls12377, _ := cr.CreateStream()
streamBls12377G2, _ := cr.CreateStream()
cfgBls12377.Ctx.Stream = &streamBls12377
cfgBls12377G2.Ctx.Stream = &streamBls12377G2
var projectiveBls12377 bls12377.Projective
var projectiveBls12377G2 bls12377G2.G2Projective
var msmResultBls12377 core.DeviceSlice
var msmResultBls12377G2 core.DeviceSlice
_, e = msmResultBls12377.MallocAsync(projectiveBls12377.Size(), projectiveBls12377.Size(), streamBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc failed: ", e)
panic(errorString)
}
_, e = msmResultBls12377G2.MallocAsync(projectiveBls12377G2.Size(), projectiveBls12377G2.Size(), streamBls12377G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc G2 failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Executing bn254 MSM on device ... ")
startTime = time.Now()
e = bn254Msm.Msm(scalarsBn254, pointsBn254, &cfgBn254, msmResultBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Msm failed: ", e)
panic(errorString)
}
e = bn254G2.G2Msm(scalarsBn254, pointsBn254G2, &cfgBn254G2, msmResultBn254G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Msm G2 failed: ", e)
panic(errorString)
}
msmResultBn254Host := make(core.HostSlice[bn254.Projective], 1)
msmResultBn254G2Host := make(core.HostSlice[bn254G2.G2Projective], 1)
msmResultBn254Host.CopyFromDeviceAsync(&msmResultBn254, streamBn254)
msmResultBn254G2Host.CopyFromDeviceAsync(&msmResultBn254G2, streamBn254G2)
msmResultBn254.FreeAsync(streamBn254)
msmResultBn254G2.FreeAsync(streamBn254G2)
cr.SynchronizeStream(&streamBn254)
cr.SynchronizeStream(&streamBn254G2)
println(time.Since(startTime).String())
print("Executing Bls12377 MSM on device ... ")
startTime = time.Now()
e = bls12377Msm.Msm(scalarsBls12377, pointsBls12377, &cfgBls12377, msmResultBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Msm failed: ", e)
panic(errorString)
}
e = bls12377G2.G2Msm(scalarsBls12377, pointsBls12377G2, &cfgBls12377G2, msmResultBls12377G2)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Msm G2 failed: ", e)
panic(errorString)
}
msmResultBls12377Host := make(core.HostSlice[bls12377.Projective], 1)
msmResultBls12377G2Host := make(core.HostSlice[bls12377G2.G2Projective], 1)
msmResultBls12377Host.CopyFromDeviceAsync(&msmResultBls12377, streamBls12377)
msmResultBls12377G2Host.CopyFromDeviceAsync(&msmResultBls12377G2, streamBls12377G2)
msmResultBls12377.FreeAsync(streamBls12377)
msmResultBls12377G2.FreeAsync(streamBls12377G2)
cr.SynchronizeStream(&streamBls12377)
cr.SynchronizeStream(&streamBls12377G2)
println(time.Since(startTime).String())
}
}

View File

@@ -0,0 +1,39 @@
# ICICLE example: Number Theoretic Transform (NTT) in Golang
## Key-Takeaway
`ICICLE` provides Golang bindings to CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Usage
```go
err := Ntt(
/* input slice */ scalars,
/* NTT Direction */ core.KForward,
/* NTT Configuration */ &cfg,
/* output slice */ result)
```
In this example we use the `BN254` and `BLS12377` fields.
## What's in this example
1. Define the size of NTT.
2. Generate random inputs
3. Set up the domain (see the sketch after this list).
4. Configure NTT
5. Execute NTT on-device
6. Move the result to the host
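Step 3 (domain setup) is the only non-obvious part. A sketch for BN254, assuming the same gnark-crypto generator wiring used in `main.go` below and that `size` covers the largest NTT you plan to run:
```go
// derive a root of unity of the right order from gnark-crypto and hand it to ICICLE
ctx, _ := cr.GetDefaultDeviceContext()
rouMont, _ := bn254Fft.Generator(uint64(size))
rou := rouMont.Bits() // out of Montgomery form
rouIcicle := bn254.ScalarField{}
rouIcicle.FromLimbs(core.ConvertUint64ArrToUint32Arr(rou[:]))
bn254Ntt.InitDomain(rouIcicle, ctx, false) // false: blocking initialization
```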
Running the example:
```sh
go run main.go
```
> [!NOTE]
> The default size is 2^20. You can change this by passing the `-s <size>` option. To change the size to 2^23, run the example like this:
> ```sh
> go run main.go -s=23
> ```

131
examples/golang/ntt/main.go Normal file
View File

@@ -0,0 +1,131 @@
package main
import (
"flag"
"fmt"
"time"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
bls12377Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/ntt"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
bls12377Fft "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft"
bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)
func main() {
var logSize int
flag.IntVar(&logSize, "s", 20, "Log size")
flag.Parse()
size := 1 << logSize
fmt.Printf("---------------------- NTT size 2^%d=%d ------------------------\n", logSize, size)
print("Generating BN254 scalars ... ")
startTime := time.Now()
scalarsBn254 := bn254.GenerateScalars(size)
println(time.Since(startTime).String())
cfgBn254 := bn254Ntt.GetDefaultNttConfig()
cfgBn254.IsAsync = true
print("Generating BLS12_377 scalars ... ")
startTime = time.Now()
scalarsBls12377 := bls12377.GenerateScalars(size)
println(time.Since(startTime).String())
cfgBls12377 := bls12377Ntt.GetDefaultNttConfig()
cfgBls12377.IsAsync = true
rouMontBn254, _ := bn254Fft.Generator(uint64(size))
rouBn254 := rouMontBn254.Bits()
rouIcicleBn254 := bn254.ScalarField{}
limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
rouIcicleBn254.FromLimbs(limbsBn254)
bn254Ntt.InitDomain(rouIcicleBn254, cfgBn254.Ctx, false)
rouMontBls12377, _ := bls12377Fft.Generator(uint64(size))
rouBls12377 := rouMontBls12377.Bits()
rouIcicleBls12377 := bls12377.ScalarField{}
limbsBls12377 := core.ConvertUint64ArrToUint32Arr(rouBls12377[:])
rouIcicleBls12377.FromLimbs(limbsBls12377)
bls12377Ntt.InitDomain(rouIcicleBls12377, cfgBls12377.Ctx, false)
print("Configuring bn254 NTT ... ")
startTime = time.Now()
streamBn254, _ := cr.CreateStream()
cfgBn254.Ctx.Stream = &streamBn254
var nttResultBn254 core.DeviceSlice
_, e := nttResultBn254.MallocAsync(size*scalarsBn254.SizeOfElement(), scalarsBn254.SizeOfElement(), streamBn254)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bn254 Malloc failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Configuring Bls12377 NTT ... ")
startTime = time.Now()
streamBls12377, _ := cr.CreateStream()
cfgBls12377.Ctx.Stream = &streamBls12377
var nttResultBls12377 core.DeviceSlice
_, e = nttResultBls12377.MallocAsync(size*scalarsBls12377.SizeOfElement(), scalarsBls12377.SizeOfElement(), streamBls12377)
if e != cr.CudaSuccess {
errorString := fmt.Sprint(
"Bls12_377 Malloc failed: ", e)
panic(errorString)
}
println(time.Since(startTime).String())
print("Executing bn254 NTT on device ... ")
startTime = time.Now()
err := bn254Ntt.Ntt(scalarsBn254, core.KForward, &cfgBn254, nttResultBn254)
if err.CudaErrorCode != cr.CudaSuccess {
errorString := fmt.Sprint(
"bn254 Ntt failed: ", e)
panic(errorString)
}
nttResultBn254Host := make(core.HostSlice[bn254.ScalarField], size)
nttResultBn254Host.CopyFromDeviceAsync(&nttResultBn254, streamBn254)
nttResultBn254.FreeAsync(streamBn254)
cr.SynchronizeStream(&streamBn254)
println(time.Since(startTime).String())
print("Executing Bls12377 NTT on device ... ")
startTime = time.Now()
err = bls12377Ntt.Ntt(scalarsBls12377, core.KForward, &cfgBls12377, nttResultBls12377)
if err.CudaErrorCode != cr.CudaSuccess {
errorString := fmt.Sprint(
"bls12_377 Ntt failed: ", e)
panic(errorString)
}
nttResultBls12377Host := make(core.HostSlice[bls12377.ScalarField], size)
nttResultBls12377Host.CopyFromDeviceAsync(&nttResultBls12377, streamBls12377)
nttResultBls12377.FreeAsync(streamBls12377)
cr.SynchronizeStream(&streamBls12377)
println(time.Since(startTime).String())
}

View File

@@ -0,0 +1,49 @@
# ICICLE example: Polynomials in Golang
`ICICLE` provides Golang bindings to CUDA-accelerated C++ implementation of [Polynomials](https://dev.ingonyama.com/icicle/polynomials/overview).
## Usage
### Backend Initialization
```go
InitPolyBackend()
```
### Construction
```go
poly1 := CreateFromCoeffecitients(/* Coefficients of polynomial */ coeffs)
poly2 := CreateFromROUEvaluations(/* evaluations */ evals)
poly3 := Clone(/* polynomial to clone */ poly1)
```
### Arithmetic
```go
polyAdd := poly1.Add(&poly2)
polySub := poly1.Subtract(&poly2)
polyMul := poly1.Multiply(&poly2)
polyMulScalar := MultiplyByScalar(scalar)
quotient, remainder := poly1.Divide(&poly2)
```
### Evaluation
```go
ev := poly1.Eval(scalar)
ev2 := poly1.EvalOnDomain(scalars)
```
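### Slicing
A sketch of the slicing calls used in `main.go` below (reusing `poly1` and `scalar` from the snippets above):
```go
o := poly1.Odd()                        // odd-power part
e := poly1.Even()                       // even-power part
oddScaled := o.MultiplyByScalar(scalar) // scalar * o(x)
fold := e.Add(&oddScaled)               // e(x) + scalar * o(x)
coeff := fold.GetCoeff(2)               // coefficient of x^2
```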
In this example we use the `BN254` and `Babybear` fields. The example shows arithmetic operations, evaluations, and slicing.
## What's in the example
1. Define the size of polynomials.
2. Initialize backends.
3. Generate random polynomials.
4. Execute arithmetic operations.
5. Execute evaluations.
6. Execute slicing.
Running the example:
```sh
go run main.go
```

View File

@@ -0,0 +1,114 @@
package main
import (
"flag"
"fmt"
bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
bn254Polynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/polynomial"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
babybear "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear"
babybearNtt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/ntt"
babybearPolynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/polynomial"
)
var maxNttLogSize uint
var polyLogSize uint
func initBn254Domain() core.IcicleError {
deviceCfg, _ := cr.GetDefaultDeviceContext()
rouMontBn254, _ := bn254Fft.Generator(uint64(1 << maxNttLogSize))
rouBn254 := rouMontBn254.Bits()
rouIcicleBn254 := bn254.ScalarField{}
limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
rouIcicleBn254.FromLimbs(limbsBn254)
return bn254Ntt.InitDomain(rouIcicleBn254, deviceCfg, false)
}
func initBabybearDomain() core.IcicleError {
deviceCfg, _ := cr.GetDefaultDeviceContext()
rouIcicle := babybear.ScalarField{}
rouIcicle.FromUint32(1461624142)
return babybearNtt.InitDomain(rouIcicle, deviceCfg, false)
}
func init() {
flag.UintVar(&maxNttLogSize, "maxNttLogSize", 20, "")
flag.UintVar(&polyLogSize, "polyLogSize", 15, "")
e := initBn254Domain()
if e.IcicleErrorCode != core.IcicleSuccess {
errorString := fmt.Sprint(
"Bn254 Domain initialization failed: ", e)
panic(errorString)
}
e = initBabybearDomain()
if e.IcicleErrorCode != core.IcicleSuccess {
errorString := fmt.Sprint(
"Babybear Domain initialization failed: ", e)
panic(errorString)
}
bn254Polynomial.InitPolyBackend()
babybearPolynomial.InitPolyBackend()
}
func main() {
polySize := 1 << polyLogSize
// randomize three polynomials over bn254 scalar field
var fBn254 bn254Polynomial.DensePolynomial
var gBn254 bn254Polynomial.DensePolynomial
var hBn254 bn254Polynomial.DensePolynomial
fBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize))
gBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize / 2))
hBn254.CreateFromROUEvaluations(bn254.GenerateScalars(polySize / 4))
// randomize two polynomials over babybear field
var fBabybear babybearPolynomial.DensePolynomial
var gBabybear babybearPolynomial.DensePolynomial
fBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize))
gBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize / 2))
// Arithmetic
t0 := fBn254.Add(&gBn254)
t1 := fBn254.Multiply(&hBn254)
q, r := t1.Divide(&t0)
rBabybear := fBabybear.Add(&gBabybear)
rDegree := r.Degree()
_ = rBabybear
_ = rDegree
// evaluate in single domain point
var five bn254.ScalarField
five.FromUint32(5)
qAtFive := q.Eval(five)
var thirty bn254.ScalarField
thirty.FromUint32(30)
// evaluate on domain. Note: domain and image can be either Host or Device slice.
// in this example domain in on host and evals on device.
hostDomain := core.HostSliceFromElements([]bn254.ScalarField{five, thirty})
var deviceImage core.DeviceSlice
_, err := deviceImage.Malloc(five.Size()*hostDomain.Len(), five.Size())
if err != cr.CudaSuccess {
errorString := fmt.Sprint(
"deviceImage allocation failed: ", err)
panic(errorString)
}
t1.EvalOnDomain(hostDomain, deviceImage)
// slicing
o := hBn254.Odd()
e := hBn254.Even()
oddMult := o.MultiplyByScalar(qAtFive)
fold := e.Add(&oddMult) // e(x) + o(x)*scalar
coeff := fold.GetCoeff(2) // coeff of x^2
_ = coeff
}

View File

@@ -2,7 +2,8 @@ use icicle_bls12_381::curve::ScalarField as F;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::hash::{SpongeHash, HashConfig};
use icicle_core::poseidon::Poseidon;
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostSlice;
@@ -24,14 +25,14 @@ fn main() {
let test_size = 1 << size;
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
let arity = 2;
println!(
"---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
arity
);
let ctx = DeviceContext::default();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = HashConfig::default();
println!(
"---------------------- Input size 2^{}={} ------------------------",
@@ -45,12 +46,12 @@ fn main() {
println!("Executing BLS12-381 Poseidon Hash on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
poseidon_hash_many::<F>(
poseidon.hash_many(
input_slice,
output_slice,
test_size as u32,
arity as u32,
&constants,
test_size,
arity,
1,
&config,
)
.unwrap();

View File

@@ -14,51 +14,42 @@ endfunction()
function(set_gpu_env)
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
if(DEFINED CUDA_ARCH) # user defined arch takes priority
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
else()
find_program(_nvidia_smi "nvidia-smi")
find_program(_nvidia_smi "nvidia-smi")
if(_nvidia_smi)
set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
if(_nvidia_smi)
execute_process(
COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Process the output to form the CUDA architectures string
string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")
# execute nvidia-smi -L to get a short list of GPUs available
exec_program(${_nvidia_smi_path} ARGS -L
OUTPUT_VARIABLE _nvidia_smi_out
RETURN_VALUE _nvidia_smi_ret)
set(CUDA_ARCHITECTURES "")
foreach(CAPABILITY ${GPU_COMPUTE_CAPABILITIES_LIST})
# Remove the dot in compute capability to match CMake format
string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
if(CUDA_ARCHITECTURES)
set(CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES};${CAPABILITY}")
else()
set(CUDA_ARCHITECTURES "${CAPABILITY}")
endif()
endforeach()
# process the stdout of nvidia-smi
if(_nvidia_smi_ret EQUAL 0)
# convert string with newlines to list of strings
string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
foreach(_line ${_nvidia_smi_out})
if(_line MATCHES "^GPU [0-9]+:")
math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
# the UUID is not very useful for the user, remove it
string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
if(NOT _gpu_info STREQUAL "")
list(APPEND DETECT_GPU_INFO "${_gpu_info}")
endif()
endif()
endforeach()
check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
message("Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
else()
# no GPUs found, like on Github CI runners
message("Setting CMAKE_CUDA_ARCHITECTURES to: 50")
set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
endif()
endif()
# ##
if(DETECT_GPU_COUNT GREATER 0)
set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) # do native
else()
# no GPUs found, like on Github CI runners
set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
endif()
endif()
# Check CUDA version and, if possible, enable multi-threaded compilation
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
message(STATUS "Using multi-threaded CUDA compilation.")
@@ -69,4 +60,4 @@ function(set_gpu_env)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()
endfunction()

View File

@@ -1,5 +1,5 @@
function(check_field)
set(SUPPORTED_FIELDS babybear;stark252)
set(SUPPORTED_FIELDS babybear;stark252;m31)
set(IS_FIELD_SUPPORTED FALSE)
set(I 1000)

View File

@@ -9,19 +9,77 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/babybear.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t babybear_extension_ntt_cuda(
const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);
extern "C" cudaError_t babybear_poseidon2_create_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const babybear::scalar_t* round_constants,
const babybear::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_poseidon2_load_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<babybear::scalar_t>* poseidon,
const babybear::scalar_t* inputs,
babybear::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
babybear_poseidon2_delete_cuda(poseidon2::Poseidon2<babybear::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_build_merkle_tree(
const babybear::scalar_t* leaves,
babybear::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_mmcs_commit_cuda(
const matrix::Matrix<babybear::scalar_t>* leaves,
unsigned int number_of_inputs,
babybear::scalar_t* digests,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* hasher,
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_mul_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
extern "C" cudaError_t babybear_add_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
extern "C" cudaError_t babybear_accumulate_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_sub_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
@@ -34,6 +92,10 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_bit_reverse_cuda(
const babybear::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output);
extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
extern "C" cudaError_t babybear_scalar_convert_montgomery(
@@ -58,6 +120,9 @@ extern "C" cudaError_t babybear_extension_mul_cuda(
extern "C" cudaError_t babybear_extension_add_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_accumulate_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_extension_sub_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
@@ -70,4 +135,8 @@ extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);
#endif
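A minimal usage sketch for the Poseidon2 externs declared above (load constants, hash a batch, release). The make_default_ctx() and make_default_hash_config() helpers, the width/rate values and the buffer setup are hypothetical stand-ins for whatever the caller actually supplies:

// Hedged sketch; assumes the babybear API header above is included.
device_context::DeviceContext ctx = make_default_ctx();    // hypothetical helper
hash::HashConfig cfg = make_default_hash_config();          // hypothetical helper
poseidon2::MdsType mds_type{};                              // variant chosen by the caller
poseidon2::DiffusionStrategy diffusion{};                   // variant chosen by the caller

poseidon2::Poseidon2<babybear::scalar_t>* poseidon = nullptr;
cudaError_t err = babybear_poseidon2_load_cuda(&poseidon, /*width=*/16, /*rate=*/8, mds_type, diffusion, ctx);

const babybear::scalar_t* inputs = nullptr;   // caller-provided input states
babybear::scalar_t* outputs = nullptr;        // caller-provided digest buffer
unsigned int number_of_states = 1024;         // example batch size
if (err == cudaSuccess)
  err = babybear_poseidon2_hash_many_cuda(poseidon, inputs, outputs, number_of_states,
                                          /*input_block_len=*/8, /*output_len=*/1, cfg);

babybear_poseidon2_delete_cuda(poseidon, ctx);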

View File

@@ -9,20 +9,18 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_377.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +28,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(
extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
bls12_377::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_377::affine_t* output_bases);
extern "C" cudaError_t bls12_377_msm_cuda(
@@ -71,32 +66,52 @@ extern "C" cudaError_t bls12_377_affine_convert_montgomery(
extern "C" cudaError_t bls12_377_projective_convert_montgomery(
bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_377::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_377::scalar_t>* poseidon_constants);
extern "C" cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_377::scalar_t>* constants);
extern "C" cudaError_t bls12_377_poseidon_hash_cuda(
bls12_377::scalar_t* input,
bls12_377::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree(
extern "C" cudaError_t bls12_377_build_merkle_tree(
const bls12_377::scalar_t* leaves,
bls12_377::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_mmcs_commit_cuda(
const matrix::Matrix<bls12_377::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_377::scalar_t* digests,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_poseidon_create_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_377::scalar_t* round_constants,
const bls12_377::scalar_t* mds_matrix,
const bls12_377::scalar_t* non_sparse_matrix,
const bls12_377::scalar_t* sparse_matrices,
const bls12_377::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_load_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_377::scalar_t>* poseidon,
const bls12_377::scalar_t* inputs,
bls12_377::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bls12_377_poseidon_delete_cuda(poseidon::Poseidon<bls12_377::scalar_t>* poseidon);
extern "C" cudaError_t bls12_377_mul_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -104,6 +119,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
extern "C" cudaError_t bls12_377_add_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
extern "C" cudaError_t bls12_377_accumulate_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bls12_377_sub_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -116,6 +134,10 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bls12_377_bit_reverse_cuda(
const bls12_377::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output);
extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_377_scalar_convert_montgomery(

View File

@@ -9,20 +9,18 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_381.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::g2_affine_t* output_bases);
extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +28,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(
extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
bls12_381::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bls12_381::affine_t* output_bases);
extern "C" cudaError_t bls12_381_msm_cuda(
@@ -71,32 +66,52 @@ extern "C" cudaError_t bls12_381_affine_convert_montgomery(
extern "C" cudaError_t bls12_381_projective_convert_montgomery(
bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_381::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_381::scalar_t>* poseidon_constants);
extern "C" cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_381::scalar_t>* constants);
extern "C" cudaError_t bls12_381_poseidon_hash_cuda(
bls12_381::scalar_t* input,
bls12_381::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree(
extern "C" cudaError_t bls12_381_build_merkle_tree(
const bls12_381::scalar_t* leaves,
bls12_381::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_mmcs_commit_cuda(
const matrix::Matrix<bls12_381::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_381::scalar_t* digests,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_poseidon_create_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_381::scalar_t* round_constants,
const bls12_381::scalar_t* mds_matrix,
const bls12_381::scalar_t* non_sparse_matrix,
const bls12_381::scalar_t* sparse_matrices,
const bls12_381::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_load_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_381::scalar_t>* poseidon,
const bls12_381::scalar_t* inputs,
bls12_381::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bls12_381_poseidon_delete_cuda(poseidon::Poseidon<bls12_381::scalar_t>* poseidon);
extern "C" cudaError_t bls12_381_mul_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -104,6 +119,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
extern "C" cudaError_t bls12_381_add_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
extern "C" cudaError_t bls12_381_accumulate_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bls12_381_sub_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -116,6 +134,10 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bls12_381_bit_reverse_cuda(
const bls12_381::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output);
extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_381_scalar_convert_montgomery(

View File

@@ -9,20 +9,19 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bn254.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
bn254::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::g2_affine_t* output_bases);
extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -30,11 +29,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(
extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
bn254::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bn254::affine_t* output_bases);
extern "C" cudaError_t bn254_msm_cuda(
@@ -71,32 +67,87 @@ extern "C" cudaError_t bn254_affine_convert_montgomery(
extern "C" cudaError_t bn254_projective_convert_montgomery(
bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bn254::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_poseidon2_create_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bn254::scalar_t>* constants);
extern "C" cudaError_t bn254_poseidon2_load_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_poseidon_hash_cuda(
bn254::scalar_t* input,
extern "C" cudaError_t bn254_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bn254::scalar_t>& constants,
poseidon::PoseidonConfig& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t bn254_build_poseidon_merkle_tree(
extern "C" cudaError_t
bn254_poseidon2_delete_cuda(poseidon2::Poseidon2<bn254::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_build_merkle_tree(
const bn254::scalar_t* leaves,
bn254::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bn254::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_mmcs_commit_cuda(
const matrix::Matrix<bn254::scalar_t>* leaves,
unsigned int number_of_inputs,
bn254::scalar_t* digests,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* hasher,
const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_poseidon_create_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* mds_matrix,
const bn254::scalar_t* non_sparse_matrix,
const bn254::scalar_t* sparse_matrices,
const bn254::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_load_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_hash_many_cuda(
const poseidon::Poseidon<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bn254_poseidon_delete_cuda(poseidon::Poseidon<bn254::scalar_t>* poseidon);
extern "C" cudaError_t bn254_mul_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -104,6 +155,9 @@ extern "C" cudaError_t bn254_mul_cuda(
extern "C" cudaError_t bn254_add_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
extern "C" cudaError_t bn254_accumulate_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bn254_sub_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -116,6 +170,10 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bn254_bit_reverse_cuda(
const bn254::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output);
extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
extern "C" cudaError_t bn254_scalar_convert_montgomery(

View File

@@ -9,20 +9,18 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bw6_761.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::g2_affine_t* output_bases);
extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +28,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(
extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
bw6_761::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
bw6_761::affine_t* output_bases);
extern "C" cudaError_t bw6_761_msm_cuda(
@@ -71,32 +66,52 @@ extern "C" cudaError_t bw6_761_affine_convert_montgomery(
extern "C" cudaError_t bw6_761_projective_convert_montgomery(
bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bw6_761::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bw6_761::scalar_t>* poseidon_constants);
extern "C" cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bw6_761::scalar_t>* constants);
extern "C" cudaError_t bw6_761_poseidon_hash_cuda(
bw6_761::scalar_t* input,
bw6_761::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree(
extern "C" cudaError_t bw6_761_build_merkle_tree(
const bw6_761::scalar_t* leaves,
bw6_761::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_mmcs_commit_cuda(
const matrix::Matrix<bw6_761::scalar_t>* leaves,
unsigned int number_of_inputs,
bw6_761::scalar_t* digests,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_poseidon_create_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bw6_761::scalar_t* round_constants,
const bw6_761::scalar_t* mds_matrix,
const bw6_761::scalar_t* non_sparse_matrix,
const bw6_761::scalar_t* sparse_matrices,
const bw6_761::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_load_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_hash_many_cuda(
const poseidon::Poseidon<bw6_761::scalar_t>* poseidon,
const bw6_761::scalar_t* inputs,
bw6_761::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
bw6_761_poseidon_delete_cuda(poseidon::Poseidon<bw6_761::scalar_t>* poseidon);
extern "C" cudaError_t bw6_761_mul_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -104,6 +119,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
extern "C" cudaError_t bw6_761_add_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
extern "C" cudaError_t bw6_761_accumulate_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t bw6_761_sub_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -116,6 +134,10 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t bw6_761_bit_reverse_cuda(
const bw6_761::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output);
extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
extern "C" cudaError_t bw6_761_scalar_convert_montgomery(

View File

@@ -9,19 +9,17 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/grumpkin.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
grumpkin::affine_t* output_bases);
extern "C" cudaError_t grumpkin_msm_cuda(
@@ -41,32 +39,52 @@ extern "C" cudaError_t grumpkin_affine_convert_montgomery(
extern "C" cudaError_t grumpkin_projective_convert_montgomery(
grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const grumpkin::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<grumpkin::scalar_t>* poseidon_constants);
extern "C" cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<grumpkin::scalar_t>* constants);
extern "C" cudaError_t grumpkin_poseidon_hash_cuda(
grumpkin::scalar_t* input,
grumpkin::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree(
extern "C" cudaError_t grumpkin_build_merkle_tree(
const grumpkin::scalar_t* leaves,
grumpkin::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_mmcs_commit_cuda(
const matrix::Matrix<grumpkin::scalar_t>* leaves,
unsigned int number_of_inputs,
grumpkin::scalar_t* digests,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_poseidon_create_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const grumpkin::scalar_t* round_constants,
const grumpkin::scalar_t* mds_matrix,
const grumpkin::scalar_t* non_sparse_matrix,
const grumpkin::scalar_t* sparse_matrices,
const grumpkin::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_load_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_hash_many_cuda(
const poseidon::Poseidon<grumpkin::scalar_t>* poseidon,
const grumpkin::scalar_t* inputs,
grumpkin::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
grumpkin_poseidon_delete_cuda(poseidon::Poseidon<grumpkin::scalar_t>* poseidon);
extern "C" cudaError_t grumpkin_mul_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -74,6 +92,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
extern "C" cudaError_t grumpkin_add_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
extern "C" cudaError_t grumpkin_accumulate_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t grumpkin_sub_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -86,6 +107,10 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t grumpkin_bit_reverse_cuda(
const grumpkin::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output);
extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
extern "C" cudaError_t grumpkin_scalar_convert_montgomery(

View File

@@ -6,11 +6,25 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "hash/keccak/keccak.cuh"
#include "merkle-tree/merkle.cuh"
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);
extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t build_keccak512_merkle_tree_cuda(
const uint8_t* leaves,
uint64_t* digests,
unsigned int height,
unsigned int input_block_len,
const merkle_tree::TreeBuilderConfig& tree_config);
#endif
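A short sketch of the batched Keccak-256 call above, using the post-change keccak::HashConfig signature; the default-config helper is hypothetical, and the 32-byte digest size is the only property taken from Keccak-256 itself:

// Hedged sketch; requires <vector> and <cstdint> plus the keccak header above.
const int input_block_size = 64;    // bytes per message (example)
const int number_of_blocks = 1024;  // batch size (example)
std::vector<uint8_t> input(static_cast<size_t>(input_block_size) * number_of_blocks);
std::vector<uint8_t> output(static_cast<size_t>(32) * number_of_blocks);  // one 32-byte digest per block

keccak::HashConfig cfg = make_default_hash_config();  // hypothetical helper
cudaError_t err = keccak256_cuda(input.data(), input_block_size, number_of_blocks, output.data(), cfg);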

icicle/include/api/m31.h (new file, 94 lines)
View File

@@ -0,0 +1,94 @@
// WARNING: This file is auto-generated by a script.
// Any changes made to this file may be overwritten.
// Please modify the code generation script instead.
// Path to the code generation script: scripts/gen_c_api.py
#pragma once
#ifndef M31_API_H
#define M31_API_H
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/m31.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t m31_build_merkle_tree(
const m31::scalar_t* leaves,
m31::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_mmcs_commit_cuda(
const matrix::Matrix<m31::scalar_t>* leaves,
unsigned int number_of_inputs,
m31::scalar_t* digests,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* hasher,
const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_mul_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_add_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_accumulate_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_sub_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
extern "C" cudaError_t m31_transpose_matrix_cuda(
const m31::scalar_t* input,
uint32_t row_size,
uint32_t column_size,
m31::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_bit_reverse_cuda(
const m31::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::scalar_t* output);
extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
extern "C" cudaError_t m31_scalar_convert_montgomery(
m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t m31_extension_mul_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_add_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_accumulate_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_extension_sub_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
const m31::extension_t* input,
uint32_t row_size,
uint32_t column_size,
m31::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_extension_bit_reverse_cuda(
const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
#endif
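The new m31 header exposes vector ops, Montgomery conversion and the tree builder only (no NTT or MSM externs). A minimal element-wise multiply sketch against the externs above, with the config helper again hypothetical:

// Hedged sketch; buffers are caller-provided host or device pointers, per the config.
const int n = 1 << 20;
m31::scalar_t* vec_a = nullptr;   // caller-provided
m31::scalar_t* vec_b = nullptr;   // caller-provided
m31::scalar_t* result = nullptr;  // caller-provided
vec_ops::VecOpsConfig cfg = make_default_vec_ops_config();  // hypothetical helper
cudaError_t err = m31_mul_cuda(vec_a, vec_b, n, cfg, result);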

View File

@@ -9,16 +9,38 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/stark252.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t stark252_build_merkle_tree(
const stark252::scalar_t* leaves,
stark252::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_mmcs_commit_cuda(
const matrix::Matrix<stark252::scalar_t>* leaves,
unsigned int number_of_inputs,
stark252::scalar_t* digests,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* hasher,
const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_mul_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
extern "C" cudaError_t stark252_add_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
extern "C" cudaError_t stark252_accumulate_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t stark252_sub_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
@@ -31,6 +53,10 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda(
bool on_device,
bool is_async);
extern "C" cudaError_t stark252_bit_reverse_cuda(
const stark252::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output);
extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
extern "C" cudaError_t stark252_scalar_convert_montgomery(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
${CURVE}::affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_msm_cuda(

View File

@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
${CURVE}::g2_affine_t* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
int msm_size,
msm::MSMConfig& config,
${CURVE}::g2_affine_t* output_bases);
extern "C" cudaError_t ${CURVE}_g2_msm_cuda(

View File

@@ -1,26 +1,29 @@
extern "C" cudaError_t ${FIELD}_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const ${FIELD}::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<${FIELD}::scalar_t>* poseidon_constants);
extern "C" cudaError_t ${FIELD}_poseidon_create_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* mds_matrix,
const ${FIELD}::scalar_t* non_sparse_matrix,
const ${FIELD}::scalar_t* sparse_matrices,
const ${FIELD}::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<${FIELD}::scalar_t>* constants);
extern "C" cudaError_t ${FIELD}_poseidon_load_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_poseidon_hash_cuda(
${FIELD}::scalar_t* input,
extern "C" cudaError_t ${FIELD}_poseidon_hash_many_cuda(
const poseidon::Poseidon<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
poseidon::PoseidonConfig& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t ${FIELD}_build_poseidon_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t
${FIELD}_poseidon_delete_cuda(poseidon::Poseidon<${FIELD}::scalar_t>* poseidon);

View File

@@ -0,0 +1,34 @@
extern "C" cudaError_t ${FIELD}_poseidon2_create_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_poseidon2_load_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::HashConfig& cfg);
extern "C" cudaError_t
${FIELD}_poseidon2_delete_cuda(poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon, device_context::DeviceContext& ctx);

View File

@@ -0,0 +1,16 @@
extern "C" cudaError_t ${FIELD}_build_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t ${FIELD}_mmcs_commit_cuda(
const matrix::Matrix<${FIELD}::scalar_t>* leaves,
unsigned int number_of_inputs,
${FIELD}::scalar_t* digests,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
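A hedged sketch of driving the hash-agnostic tree builder above, instantiated here for babybear. The two Hasher pointers come from whichever backend the caller constructs (the template does not fix one), the digests buffer is assumed to be sized according to the TreeBuilderConfig, and the config helper is hypothetical:

// Sketch only; hasher objects and tree_config defaults are left to the caller.
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer = nullptr;  // leaf-layer hash
const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression = nullptr;   // inner 2-to-1 hash
merkle_tree::TreeBuilderConfig tree_config = make_default_tree_config();             // hypothetical helper

const babybear::scalar_t* leaves = nullptr;   // caller-provided leaves
babybear::scalar_t* digests = nullptr;        // caller-allocated digest buffer
unsigned int height = 20;                     // example tree height
unsigned int input_block_len = 8;             // leaf elements per bottom-layer hash (example)

cudaError_t err = babybear_build_merkle_tree(
  leaves, digests, height, input_block_len, compression, bottom_layer, tree_config);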

View File

@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
extern "C" cudaError_t ${FIELD}_add_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);
extern "C" cudaError_t ${FIELD}_accumulate_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t ${FIELD}_sub_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);
@@ -14,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_transpose_matrix_cuda(
${FIELD}::scalar_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
bool is_async);
extern "C" cudaError_t ${FIELD}_bit_reverse_cuda(
const ${FIELD}::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, ${FIELD}::scalar_t* output);

View File

@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
extern "C" cudaError_t ${FIELD}_extension_add_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);
extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);
@@ -14,4 +17,7 @@ extern "C" cudaError_t ${FIELD}_extension_transpose_matrix_cuda(
${FIELD}::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
bool is_async);
extern "C" cudaError_t ${FIELD}_extension_bit_reverse_cuda(
const ${FIELD}::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, ${FIELD}::extension_t* output);

View File

@@ -1,7 +1,7 @@
#pragma once
#include "../gpu-utils/sharedmem.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "gpu-utils/modifiers.cuh"
#include <iostream>
template <class FF>
@@ -11,26 +11,26 @@ public:
FF x;
FF y;
static Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static HOST_DEVICE_INLINE Affine neg(const Affine& point) { return {point.x, FF::neg(point.y)}; }
static Affine zero() { return {FF::zero(), FF::zero()}; }
static HOST_DEVICE_INLINE Affine zero() { return {FF::zero(), FF::zero()}; }
static Affine to_montgomery(const Affine& point)
static HOST_DEVICE_INLINE Affine to_montgomery(const Affine& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y)};
}
static Affine from_montgomery(const Affine& point)
static HOST_DEVICE_INLINE Affine from_montgomery(const Affine& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y)};
}
friend bool operator==(const Affine& xs, const Affine& ys)
friend HOST_DEVICE_INLINE bool operator==(const Affine& xs, const Affine& ys)
{
return (xs.x == ys.x) && (xs.y == ys.y);
}
friend std::ostream& operator<<(std::ostream& os, const Affine& point)
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Affine& point)
{
os << "x: " << point.x << "; y: " << point.y;
return os;
@@ -39,9 +39,9 @@ public:
template <class FF>
struct SharedMemory<Affine<FF>> {
Affine<FF>* getPointer()
__device__ Affine<FF>* getPointer()
{
Affine<FF> *s_affine_ = nullptr;
extern __shared__ Affine<FF> s_affine_[];
return s_affine_;
}
};
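The specialization above now hands back a pointer into dynamically sized shared memory instead of a null pointer. A small illustrative kernel (names and launch parameters are examples only, not part of the diff) shows how such a SharedMemory helper is typically consumed:

// Illustrative kernel; assumes affine.cuh and gpu-utils/sharedmem.cuh are included.
template <class FF>
__global__ void stage_points_kernel(const Affine<FF>* points, int n)
{
  SharedMemory<Affine<FF>> shared;
  Affine<FF>* s_points = shared.getPointer();  // backed by extern __shared__ storage
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) s_points[threadIdx.x] = points[tid];
  __syncthreads();
  // ... per-block work on s_points ...
}

// The dynamic shared-memory size is passed as the third launch parameter:
// stage_points_kernel<FF><<<blocks, threads, threads * sizeof(Affine<FF>)>>>(d_points, n);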

View File

@@ -1,9 +1,9 @@
#pragma once
#pragma once
#ifndef CURVE_CONFIG_H
#define CURVE_CONFIG_H
#include "../fields/id.h"
#include "projective.cuh"
#include "fields/id.h"
#include "curves/projective.cuh"
/**
* @namespace curve_config
@@ -12,23 +12,23 @@
* with the `-DCURVE` env variable passed during build.
*/
#if CURVE_ID == BN254
#include "params/bn254.cuh"
#include "curves/params/bn254.cuh"
namespace curve_config = bn254;
#elif CURVE_ID == BLS12_381
#include "params/bls12_381.cuh"
#include "curves/params/bls12_381.cuh"
namespace curve_config = bls12_381;
#elif CURVE_ID == BLS12_377
#include "params/bls12_377.cuh"
#include "curves/params/bls12_377.cuh"
namespace curve_config = bls12_377;
#elif CURVE_ID == BW6_761
#include "params/bw6_761.cuh"
#include "curves/params/bw6_761.cuh"
namespace curve_config = bw6_761;
#elif CURVE_ID == GRUMPKIN
#include "params/grumpkin.cuh"
#include "curves/params/grumpkin.cuh"
namespace curve_config = grumpkin;
#endif
#endif

View File

@@ -22,7 +22,7 @@
typedef Affine<point_field_t> affine_t;
#define G2_CURVE_DEFINITIONS \
typedef ExtensionField<fq_config> g2_point_field_t; \
typedef ExtensionField<fq_config, point_field_t> g2_point_field_t; \
static constexpr g2_point_field_t g2_generator_x = \
g2_point_field_t{point_field_t{g2_gen_x_re}, point_field_t{g2_gen_x_im}}; \
static constexpr g2_point_field_t g2_generator_y = \

View File

@@ -2,13 +2,13 @@
#ifndef BN254_PARAMS_H
#define BN254_PARAMS_H
#include "../../fields/storage.cuh"
#include "fields/storage.cuh"
#include "../macro.h"
#include "../projective.cuh"
#include "../../fields/snark_fields/bn254_base.cuh"
#include "../../fields/snark_fields/bn254_scalar.cuh"
#include "../../fields/quadratic_extension.cuh"
#include "curves/macro.h"
#include "curves/projective.cuh"
#include "fields/snark_fields/bn254_base.cuh"
#include "fields/snark_fields/bn254_scalar.cuh"
#include "fields/quadratic_extension.cuh"
namespace bn254 {
// G1 and G2 generators

View File

@@ -1,7 +1,7 @@
#pragma once
#include "affine.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "gpu-utils/sharedmem.cuh"
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
class Projective
@@ -19,34 +19,34 @@ public:
FF y;
FF z;
static Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static HOST_DEVICE_INLINE Projective zero() { return {FF::zero(), FF::one(), FF::zero()}; }
static Affine<FF> to_affine(const Projective& point)
static HOST_DEVICE_INLINE Affine<FF> to_affine(const Projective& point)
{
FF denom = FF::inverse(point.z);
return {point.x * denom, point.y * denom};
}
static Projective from_affine(const Affine<FF>& point)
static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
{
return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
}
static Projective to_montgomery(const Projective& point)
static HOST_DEVICE_INLINE Projective to_montgomery(const Projective& point)
{
return {FF::to_montgomery(point.x), FF::to_montgomery(point.y), FF::to_montgomery(point.z)};
}
static Projective from_montgomery(const Projective& point)
static HOST_DEVICE_INLINE Projective from_montgomery(const Projective& point)
{
return {FF::from_montgomery(point.x), FF::from_montgomery(point.y), FF::from_montgomery(point.z)};
}
static Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static HOST_DEVICE_INLINE Projective generator() { return {GENERATOR_X, GENERATOR_Y, FF::one()}; }
static Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }
static Projective dbl(const Projective& point)
static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
{
const FF X = point.x;
const FF Y = point.y;
@@ -74,7 +74,7 @@ public:
return {X3, Y3, Z3};
}
friend Projective operator+(Projective p1, const Projective& p2)
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -118,9 +118,9 @@ public:
return {X3, Y3, Z3};
}
friend Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Projective& p2) { return p1 + neg(p2); }
friend Projective operator+(Projective p1, const Affine<FF>& p2)
friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine<FF>& p2)
{
const FF X1 = p1.x; // < 2
const FF Y1 = p1.y; // < 2
@@ -163,45 +163,45 @@ public:
return {X3, Y3, Z3};
}
friend Projective operator-(Projective p1, const Affine<FF>& p2)
friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine<FF>& p2)
{
return p1 + Affine<FF>::neg(p2);
}
friend Projective operator*(SCALAR_FF scalar, const Projective& point)
friend HOST_DEVICE_INLINE Projective operator*(SCALAR_FF scalar, const Projective& point)
{
Projective res = zero();
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (int i = 0; i < SCALAR_FF::NBITS; i++) {
if (i > 0) { res = res + res; }
if (i > 0) { res = dbl(res); }
if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; }
}
return res;
}
friend Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend HOST_DEVICE_INLINE Projective operator*(const Projective& point, SCALAR_FF scalar) { return scalar * point; }
friend bool operator==(const Projective& p1, const Projective& p2)
friend HOST_DEVICE_INLINE bool operator==(const Projective& p1, const Projective& p2)
{
return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z);
}
friend bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend HOST_DEVICE_INLINE bool operator!=(const Projective& p1, const Projective& p2) { return !(p1 == p2); }
friend std::ostream& operator<<(std::ostream& os, const Projective& point)
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point)
{
os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }";
return os;
}
static bool is_zero(const Projective& point)
static HOST_DEVICE_INLINE bool is_zero(const Projective& point)
{
return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero();
}
static bool is_on_curve(const Projective& point)
static HOST_DEVICE_INLINE bool is_on_curve(const Projective& point)
{
if (is_zero(point)) return true;
bool eq_holds =
@@ -210,7 +210,7 @@ public:
return point.z != FF::zero() && eq_holds;
}
static Projective rand_host()
static HOST_INLINE Projective rand_host()
{
SCALAR_FF rand_scalar = SCALAR_FF::rand_host();
return rand_scalar * generator();
@@ -231,9 +231,9 @@ public:
template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
__device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
{
Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> *s_projective_ = nullptr;
extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
return s_projective_;
}
};

View File

@@ -18,9 +18,9 @@
#pragma once
#include "../gpu-utils/error_handler.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "host_math.cuh"
#include "ptx.cuh"
#include "storage.cuh"
@@ -38,13 +38,13 @@ public:
static constexpr unsigned TLC = CONFIG::limbs_count;
static constexpr unsigned NBITS = CONFIG::modulus_bit_count;
static constexpr Field zero() { return Field{CONFIG::zero}; }
static constexpr HOST_DEVICE_INLINE Field zero() { return Field{CONFIG::zero}; }
static constexpr Field one() { return Field{CONFIG::one}; }
static constexpr HOST_DEVICE_INLINE Field one() { return Field{CONFIG::one}; }
static constexpr Field from(uint32_t value)
static constexpr HOST_DEVICE_INLINE Field from(uint32_t value)
{
storage<TLC> scalar;
storage<TLC> scalar{};
scalar.limbs[0] = value;
for (int i = 1; i < TLC; i++) {
scalar.limbs[i] = 0;
@@ -52,17 +52,19 @@ public:
return Field{scalar};
}
static Field omega(uint32_t logn)
static HOST_INLINE Field omega(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
if (logn > CONFIG::omegas_count) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Field: Invalid omega index"); }
storage_array<CONFIG::omegas_count, TLC> const omega = CONFIG::omega;
return Field{omega.storages[logn - 1]};
Field omega = Field{CONFIG::rou};
for (int i = 0; i < CONFIG::omegas_count - logn; i++)
omega = sqr(omega);
return omega;
}
static Field omega_inv(uint32_t logn)
static HOST_INLINE Field omega_inv(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
@@ -70,11 +72,13 @@ public:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Field: Invalid omega_inv index");
}
storage_array<CONFIG::omegas_count, TLC> const omega_inv = CONFIG::omega_inv;
return Field{omega_inv.storages[logn - 1]};
Field omega = inverse(Field{CONFIG::rou});
for (int i = 0; i < CONFIG::omegas_count - logn; i++)
omega = sqr(omega);
return omega;
}
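Both functions above now derive the requested root from the single primitive root CONFIG::rou by repeated squaring instead of reading a precomputed table. With k = omegas_count, the relation relied on is

\[
\omega(\log n) = \mathrm{rou}^{\,2^{\,k - \log n}}, \qquad
\omega(\log n)^{2^{\log n}} = \mathrm{rou}^{\,2^{k}} = 1,
\]

so squaring rou (k - log n) times yields a primitive 2^{log n}-th root of unity, and omega_inv applies the same loop to inverse(rou).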
static Field inv_log_size(uint32_t logn)
static HOST_DEVICE_INLINE Field inv_log_size(uint32_t logn)
{
if (logn == 0) { return Field{CONFIG::one}; }
#ifndef __CUDA_ARCH__
@@ -91,7 +95,7 @@ public:
return Field{inv.storages[logn - 1]};
}
static constexpr unsigned get_omegas_count()
static constexpr HOST_INLINE unsigned get_omegas_count()
{
if constexpr (has_member_omegas_count<CONFIG>()) {
return CONFIG::omegas_count;
@@ -113,45 +117,56 @@ public:
/**
* A new addition to the config file - \f$ 2^{32 \cdot num\_limbs} - p \f$.
*/
static constexpr ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
static constexpr HOST_DEVICE_INLINE ff_storage get_neg_modulus() { return CONFIG::neg_modulus; }
/**
* A new addition to the config file - the number of times to reduce in [reduce](@ref reduce) function.
*/
static constexpr unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr HOST_DEVICE_INLINE unsigned num_of_reductions() { return CONFIG::num_of_reductions; }
static constexpr unsigned slack_bits = 32 * TLC - NBITS;
struct Wide {
ff_wide_storage limbs_storage;
static constexpr Field get_lower(const Wide& xs)
static constexpr Wide HOST_DEVICE_INLINE from_field(const Field& xs)
{
Field out{};
Wide out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}
static constexpr Field get_higher(const Wide& xs)
static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}
static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
}
static constexpr Field get_higher_with_slack(const Wide& xs)
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
@@ -166,44 +181,44 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr Wide sub_modulus_squared(const Wide& xs)
static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_wide_storage modulus = get_modulus_squared<REDUCTION_SIZE>();
Wide rs = {};
return sub_limbs<true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
return sub_limbs<2 * TLC, true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Wide neg(const Wide& xs)
static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs)
{
const ff_wide_storage modulus = get_modulus_squared<MODULUS_MULTIPLE>();
Wide rs = {};
sub_limbs<false>(modulus, xs.limbs_storage, rs.limbs_storage);
sub_limbs<2 * TLC, false>(modulus, xs.limbs_storage, rs.limbs_storage);
return rs;
}
friend Wide operator+(Wide xs, const Wide& ys)
friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
{
Wide rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
add_limbs<2 * TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus_squared<1>(rs);
}
friend Wide operator-(Wide xs, const Wide& ys)
friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
{
Wide rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
uint32_t carry = sub_limbs<2 * TLC, true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
if (carry == 0) return rs;
const ff_wide_storage modulus = get_modulus_squared<1>();
add_limbs<false>(rs.limbs_storage, modulus, rs.limbs_storage);
add_limbs<2 * TLC, false>(rs.limbs_storage, modulus, rs.limbs_storage);
return rs;
}
};
// return modulus multiplied by 1, 2 or 4
template <unsigned MULTIPLIER = 1>
static constexpr ff_storage get_modulus()
static constexpr HOST_DEVICE_INLINE ff_storage get_modulus()
{
switch (MULTIPLIER) {
case 1:
@@ -217,18 +232,12 @@ public:
}
}
template <unsigned MULTIPLIER = 1>
static constexpr ff_wide_storage modulus_wide()
{
return CONFIG::modulus_wide;
}
// return m
static constexpr ff_storage get_m() { return CONFIG::m; }
static constexpr HOST_DEVICE_INLINE ff_storage get_m() { return CONFIG::m; }
// return modulus^2, helpful for ab +/- cd
template <unsigned MULTIPLIER = 1>
static constexpr ff_wide_storage get_modulus_squared()
static constexpr HOST_DEVICE_INLINE ff_wide_storage get_modulus_squared()
{
switch (MULTIPLIER) {
case 1:
@@ -242,12 +251,11 @@ public:
}
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr uint32_t
add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r)
{
r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
for (unsigned i = 1; i < n; i++)
for (unsigned i = 1; i < NLIMBS; i++)
r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
if (!CARRY_OUT) {
ptx::addc(0, 0);
@@ -256,86 +264,50 @@ public:
return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
}
// add or subtract limbs
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr uint32_t
add_sub_limbs_device(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr DEVICE_INLINE uint32_t
add_sub_limbs_device(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, TLC);
return add_sub_u32_device<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr uint32_t
add_sub_limbs_device(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
return add_sub_u32_device<SUBTRACT, CARRY_OUT>(x, y, r, 2 * TLC);
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr uint32_t add_sub_limbs_host(const ff_storage& xs, const ff_storage& ys, ff_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
host_math::carry_chain<TLC, false, CARRY_OUT> chain;
for (unsigned i = 0; i < TLC; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}
template <bool SUBTRACT, bool CARRY_OUT>
static constexpr uint32_t
add_sub_limbs_host(const ff_wide_storage& xs, const ff_wide_storage& ys, ff_wide_storage& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
host_math::carry_chain<2 * TLC, false, CARRY_OUT> chain;
for (unsigned i = 0; i < 2 * TLC; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}
template <bool CARRY_OUT, typename T>
static constexpr uint32_t add_limbs(const T& xs, const T& ys, T& rs)
template <unsigned NLIMBS, bool CARRY_OUT>
static constexpr HOST_DEVICE_INLINE uint32_t
add_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<false, CARRY_OUT>(xs, ys, rs);
return add_sub_limbs_device<NLIMBS, false, CARRY_OUT>(xs, ys, rs);
#else
return add_sub_limbs_host<false, CARRY_OUT>(xs, ys, rs);
return host_math::template add_sub_limbs<NLIMBS, false, CARRY_OUT>(xs, ys, rs);
#endif
}
template <bool CARRY_OUT, typename T>
static constexpr uint32_t sub_limbs(const T& xs, const T& ys, T& rs)
template <unsigned NLIMBS, bool CARRY_OUT>
static constexpr HOST_DEVICE_INLINE uint32_t
sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
#ifdef __CUDA_ARCH__
return add_sub_limbs_device<true, CARRY_OUT>(xs, ys, rs);
return add_sub_limbs_device<NLIMBS, true, CARRY_OUT>(xs, ys, rs);
#else
return add_sub_limbs_host<true, CARRY_OUT>(xs, ys, rs);
return host_math::template add_sub_limbs<NLIMBS, true, CARRY_OUT>(xs, ys, rs);
#endif
}
static void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
UNROLL
for (size_t i = 0; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
}
}
static void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
{
UNROLL
for (size_t i = start_i; i < n; i += 2) {
acc[i] = ptx::mul_lo(a[i], bi);
acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -343,14 +315,14 @@ public:
}
template <bool CARRY_IN = false>
static void
static DEVICE_INLINE void
cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
{
if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -358,7 +330,7 @@ public:
}
template <bool EVEN_PHASE>
static void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (EVEN_PHASE) {
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -367,14 +339,14 @@ public:
acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
}
UNROLL
for (size_t i = 2; i < n; i += 2) {
acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
}
}
static void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
if (n > 1)
acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -382,7 +354,7 @@ public:
acc[0] = ptx::mad_lo(a[0], bi, acc[0]);
size_t i;
UNROLL
for (i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
if (i == n - 2)
@@ -394,7 +366,7 @@ public:
}
template <bool CARRY_OUT = false, bool CARRY_IN = false>
static uint32_t mad_row(
static DEVICE_INLINE uint32_t mad_row(
uint32_t* odd,
uint32_t* even,
const uint32_t* a,
@@ -419,7 +391,7 @@ public:
}
template <bool EVEN_PHASE>
static void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -428,7 +400,7 @@ public:
odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
}
static void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
{
// bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
// modulus)
@@ -439,12 +411,12 @@ public:
return;
}
static uint32_t
static DEVICE_INLINE uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
UNROLL
for (size_t i = 1; i < n - 1; i += 2) {
acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -467,19 +439,19 @@ public:
* \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
* (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
*/
static void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
if constexpr (TLC > 1) {
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* even = rs.limbs;
uint32_t odd[2 * TLC - 2];
__align__(16) uint32_t odd[2 * TLC - 2];
even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
size_t i;
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
@@ -504,7 +476,7 @@ public:
* is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
* limb products are included.
*/
static void
static DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
@@ -514,13 +486,13 @@ public:
uint32_t* even = rs.limbs;
if constexpr (TLC > 2) {
uint32_t odd[TLC - 1];
__align__(16) uint32_t odd[TLC - 1];
size_t i;
// `b[0]` is \f$ 2^{32} \f$ minus the lowest limb of the prime modulus. Because most scalar (and some base)
// primes need to be NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
// less efficient SASS generated by nvcc, so this case needs separate handling.
if (b[0] == UINT32_MAX) {
add_sub_u32_device<true, false>(c, a, even, TLC);
add_sub_u32_device<TLC, true, false>(c, a, even);
for (i = 0; i < TLC - 1; i++)
odd[i] = a[i];
} else {
@@ -528,6 +500,7 @@ public:
mul_n(odd, a + 1, b[0], TLC - 1);
}
mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
UNROLL
for (i = 2; i < TLC - 1; i += 2) {
mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
@@ -557,15 +530,15 @@ public:
* that the top bits of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing the
* result to be kept inside TLC limbs and the carries from the highest limb to be ignored.
*/
static void
static DEVICE_INLINE void
multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
{
uint32_t odd[TLC - 2];
__align__(16) uint32_t odd[TLC - 2];
uint32_t first_row_carry = mul_n_and_add(even, a, b[0], in1);
uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
carry = mad_row<true, false>(
&even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -586,15 +559,15 @@ public:
* This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
* long. The usual schoolbook algorithm is used.
*/
static void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
{
uint32_t odd[TLC - 2];
__align__(16) uint32_t odd[TLC - 2];
mul_n(even, a, b[0], TLC >> 1);
mul_n(odd, a + 1, b[0], TLC >> 1);
mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);
size_t i;
UNROLL
for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -613,7 +586,7 @@ public:
* with so far. This method implements [subtractive
* Karatsuba](https://en.wikipedia.org/wiki/Karatsuba_algorithm#Implementation).
*/
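// In formulas, writing B = 2^{16 * TLC}, a = a_hi * B + a_lo and b = b_hi * B + b_lo, the code below
// follows the subtractive-Karatsuba identity:
//   a * b = a_lo * b_lo + ((a_hi - a_lo) * (b_lo - b_hi) + a_lo * b_lo + a_hi * b_hi) * B + a_hi * b_hi * B^2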
static void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static DEVICE_INLINE void multiply_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
@@ -623,27 +596,28 @@ public:
// write the results into `r`.
multiply_short_raw_device(a, b, r);
multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
uint32_t middle_part[TLC];
uint32_t diffs[TLC];
__align__(16) uint32_t middle_part[TLC];
__align__(16) uint32_t diffs[TLC];
// Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \f$ are written into `diffs`, and their signs are written to
// `carry1` and `carry2`.
uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
uint32_t carry1 = add_sub_u32_device<(TLC >> 1), true, true>(&a[TLC >> 1], a, diffs);
uint32_t carry2 = add_sub_u32_device<(TLC >> 1), true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
// Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
// This is where the assumption about unset high bit of `a` and `b` is relevant.
multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
// Corrections that need to be performed when differences are negative.
// Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
if (carry1)
add_sub_u32_device<(TLC >> 1), true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
if (carry2) add_sub_u32_device<(TLC >> 1), true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
// Now that middle part is fully correct, it can be added to the result.
add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);
add_sub_u32_device<TLC, false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1]);
// Carry from adding middle part has to be propagated to the highest limb.
for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
r[i] = ptx::addc_cc(r[i], 0);
} else if (TLC == 2) {
uint32_t odd[2];
__align__(8) uint32_t odd[2];
r[0] = ptx::mul_lo(a[0], b[0]);
r[1] = ptx::mul_hi(a[0], b[0]);
r[2] = ptx::mul_lo(a[1], b[1]);
@@ -661,56 +635,43 @@ public:
}
}
static void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
for (unsigned i = 0; i < TLC; i++) {
uint32_t carry = 0;
for (unsigned j = 0; j < TLC; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[TLC + i] = carry;
}
}
static void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static HOST_DEVICE_INLINE void multiply_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_raw_device(as, bs, rs);
#else
return multiply_raw_host(as, bs, rs);
return host_math::template multiply_raw<TLC>(as, bs, rs);
#endif
}
static void
static HOST_DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
#else
Wide r_wide = {};
multiply_raw_host(as, get_neg_modulus(), r_wide.limbs_storage);
host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
Field r = Wide::get_lower(r_wide);
add_limbs<false>(cs, r.limbs_storage, rs);
add_limbs<TLC, false>(cs, r.limbs_storage, rs);
#endif
}
static void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
static HOST_DEVICE_INLINE void multiply_msb_raw(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_msb_raw_device(as, bs, rs);
#else
return multiply_raw_host(as, bs, rs);
return host_math::template multiply_raw<TLC>(as, bs, rs);
#endif
}
public:
ff_storage limbs_storage;
uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }
unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
{
const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
const uint32_t shift_bits = (digit_num * digit_width) % 32;
@@ -722,7 +683,7 @@ public:
return rv;
}
static Field rand_host()
static HOST_INLINE Field rand_host()
{
std::random_device rd;
std::mt19937_64 generator(rd());
@@ -742,12 +703,12 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr Field sub_modulus(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field& xs)
{
if (REDUCTION_SIZE == 0) return xs;
const ff_storage modulus = get_modulus<REDUCTION_SIZE>();
Field rs = {};
return sub_limbs<true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
return sub_limbs<TLC, true>(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs;
}
friend std::ostream& operator<<(std::ostream& os, const Field& xs)
@@ -763,25 +724,25 @@ public:
return os;
}
friend Field operator+(Field xs, const Field& ys)
friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys)
{
Field rs = {};
add_limbs<false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
add_limbs<TLC, false>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
return sub_modulus<1>(rs);
}
friend Field operator-(Field xs, const Field& ys)
friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys)
{
Field rs = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
uint32_t carry = sub_limbs<TLC, true>(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
if (carry == 0) return rs;
const ff_storage modulus = get_modulus<1>();
add_limbs<false>(rs.limbs_storage, modulus, rs.limbs_storage);
add_limbs<TLC, false>(rs.limbs_storage, modulus, rs.limbs_storage);
return rs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Wide mul_wide(const Field& xs, const Field& ys)
static constexpr HOST_DEVICE_INLINE Wide mul_wide(const Field& xs, const Field& ys)
{
Wide rs = {};
multiply_raw(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage);
@@ -810,7 +771,7 @@ public:
* will cause only 1 reduction to be performed.
*/
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Field reduce(const Wide& xs)
static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs)
{
// `xs` is left-shifted by `2 * slack_bits` and higher half is written to `xs_hi`
Field xs_hi = Wide::get_higher_with_slack(xs);
@@ -826,28 +787,36 @@ public:
uint32_t carry;
// As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
if (num_of_reductions() == 2) {
carry = sub_limbs<true>(r.limbs_storage, get_modulus<2>(), r_reduced);
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<2>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
}
carry = sub_limbs<true>(r.limbs_storage, get_modulus<1>(), r_reduced);
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<1>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
return r;
}
friend Field operator*(const Field& xs, const Field& ys)
HOST_DEVICE_INLINE Field& operator=(Field const& other)
{
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
return *this;
}
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult
return reduce(xy); // reduce mod p
}
friend bool operator==(const Field& xs, const Field& ys)
friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys)
{
#ifdef __CUDA_ARCH__
const uint32_t* x = xs.limbs_storage.limbs;
const uint32_t* y = ys.limbs_storage.limbs;
uint32_t limbs_or = x[0] ^ y[0];
UNROLL
for (unsigned i = 1; i < TLC; i++)
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
@@ -858,15 +827,15 @@ public:
#endif
}
friend bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
friend HOST_DEVICE_INLINE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); }
template <const Field& multiplier>
static Field mul_const(const Field& xs)
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -876,13 +845,13 @@ public:
}
template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
static constexpr T mul_unsigned(const T& xs)
static constexpr HOST_DEVICE_INLINE T mul_unsigned(const T& xs)
{
T rs = {};
T temp = xs;
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
@@ -896,45 +865,45 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Wide sqr_wide(const Field& xs)
static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const Field& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Field sqr(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field sqr(const Field& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
static constexpr Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr HOST_DEVICE_INLINE Field to_montgomery(const Field& xs) { return xs * Field{CONFIG::montgomery_r}; }
static constexpr Field from_montgomery(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field from_montgomery(const Field& xs)
{
return xs * Field{CONFIG::montgomery_r_inv};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Field neg(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field neg(const Field& xs)
{
const ff_storage modulus = get_modulus<MODULUS_MULTIPLE>();
Field rs = {};
sub_limbs<false>(modulus, xs.limbs_storage, rs.limbs_storage);
sub_limbs<TLC, false>(modulus, xs.limbs_storage, rs.limbs_storage);
return rs;
}
// Assumes the number is even!
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr Field div2(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
{
const uint32_t* x = xs.limbs_storage.limbs;
Field rs = {};
uint32_t* r = rs.limbs_storage.limbs;
if constexpr (TLC > 1) {
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC - 1; i++) {
#ifdef __CUDA_ARCH__
@@ -948,18 +917,18 @@ public:
return sub_modulus<MODULUS_MULTIPLE>(rs);
}
static constexpr bool lt(const Field& xs, const Field& ys)
static constexpr HOST_DEVICE_INLINE bool lt(const Field& xs, const Field& ys)
{
ff_storage dummy = {};
uint32_t carry = sub_limbs<true>(xs.limbs_storage, ys.limbs_storage, dummy);
uint32_t carry = sub_limbs<TLC, true>(xs.limbs_storage, ys.limbs_storage, dummy);
return carry;
}
static constexpr bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE bool is_odd(const Field& xs) { return xs.limbs_storage.limbs[0] & 1; }
static constexpr bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr HOST_DEVICE_INLINE bool is_even(const Field& xs) { return ~xs.limbs_storage.limbs[0] & 1; }
static constexpr Field inverse(const Field& xs)
static constexpr HOST_DEVICE_INLINE Field inverse(const Field& xs)
{
if (xs == zero()) return zero();
constexpr Field one = Field{CONFIG::one};
@@ -971,12 +940,12 @@ public:
while (!(u == one) && !(v == one)) {
while (is_even(u)) {
u = div2(u);
if (is_odd(b)) add_limbs<false>(b.limbs_storage, modulus, b.limbs_storage);
if (is_odd(b)) add_limbs<TLC, false>(b.limbs_storage, modulus, b.limbs_storage);
b = div2(b);
}
while (is_even(v)) {
v = div2(v);
if (is_odd(c)) add_limbs<false>(c.limbs_storage, modulus, c.limbs_storage);
if (is_odd(c)) add_limbs<TLC, false>(c.limbs_storage, modulus, c.limbs_storage);
c = div2(c);
}
if (lt(v, u)) {
@@ -989,6 +958,17 @@ public:
}
return (u == one) ? b : c;
}
static constexpr HOST_DEVICE_INLINE Field pow(Field base, int exp)
{
Field res = one();
while (exp > 0) {
if (exp & 1) res = res * base;
base = base * base;
exp >>= 1;
}
return res;
}
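// e.g. pow(x, 10) (binary 1010) performs four squarings and two multiplications, and pow(x, 0)
// returns one(). The exponent is a plain int, so this is intended for small exponents rather than
// field-sized ones.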
};
template <class CONFIG>
@@ -1006,9 +986,9 @@ struct std::hash<Field<CONFIG>> {
template <class CONFIG>
struct SharedMemory<Field<CONFIG>> {
Field<CONFIG>* getPointer()
__device__ Field<CONFIG>* getPointer()
{
Field<CONFIG> *s_scalar_;
extern __shared__ Field<CONFIG> s_scalar_[];
return s_scalar_;
}
};
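A minimal host-side sketch of the resulting Field API (assumes a build that selects BLS12-381, so that field_config resolves to bls12_381 as in field_config.cuh below; the function name and checks are illustrative only):

#include <cassert>
#include "fields/field_config.cuh"

using FF = Field<field_config::fp_config>;

void field_smoke_test()
{
  FF a = FF::rand_host();
  FF b = FF::rand_host();
  FF c = a * b;                    // mul_wide followed by reduce
  assert(c * FF::inverse(a) == b); // holds for a != 0; inverse(0) is defined as 0
  assert(FF::from_montgomery(FF::to_montgomery(a)) == a);
}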


@@ -2,8 +2,8 @@
#ifndef FIELD_CONFIG_H
#define FIELD_CONFIG_H
#include "id.h"
#include "field.cuh"
#include "fields/id.h"
#include "fields/field.cuh"
/**
* @namespace field_config
@@ -11,28 +11,31 @@
* with the `-DFIELD` flag passed during build.
*/
#if FIELD_ID == BN254
#include "snark_fields/bn254_scalar.cuh"
#include "fields/snark_fields/bn254_scalar.cuh"
namespace field_config = bn254;
#elif FIELD_ID == BLS12_381
#include "snark_fields/bls12_381_scalar.cuh"
#include "fields/snark_fields/bls12_381_scalar.cuh"
using bls12_381::fp_config;
namespace field_config = bls12_381;
#elif FIELD_ID == BLS12_377
#include "snark_fields/bls12_377_scalar.cuh"
#include "fields/snark_fields/bls12_377_scalar.cuh"
namespace field_config = bls12_377;
#elif FIELD_ID == BW6_761
#include "snark_fields/bw6_761_scalar.cuh"
#include "fields/snark_fields/bw6_761_scalar.cuh"
namespace field_config = bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "snark_fields/grumpkin_scalar.cuh"
#include "fields/snark_fields/grumpkin_scalar.cuh"
namespace field_config = grumpkin;
#elif FIELD_ID == BABY_BEAR
#include "stark_fields/babybear.cuh"
#include "fields/stark_fields/babybear.cuh"
namespace field_config = babybear;
#elif FIELD_ID == STARK_252
#include "stark_fields/stark252.cuh"
#include "fields/stark_fields/stark252.cuh"
namespace field_config = stark252;
#elif FIELD_ID == M31
#include "fields/stark_fields/m31.cuh"
namespace field_config = m31;
#endif
#endif


@@ -3,97 +3,186 @@
#define HOST_MATH_H
#include <cstdint>
#include "../gpu-utils/modifiers.cuh"
#include <cuda_runtime.h>
#include "gpu-utils/modifiers.cuh"
#include "storage.cuh"
namespace host_math {
// return x + y with uint32_t operands
static uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y with uint32_t operands
static constexpr __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y + carry with uint32_t operands
static uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry) { return x + y + carry; }
// return x + y and carry out with uint32_t operands
static uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
uint32_t result;
result = x + y;
carry = x > result;
return result;
}
// return x + y + carry and carry out with uint32_t operands
static uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
}
// return x - y with uint32_t operands
static uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow) { return x - y - borrow; }
// return x - y and borrow out with uint32_t operands
static uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result;
result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint32_t result;
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr carry_chain() : index(0) {}
uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y + carry with uint32_t operands
static constexpr __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
return x + y + carry;
}
uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y and carry out with uint32_t operands
static constexpr __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
uint32_t result = x + y;
carry = x > result;
return result;
}
// return x + y + carry and carry out with uint32_t operands
static constexpr __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
{
const uint32_t result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
}
// return x - y with uint32_t operands
static constexpr __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y - borrow with uint32_t operands
static constexpr __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow)
{
return x - y - borrow;
}
// return x - y and borrow out with uint32_t operands
static constexpr __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
uint32_t result = x - y;
borrow = x < result;
return result;
}
// return x - y - borrow and borrow out with uint32_t operands
static constexpr __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
{
const uint32_t result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
// return x * y + z + carry and carry out with uint32_t operands
static constexpr __host__ uint32_t madc_cc(const uint32_t x, const uint32_t y, const uint32_t z, uint32_t& carry)
{
uint64_t r = static_cast<uint64_t>(x) * y + z + carry;
carry = (uint32_t)(r >> 32);
uint32_t result = r & 0xffffffff;
return result;
}
template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;
constexpr HOST_INLINE carry_chain() : index(0) {}
constexpr HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::add(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::add_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::addc_cc(x, y, carry);
else
return host_math::addc(x, y, carry);
}
constexpr HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
return host_math::sub(x, y);
else if (index == 1 && !CARRY_IN)
return host_math::sub_cc(x, y, carry);
else if (index < OPS_COUNT || CARRY_OUT)
return host_math::subc_cc(x, y, carry);
else
return host_math::subc(x, y, carry);
}
};
template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static constexpr HOST_INLINE void
multiply_raw(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
for (unsigned i = 0; i < NLIMBS_B; i++) {
uint32_t carry = 0;
for (unsigned j = 0; j < NLIMBS_A; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[NLIMBS_A + i] = carry;
}
}
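// A minimal usage sketch of the schoolbook multiply above (the function name and values are
// examples only): 0x1ffffffff * 2 == 0x3fffffffe.
static constexpr HOST_INLINE storage<4> multiply_raw_example()
{
  constexpr storage<2> a = {0xffffffff, 0x00000001}; // 0x1ffffffff, little-endian limbs
  constexpr storage<2> b = {0x00000002, 0x00000000}; // 2
  storage<4> r = {};                                 // r must start zeroed: products are accumulated into it
  multiply_raw<2>(a, b, r);                          // r.limbs == {0xfffffffe, 0x00000003, 0, 0}
  return r;
}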
template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t
add_sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}
template <unsigned NLIMBS, unsigned BITS>
static constexpr HOST_INLINE storage<NLIMBS> left_shift(const storage<NLIMBS>& xs)
{
if constexpr (BITS == 0)
return xs;
else {
constexpr unsigned BITS32 = BITS % 32;
constexpr unsigned LIMBS_GAP = BITS / 32;
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS) {
out.limbs[LIMBS_GAP] = xs.limbs[0] << BITS32;
for (unsigned i = 1; i < NLIMBS - LIMBS_GAP; i++)
out.limbs[i + LIMBS_GAP] = (xs.limbs[i] << BITS32) + (xs.limbs[i - 1] >> (32 - BITS32));
}
return out;
}
}
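// e.g. left_shift<2, 40>({0x00000003, 0x00000000}) == {0x00000000, 0x00000300}: a 40-bit shift moves
// the value up one limb (LIMBS_GAP = 1) and by the remaining BITS32 = 8 bits.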
template <unsigned NLIMBS, unsigned BITS>
static constexpr HOST_INLINE storage<NLIMBS> right_shift(const storage<NLIMBS>& xs)
{
if constexpr (BITS == 0)
return xs;
else {
constexpr unsigned BITS32 = BITS % 32;
constexpr unsigned LIMBS_GAP = BITS / 32;
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS - 1) {
for (unsigned i = 0; i < NLIMBS - LIMBS_GAP - 1; i++)
out.limbs[i] = (xs.limbs[i + LIMBS_GAP] >> BITS32) + (xs.limbs[i + LIMBS_GAP + 1] << (32 - BITS32));
}
if constexpr (LIMBS_GAP < NLIMBS) out.limbs[NLIMBS - LIMBS_GAP - 1] = (xs.limbs[NLIMBS - 1] >> BITS32);
return out;
}
}
template <unsigned NLIMBS_NUM, unsigned NLIMBS_DENOM, unsigned NLIMBS_Q = (NLIMBS_NUM - NLIMBS_DENOM)>
static constexpr HOST_INLINE void integer_division(
const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
{
storage<NLIMBS_DENOM> temp = {};
for (int limb_idx = NLIMBS_NUM - 1; limb_idx >= 0; limb_idx--) {
for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
r = left_shift<NLIMBS_DENOM, 1>(r);
r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
uint32_t c = add_sub_limbs<NLIMBS_DENOM, true, true>(r, denom, temp);
if (limb_idx < NLIMBS_Q && !c) {
r = temp;
q.limbs[limb_idx] |= 1 << bit_idx;
}
}
}
}
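// A small usage sketch (example values only): dividing the two-limb value 0x100000005 by 3.
// q and r must start zeroed, since quotient bits are OR'ed in and r is the running remainder.
//   storage<2> num = {0x00000005, 0x00000001};
//   storage<1> denom = {0x00000003};
//   storage<1> q = {}, r = {};
//   integer_division<2, 1>(num, denom, q, r); // q = {0x55555557}, r = {0x00000000}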
};
} // namespace host_math
#endif
#endif


@@ -10,5 +10,6 @@
#define BABY_BEAR 1001
#define STARK_252 1002
#define M31 1003
#endif


@@ -0,0 +1,129 @@
#pragma once
#ifndef PARAMS_GEN_H
#define PARAMS_GEN_H
#include "storage.cuh"
#include "host_math.cuh"
namespace params_gen {
template <unsigned NLIMBS, unsigned BIT_SHIFT>
static constexpr HOST_INLINE storage<2 * NLIMBS> get_square(const storage<NLIMBS>& xs)
{
storage<2 * NLIMBS> rs = {};
host_math::template multiply_raw<NLIMBS>(xs, xs, rs);
return host_math::template left_shift<2 * NLIMBS, BIT_SHIFT>(rs);
}
template <unsigned NLIMBS>
static constexpr HOST_INLINE storage<NLIMBS>
get_difference_no_carry(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys)
{
storage<NLIMBS> rs = {};
host_math::template add_sub_limbs<NLIMBS, true, false>(xs, ys, rs);
return rs;
}
template <unsigned NLIMBS, unsigned EXP>
static constexpr HOST_INLINE storage<NLIMBS> get_m(const storage<NLIMBS>& modulus)
{
storage<NLIMBS> rs = {};
storage<NLIMBS> qs = {};
storage<2 * NLIMBS> wide_one = {1};
storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, EXP>(wide_one);
host_math::template integer_division<2 * NLIMBS, NLIMBS>(pow_of_2, modulus, qs, rs);
return qs;
}
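// get_m returns the quotient-estimation constant floor(2^EXP / modulus). The PARAMS macro below
// instantiates it with EXP = 2 * modulus_bit_count, which is the constant Field::get_m() in
// field.cuh exposes to the reduction code.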
template <unsigned NLIMBS, bool INV>
static constexpr HOST_INLINE storage<NLIMBS> get_montgomery_constant(const storage<NLIMBS>& modulus)
{
storage<NLIMBS> rs = {1};
for (int i = 0; i < 32 * NLIMBS; i++) {
if (INV) {
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
rs = host_math::template right_shift<NLIMBS, 1>(rs);
} else {
rs = host_math::template left_shift<NLIMBS, 1>(rs);
storage<NLIMBS> temp = {};
rs = host_math::template add_sub_limbs<NLIMBS, true, true>(rs, modulus, temp) ? rs : temp;
}
}
return rs;
}
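// With INV = false this computes R = 2^(32 * NLIMBS) mod modulus by repeated doubling with a
// conditional subtraction; with INV = true it computes R^{-1} mod modulus by repeated halving,
// adding the (odd) modulus first whenever rs is odd. As a result,
// Field::from_montgomery(Field::to_montgomery(x)) == x.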
constexpr unsigned floorlog2(uint32_t x) { return x == 1 ? 0 : 1 + floorlog2(x >> 1); }
template <unsigned NLIMBS, unsigned NBITS>
constexpr unsigned num_of_reductions(const storage<NLIMBS>& modulus, const storage<NLIMBS>& m)
{
storage<2 * NLIMBS> x1 = {};
storage<3 * NLIMBS> x2 = {};
storage<3 * NLIMBS> x3 = {};
host_math::template multiply_raw<NLIMBS>(modulus, m, x1);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, x1, x2);
storage<2 * NLIMBS> one = {1};
storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, NBITS>(one);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, pow_of_2, x3);
host_math::template add_sub_limbs<3 * NLIMBS, true, false>(x3, x2, x2);
double err = (double)x2.limbs[2 * NLIMBS - 1] / pow_of_2.limbs[2 * NLIMBS - 1];
err += (double)m.limbs[NLIMBS - 1] / 0xffffffff;
err += (double)NLIMBS / 0x80000000;
return unsigned(err) + 1;
}
template <unsigned NLIMBS>
constexpr unsigned two_adicity(const storage<NLIMBS>& modulus)
{
unsigned two_adicity = 1;
storage<NLIMBS> temp = host_math::template right_shift<NLIMBS, 1>(modulus);
while (!(temp.limbs[0] & 1)) {
temp = host_math::template right_shift<NLIMBS, 1>(temp);
two_adicity++;
}
return two_adicity;
}
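// e.g. for the BabyBear prime 2^31 - 2^27 + 1, modulus - 1 = 2^27 * 15, so two_adicity returns 27.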
template <unsigned NLIMBS, unsigned TWO_ADICITY>
constexpr storage_array<TWO_ADICITY, NLIMBS> get_invs(const storage<NLIMBS>& modulus)
{
storage_array<TWO_ADICITY, NLIMBS> invs = {};
storage<NLIMBS> rs = {1};
for (int i = 0; i < TWO_ADICITY; i++) {
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
rs = host_math::template right_shift<NLIMBS, 1>(rs);
invs.storages[i] = rs;
}
return invs;
}
} // namespace params_gen
#define PARAMS(modulus) \
static constexpr unsigned limbs_count = modulus.LC; \
static constexpr unsigned modulus_bit_count = \
32 * (limbs_count - 1) + params_gen::floorlog2(modulus.limbs[limbs_count - 1]) + 1; \
static constexpr storage<limbs_count> zero = {}; \
static constexpr storage<limbs_count> one = {1}; \
static constexpr storage<limbs_count> modulus_2 = host_math::template left_shift<limbs_count, 1>(modulus); \
static constexpr storage<limbs_count> modulus_4 = host_math::template left_shift<limbs_count, 1>(modulus_2); \
static constexpr storage<limbs_count> neg_modulus = \
params_gen::template get_difference_no_carry<limbs_count>(zero, modulus); \
static constexpr storage<2 * limbs_count> modulus_squared = \
params_gen::template get_square<limbs_count, 0>(modulus); \
static constexpr storage<2 * limbs_count> modulus_squared_2 = \
host_math::template left_shift<2 * limbs_count, 1>(modulus_squared); \
static constexpr storage<2 * limbs_count> modulus_squared_4 = \
host_math::template left_shift<2 * limbs_count, 1>(modulus_squared_2); \
static constexpr storage<limbs_count> m = params_gen::template get_m<limbs_count, 2 * modulus_bit_count>(modulus); \
static constexpr storage<limbs_count> montgomery_r = \
params_gen::template get_montgomery_constant<limbs_count, false>(modulus); \
static constexpr storage<limbs_count> montgomery_r_inv = \
params_gen::template get_montgomery_constant<limbs_count, true>(modulus); \
static constexpr unsigned num_of_reductions = \
params_gen::template num_of_reductions<limbs_count, 2 * modulus_bit_count>(modulus, m);
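// A minimal sketch of how a field config consumes PARAMS (the namespace and struct names are
// placeholders; the modulus shown is the BabyBear prime 2^31 - 2^27 + 1, and real configs live
// under fields/snark_fields and fields/stark_fields):
namespace params_gen_example {
  struct toy_fp_config {
    static constexpr storage<1> modulus = {0x78000001};
    PARAMS(modulus)
  };
} // namespace params_gen_example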
#define TWIDDLES(modulus, rou) \
static constexpr unsigned omegas_count = params_gen::template two_adicity<limbs_count>(modulus); \
static constexpr storage_array<omegas_count, limbs_count> inv = \
params_gen::template get_invs<limbs_count, omegas_count>(modulus);
#endif


@@ -1,119 +1,139 @@
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
namespace ptx {
uint32_t add(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm("add.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t add_cc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t add_cc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t addc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t addc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("addc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t addc_cc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t addc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t sub(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm("sub.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t sub_cc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t sub_cc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("sub.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t subc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t subc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("subc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t subc_cc(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t subc_cc(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm volatile("subc.cc.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t mul_lo(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t mul_lo(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm("mul.lo.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t mul_hi(const uint32_t x, const uint32_t y)
__device__ __forceinline__ uint32_t mul_hi(const uint32_t x, const uint32_t y)
{
uint32_t result = 0;
uint32_t result;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(result) : "r"(x), "r"(y));
return result;
}
uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t mad_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm("mad.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t mad_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm("mad.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t mad_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t mad_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t madc_lo(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("madc.lo.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t madc_hi(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t madc_lo_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
__device__ __forceinline__ uint32_t madc_hi_cc(const uint32_t x, const uint32_t y, const uint32_t z)
{
uint32_t result = 0;
uint32_t result;
asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(result) : "r"(x), "r"(y), "r"(z));
return result;
}
uint64_t mov_b64(uint32_t lo, uint32_t hi)
__device__ __forceinline__ uint64_t mov_b64(uint32_t lo, uint32_t hi)
{
uint64_t result = 0;
uint64_t result;
asm("mov.b64 %0, {%1,%2};" : "=l"(result) : "r"(lo), "r"(hi));
return result;
}
@@ -121,124 +141,142 @@ namespace ptx {
// Callers should know exactly what they're calling (no implicit conversions).
namespace u64 {
uint64_t add(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t add(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm("add.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t add_cc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t add_cc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t addc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t addc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("addc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t addc_cc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t addc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t sub(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t sub(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm("sub.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t sub_cc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t sub_cc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t subc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t subc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("subc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t subc_cc(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t subc_cc(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t mul_lo(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t mul_lo(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm("mul.lo.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t mul_hi(const uint64_t x, const uint64_t y)
__device__ __forceinline__ uint64_t mul_hi(const uint64_t x, const uint64_t y)
{
uint64_t result = 0;
uint64_t result;
asm("mul.hi.u64 %0, %1, %2;" : "=l"(result) : "l"(x), "l"(y));
return result;
}
uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t mad_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm("mad.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t mad_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm("mad.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t mad_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("mad.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t mad_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("mad.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t madc_lo(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("madc.lo.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t madc_hi(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("madc.hi.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t madc_lo_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("madc.lo.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
__device__ __forceinline__ uint64_t madc_hi_cc(const uint64_t x, const uint64_t y, const uint64_t z)
{
uint64_t result = 0;
uint64_t result;
asm volatile("madc.hi.cc.u64 %0, %1, %2, %3;" : "=l"(result) : "l"(x), "l"(y), "l"(z));
return result;
}
} // namespace u64
void bar_arrive(const unsigned name, const unsigned count)
__device__ __forceinline__ void bar_arrive(const unsigned name, const unsigned count)
{
return;
asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(count) : "memory");
}
void bar_sync(const unsigned name, const unsigned count)
__device__ __forceinline__ void bar_sync(const unsigned name, const unsigned count)
{
return;
asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(count) : "memory");
}
} // namespace ptx


@@ -1,54 +1,54 @@
#pragma once
#include "field.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
template <typename CONFIG>
template <typename CONFIG, class T>
class ExtensionField
{
private:
friend Field<CONFIG>;
friend T;
typedef typename Field<CONFIG>::Wide FWide;
typedef typename T::Wide FWide;
struct ExtensionWide {
FWide real;
FWide imaginary;
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
};
public:
typedef Field<CONFIG> FF;
typedef T FF;
static constexpr unsigned TLC = 2 * CONFIG::limbs_count;
FF real;
FF imaginary;
static constexpr ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField zero() { return ExtensionField{FF::zero(), FF::zero()}; }
static constexpr ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr HOST_DEVICE_INLINE ExtensionField one() { return ExtensionField{FF::one(), FF::zero()}; }
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r}, xs.imaginary * FF{CONFIG::montgomery_r}};
}
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}};
}
static ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }
static void rand_host_many(ExtensionField* out, int size)
{
@@ -57,7 +57,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{FF::sub_modulus<REDUCTION_SIZE>(xs.real), FF::sub_modulus<REDUCTION_SIZE>(xs.imaginary)};
}
@@ -68,38 +68,38 @@ public:
return os;
}
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.imaginary + ys.imaginary};
}
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
}
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.imaginary};
}
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
}
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.imaginary};
}
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.imaginary};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
FWide real_prod = FF::mul_wide(xs.real, ys.real);
FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
@@ -110,40 +110,40 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return mul_wide(ys, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField reduce(const ExtensionWide& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
}
template <class T1, class T2>
friend ExtensionField operator*(const T1& xs, const T2& ys)
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.imaginary == ys.imaginary);
}
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <const ExtensionField& multiplier>
static ExtensionField mul_const(const ExtensionField& xs)
static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField& xs)
{
static constexpr FF mul_real = multiplier.real;
static constexpr FF mul_imaginary = multiplier.imaginary;
@@ -159,33 +159,33 @@ public:
}
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.imaginary)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField sqr(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField neg(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
{
return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr ExtensionField inverse(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
{
ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
@@ -196,11 +196,11 @@ public:
}
};
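The inverse hunk just above follows the usual conjugate-over-norm identity for a quadratic extension F_p[u]/(u^2 - beta): (a + b*u)^-1 = (a - b*u) / (a^2 - beta*b^2), with zero mapped to zero. The self-contained toy program below (small prime p = 17 and an assumed nonresidue beta = 3, not library code) checks that identity exhaustively:

// Exhaustive check of (a + b*u)^-1 = (a - b*u) / (a^2 - BETA*b^2) over F_17 with u^2 = 3.
// Illustrative sketch only; 3 is a quadratic nonresidue mod 17, so F_17[u]/(u^2 - 3) is a field.
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint64_t P = 17, BETA = 3;

uint64_t addp(uint64_t a, uint64_t b) { return (a + b) % P; }
uint64_t subp(uint64_t a, uint64_t b) { return (a + P - b) % P; }
uint64_t mulp(uint64_t a, uint64_t b) { return (a * b) % P; }
uint64_t invp(uint64_t a) // Fermat inverse: a^(P-2) mod P
{
  uint64_t r = 1;
  for (uint64_t i = 0; i < P - 2; i++) r = mulp(r, a);
  return r;
}

struct Ext2 { uint64_t re, im; }; // re + im*u, with u^2 = BETA

Ext2 mul(Ext2 x, Ext2 y)
{
  return {addp(mulp(x.re, y.re), mulp(BETA, mulp(x.im, y.im))),
          addp(mulp(x.re, y.im), mulp(x.im, y.re))};
}

Ext2 inverse(Ext2 x) // conjugate divided by the norm a^2 - BETA*b^2
{
  uint64_t norm_inv = invp(subp(mulp(x.re, x.re), mulp(BETA, mulp(x.im, x.im))));
  return {mulp(x.re, norm_inv), mulp(subp(0, x.im), norm_inv)};
}

int main()
{
  for (uint64_t a = 0; a < P; a++)
    for (uint64_t b = 0; b < P; b++) {
      if (a == 0 && b == 0) continue;
      Ext2 prod = mul(Ext2{a, b}, inverse(Ext2{a, b}));
      assert(prod.re == 1 && prod.im == 0);
    }
  puts("x * inverse(x) == 1 for every nonzero x in F_17[u]/(u^2 - 3)");
  return 0;
}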
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
ExtensionField<CONFIG>* getPointer()
template <typename CONFIG, class T>
struct SharedMemory<ExtensionField<CONFIG, T>> {
__device__ ExtensionField<CONFIG, T>* getPointer()
{
ExtensionField<CONFIG> *s_ext2_scalar_;
extern __shared__ ExtensionField<CONFIG, T> s_ext2_scalar_[];
return s_ext2_scalar_;
}
};
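The SharedMemory specialization above hands out an extern __shared__ array, which is CUDA's mechanism for dynamic shared memory: the array is declared without a size in device code and the byte count is supplied as the third kernel-launch parameter. A minimal standalone sketch of that pattern follows; the names (Smem, scale_kernel) are illustrative and not taken from the library.

// Minimal sketch of the extern __shared__ (dynamic shared memory) pattern.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
struct Smem {
  __device__ T* getPointer()
  {
    extern __shared__ unsigned char smem_bytes[]; // sized at kernel launch
    return reinterpret_cast<T*>(smem_bytes);
  }
};

__global__ void scale_kernel(float* data, int n)
{
  float* tile = Smem<float>().getPointer(); // one float per thread in this block
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) tile[threadIdx.x] = data[i] * 2.0f; // stage in shared memory
  __syncthreads();
  if (i < n) data[i] = tile[threadIdx.x];
}

int main()
{
  const int n = 256;
  float host[n];
  for (int i = 0; i < n; i++) host[i] = float(i);
  float* dev = nullptr;
  cudaMalloc(&dev, n * sizeof(float));
  cudaMemcpy(dev, host, n * sizeof(float), cudaMemcpyHostToDevice);
  // third launch argument = dynamic shared memory bytes per block
  scale_kernel<<<1, n, n * sizeof(float)>>>(dev, n);
  cudaMemcpy(host, dev, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("host[3] = %f (expect 6.0)\n", host[3]);
  cudaFree(dev);
  return 0;
}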



@@ -1,14 +1,14 @@
#pragma once
#include "field.cuh"
#include "../gpu-utils/modifiers.cuh"
#include "../gpu-utils/sharedmem.cuh"
#include "gpu-utils/modifiers.cuh"
#include "gpu-utils/sharedmem.cuh"
template <typename CONFIG>
template <typename CONFIG, class T>
class ExtensionField
{
private:
typedef typename Field<CONFIG>::Wide FWide;
typedef typename T::Wide FWide;
struct ExtensionWide {
FWide real;
@@ -16,19 +16,19 @@ private:
FWide im2;
FWide im3;
friend ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
{
return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
};
public:
typedef Field<CONFIG> FF;
typedef T FF;
static constexpr unsigned TLC = 4 * CONFIG::limbs_count;
FF real;
@@ -36,31 +36,30 @@ public:
FF im2;
FF im3;
static constexpr ExtensionField zero()
static constexpr HOST_DEVICE_INLINE ExtensionField zero()
{
return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr ExtensionField one()
static constexpr HOST_DEVICE_INLINE ExtensionField one()
{
return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
}
static constexpr ExtensionField to_montgomery(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
xs.im3 * FF{CONFIG::montgomery_r}};
FF::to_montgomery(xs.real), FF::to_montgomery(xs.im1), FF::to_montgomery(xs.im2), FF::to_montgomery(xs.im3)};
}
static constexpr ExtensionField from_montgomery(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs)
{
return ExtensionField{
xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
FF::from_montgomery(xs.real), FF::from_montgomery(xs.im1), FF::from_montgomery(xs.im2),
FF::from_montgomery(xs.im3)};
}
static ExtensionField rand_host()
static HOST_INLINE ExtensionField rand_host()
{
return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
}
@@ -72,7 +71,7 @@ public:
}
template <unsigned REDUCTION_SIZE = 1>
static constexpr ExtensionField sub_modulus(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
{
return ExtensionField{
FF::sub_modulus<REDUCTION_SIZE>(&xs.real), FF::sub_modulus<REDUCTION_SIZE>(&xs.im1),
@@ -86,38 +85,38 @@ public:
return os;
}
friend ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
}
friend ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
{
return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
}
friend ExtensionField operator+(FF xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
}
friend ExtensionField operator-(FF xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
{
return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
}
friend ExtensionField operator+(ExtensionField xs, const FF& ys)
friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
}
friend ExtensionField operator-(ExtensionField xs, const FF& ys)
friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
{
return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
{
if (CONFIG::nonresidue_is_negative)
return ExtensionWide{
@@ -144,21 +143,21 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
{
return ExtensionWide{
FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
{
return ExtensionWide{
FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField reduce(const ExtensionWide& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
{
return ExtensionField{
FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
@@ -166,21 +165,21 @@ public:
}
template <class T1, class T2>
friend ExtensionField operator*(const T1& xs, const T2& ys)
friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
{
ExtensionWide xy = mul_wide(xs, ys);
return reduce(xy);
}
friend bool operator==(const ExtensionField& xs, const ExtensionField& ys)
friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
{
return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
}
friend bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
static constexpr ExtensionField mul_unsigned(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
{
return {
FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
@@ -188,27 +187,27 @@ public:
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionWide sqr_wide(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return mul_wide<MODULUS_MULTIPLE>(xs, xs);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField sqr(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
{
// TODO: change to a more efficient squaring
return xs * xs;
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr ExtensionField neg(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
{
return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
}
// inverse of zero is set to be zero which is what we want most of the time
static constexpr ExtensionField inverse(const ExtensionField& xs)
static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
{
FF x, x0, x2;
if (CONFIG::nonresidue_is_negative) {
@@ -247,11 +246,11 @@ public:
}
};
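As a companion to the mul_wide hunks above: schoolbook multiplication in a degree-4 extension F_p[u]/(u^4 - beta) multiplies the two coefficient vectors as polynomials and folds the u^4, u^5, u^6 terms back down using u^4 = beta. The toy sketch below uses that generic construction over a small prime; the library's actual tower and nonresidue handling (including the nonresidue_is_negative branch) come from CONFIG and may differ.

// Degree-4 extension multiplication in F_p[u]/(u^4 - BETA): schoolbook product, then fold.
// Toy values only; not the library's implementation.
#include <array>
#include <cstdint>
#include <cstdio>

constexpr uint64_t P = 17, BETA = 3;

uint64_t addp(uint64_t a, uint64_t b) { return (a + b) % P; }
uint64_t mulp(uint64_t a, uint64_t b) { return (a * b) % P; }

using Ext4 = std::array<uint64_t, 4>; // a0 + a1*u + a2*u^2 + a3*u^3

Ext4 mul(const Ext4& a, const Ext4& b)
{
  uint64_t c[7] = {0}; // full polynomial product, degrees 0..6
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      c[i + j] = addp(c[i + j], mulp(a[i], b[j]));
  Ext4 r{};
  for (int k = 0; k < 4; k++) // fold degree k+4 into degree k via u^4 = BETA
    r[k] = (k + 4 <= 6) ? addp(c[k], mulp(BETA, c[k + 4])) : c[k];
  return r;
}

int main()
{
  Ext4 x{1, 2, 3, 4}, y{5, 6, 7, 8};
  Ext4 z = mul(x, y);
  printf("(%llu, %llu, %llu, %llu)\n", (unsigned long long)z[0], (unsigned long long)z[1],
         (unsigned long long)z[2], (unsigned long long)z[3]);
  return 0;
}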
template <class CONFIG>
struct SharedMemory<ExtensionField<CONFIG>> {
ExtensionField<CONFIG>* getPointer()
template <class CONFIG, class T>
struct SharedMemory<ExtensionField<CONFIG, T>> {
__device__ ExtensionField<CONFIG, T>* getPointer()
{
ExtensionField<CONFIG> *s_ext4_scalar_=nullptr;
extern __shared__ ExtensionField<CONFIG, T> s_ext4_scalar_[];
return s_ext4_scalar_;
}
};


@@ -3,337 +3,17 @@
#define BLS12_377_BASE_PARAMS_H
#include "fields/storage.cuh"
#include "fields/params_gen.cuh"
namespace bls12_377 {
struct fq_config {
static constexpr unsigned limbs_count = 12;
static constexpr unsigned omegas_count = 48;
static constexpr unsigned modulus_bit_count = 377;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<limbs_count> modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44,
0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x0a118000, 0x60000001, 0x2e16ba88,
0x74129000, 0x3de6c45f, 0x01ea271e, 0x3445b3e6,
0xd9429276, 0x8c760b80, 0x2f8a21d5, 0x035c748c};
static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x14230000, 0xc0000002, 0x5c2d7510,
0xe8252000, 0x7bcd88be, 0x03d44e3c, 0x688b67cc,
0xb28524ec, 0x18ec1701, 0x5f1443ab, 0x06b8e918};
static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0x7af73fff, 0xcfffffff, 0xe8f4a2bb,
0x45f6b7ff, 0xe10c9dd0, 0xff0aec70, 0xe5dd260c,
0x935eb6c4, 0x39c4fa3f, 0xe83aef15, 0xfe51c5b9};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00000001, 0x0a118000, 0xf0000001, 0x7338d254, 0x2e1bd800, 0x4ada268f, 0x35f1c09a, 0x6bcbfbd2,
0x58638c9d, 0x318324b9, 0x8bb70ae0, 0x460aaaaa, 0x502a4d6c, 0xc014e712, 0xb90660cd, 0x09d018af,
0x3dda4d5c, 0x1f5e7141, 0xa4aee93f, 0x4bb8b87d, 0xb361263c, 0x2256913b, 0xd0bbaffb, 0x0002d307};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x00000002, 0x14230000, 0xe0000002, 0xe671a4a9, 0x5c37b000, 0x95b44d1e, 0x6be38134, 0xd797f7a4,
0xb0c7193a, 0x63064972, 0x176e15c0, 0x8c155555, 0xa0549ad8, 0x8029ce24, 0x720cc19b, 0x13a0315f,
0x7bb49ab8, 0x3ebce282, 0x495dd27e, 0x977170fb, 0x66c24c78, 0x44ad2277, 0xa1775ff6, 0x0005a60f};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00000004, 0x28460000, 0xc0000004, 0xcce34953, 0xb86f6001, 0x2b689a3c, 0xd7c70269, 0xaf2fef48,
0x618e3275, 0xc60c92e5, 0x2edc2b80, 0x182aaaaa, 0x40a935b1, 0x00539c49, 0xe4198337, 0x274062be,
0xf7693570, 0x7d79c504, 0x92bba4fc, 0x2ee2e1f6, 0xcd8498f1, 0x895a44ee, 0x42eebfec, 0x000b4c1f};
static constexpr storage<limbs_count> m = {0x5e4daffc, 0x1f9fd58c, 0x89c42a59, 0xd0ed6877, 0xd85a6d02, 0x6af2d488,
0x6776b1a0, 0x3bbad0de, 0x582ef4f7, 0x976c3ca0, 0x0cc4060e, 0x0261508d};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xffffff68, 0x02cdffff, 0x7fffffb1, 0x51409f83,
0x8a7d3ff2, 0x9f7db3a9, 0x6e7c6305, 0x7b4e97b7,
0x803c84e8, 0x4cf495bf, 0xe2fdf49a, 0x008d6661};
static constexpr storage<limbs_count> montgomery_r_inv = {0x451269e8, 0xef129093, 0xe65839f5, 0x6e20bbcd,
0xa5582c93, 0x852e3c88, 0xf7f2e657, 0xeeaaf41d,
0xa4c49351, 0xeb89746c, 0x436b0736, 0x014212fc};
static constexpr storage<12> modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f,
0x00f5138f, 0x1a22d9f3, 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
PARAMS(modulus)
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b,
0xc63b05c0, 0x17c510ea, 0x01ae3a46},
{0xf1391c63, 0x6e76d5ec, 0xbff27d8e, 0x99588459, 0x436b0f62, 0xbce649cf, 0x0ad1dec1, 0x400398f5, 0x1a79beb1,
0xc0c534db, 0x796537ca, 0x01680a40},
{0x554c85ba, 0x6cbff0e3, 0x0be8ff9d, 0xc07c7a91, 0x9dde4fa2, 0xc3c79f67, 0xb5726bde, 0x44bc6d1a, 0x76d6d607,
0xad812919, 0x95e8fd0e, 0x001bc0c2},
{0x6d5db237, 0xb8c206b0, 0xcabde6ba, 0x08fed85d, 0xcd92eb6f, 0xf2f54ffc, 0xe39c1788, 0xee81121f, 0x88e82edb,
0x852def4d, 0xb95fdb80, 0x00bf1268},
{0x192bf14f, 0x3663c26a, 0xe6351854, 0x99c859be, 0x159361b8, 0xf9430828, 0xfbe33d7d, 0x478ed715, 0xdb79c984,
0x41e220cf, 0xd961f2be, 0x00cedb38},
{0xcc724685, 0xb99caa69, 0x1388a46d, 0xc24087ba, 0x08f03491, 0xeb13a05a, 0x98fb0ff7, 0x558ab21e, 0x86bbd802,
0x0166d08d, 0xf5b5728a, 0x00d1dec9},
{0x92db32a2, 0x2e3951fe, 0x6014b201, 0x8f5a16c9, 0xa91fbb38, 0xa9e942b9, 0x17b4dbd2, 0xf7bf5b43, 0x81325c7d,
0x57f3934a, 0x615ad019, 0x012be78e},
{0xdce33f04, 0xb42b84a2, 0x0db0b91c, 0x7a0c1423, 0x88d9f8c8, 0xaed11a0c, 0xd484c501, 0x712d6bc0, 0xfa3f7633,
0x50aca1e5, 0xb90f34d0, 0x01002f29},
{0xf012f6a0, 0xbc3db054, 0x0d332ea7, 0x00d66897, 0xfd416167, 0x8278ef44, 0x20268e84, 0x1a1a3c4d, 0x4b11d215,
0x7c976aa6, 0x63b6e925, 0x00949581},
{0x339637c6, 0x9d73cf29, 0xa5642677, 0x8257d1a2, 0xcafd597c, 0xcb48f07f, 0x081435a3, 0x7a505010, 0xacbb9c39,
0xaaa45ce1, 0x7431b9c8, 0x013f2b13},
{0xd4710c0b, 0x9ef8bddb, 0x85047671, 0xb4c73188, 0x134695ba, 0x87a51d65, 0x022416dd, 0x67f3bc43, 0xcb2a157b,
0x21d965b2, 0x5ce4195d, 0x013a57e4},
{0xd2461368, 0xf2db3a9f, 0x3802aef2, 0x0595c232, 0x5ea85bd6, 0xa53d621a, 0xa34ee943, 0xce930fbc, 0x6b372bee,
0x1d216665, 0xa4535740, 0x009f0159},
{0x656bf68d, 0x73cf953a, 0xeac5c1d7, 0x50a5a5b5, 0xaa5355a9, 0x2697b2e1, 0x08de37d2, 0x6be70306, 0x44c5afab,
0x907f6976, 0xd4ec46b1, 0x0155cfa2},
{0x090e3e20, 0x034160c4, 0xf77a6fbb, 0xbc73cc59, 0x188e54f6, 0x437cd23b, 0x17e42614, 0x5a788edd, 0xebdc8eae,
0xf1ad4f54, 0x2f129bcd, 0x005d1440},
{0x4e269ee5, 0x5626c031, 0x0d1501ec, 0x5f97673e, 0x86d31c18, 0x4fe089bd, 0x62d1259a, 0x3e9fffcb, 0x1ff89d01,
0xe1898f32, 0x59d01a38, 0x00fa1331},
{0x38d427b1, 0xda80661b, 0xa814f14b, 0x1913027d, 0xcda4061d, 0xd3f61e24, 0x5da8fcb2, 0x9509e69d, 0x1f05e6d3,
0x0e7493a5, 0xa5c6bd06, 0x00dcb8db},
{0x61cff9ed, 0x88499d0a, 0x53718444, 0x0b317da2, 0x4b7eec5f, 0xc1624bfd, 0x5af10e6f, 0x6ffc3241, 0xd6c66ff2,
0x27d0edf3, 0x73ab0f4a, 0x013019b5},
{0x06027b24, 0x42dc7673, 0x3341b9e7, 0x018f8bbd, 0xa435f7e2, 0xd3b389d9, 0xea031176, 0x279739a5, 0x74c35801,
0x3555ca51, 0x049dcf87, 0x00748c30},
{0x81fe14de, 0x731b16f0, 0x333cc61a, 0x528d6ada, 0x5736dc15, 0x7ae87278, 0xc8bfd40c, 0xa94b9fd2, 0x299b0487,
0x714dd8ed, 0xf1a53233, 0x00642b62},
{0x5bc45170, 0x31270ddf, 0x7f72c758, 0x7efb6b06, 0xcf4973a8, 0x2eb9f2aa, 0xe556d234, 0xdcb534c9, 0x0e043fef,
0xf0b1a210, 0x54dda04e, 0x00e79c44},
{0x2d5f1bc2, 0x213b3f52, 0xfd933428, 0x9e115ba7, 0x434c9e2a, 0x7f77d57e, 0xcdb944ef, 0x47a78418, 0x699aa559,
0x8cb01cbb, 0xb064c4d7, 0x0075bf81},
{0x3fbfc66c, 0x0b6c2e65, 0x6fcab2f8, 0x7bece031, 0xb79dcd4d, 0x2ba7e325, 0xa5c6881b, 0x8c18f66a, 0x7283805a,
0x4d893e5a, 0xfc296bfe, 0x0107d3c5},
{0x948c881a, 0x53fbdbb4, 0x16803d18, 0xf27a9c14, 0xeddfafef, 0x8490f6c5, 0x3e57fa15, 0xfe068e1d, 0xd26b296b,
0xbe923119, 0x9fa377a1, 0x00d56016},
{0x6f5b2ad1, 0xb3bbaeb3, 0x11886a1c, 0x0efd4ba9, 0xdedb7083, 0x5911498f, 0x5bd0a90f, 0x0921fe19, 0x83d379cb,
0x38e05d4e, 0xb7ba3c73, 0x006b39e2},
{0xa55550ba, 0x61b560e4, 0xe7288461, 0xd9ac545b, 0xc6e3e282, 0xde8d2826, 0x7e49dd2c, 0x9e87a310, 0xc43080b7,
0xf2edfc44, 0x95b7d300, 0x012b4875},
{0x27591e60, 0x4048ddc3, 0xc5d21791, 0xb77c9738, 0x49826bea, 0xf2f82033, 0x42f97e95, 0xf60bb703, 0x5966139d,
0xef8f6f16, 0xc0e95e39, 0x00327618},
{0x441e395f, 0xf9059c8f, 0xbd087238, 0x29eab35f, 0x7dee5ff1, 0x5d4abeff, 0x771e60e9, 0x7222499b, 0x7ac324a2,
0xb70c1ea3, 0x0da51ce8, 0x015b3af9},
{0xe9a70026, 0xf7aa576b, 0x01c4a126, 0xb28733ef, 0xa3307647, 0x06b8e768, 0xe12588ce, 0x115500e1, 0x6c9f9b1d,
0x7e8dd6b9, 0x6ec020b3, 0x014d091e},
{0x8e5bbc8d, 0xd318265d, 0x141bee9b, 0x70b460ba, 0x1aa9df5b, 0x145dd6a6, 0xe3478cb3, 0xd9da2548, 0x7b509387,
0x47250509, 0xe967973c, 0x00de53d3},
{0xd2aa57b8, 0x5ff4399c, 0xa6ae9b07, 0x90360194, 0x6cfcdb7a, 0x68979991, 0x64e56abb, 0xf517467c, 0xad7a6573,
0x44227491, 0xa35ebf55, 0x0001da0b},
{0x4d80f6da, 0xd8b22d5a, 0x10ee1a06, 0x6e7b2bfb, 0x17faeac0, 0xac8d97e5, 0x7a12c923, 0x8b75540b, 0x5b42ce02,
0xa2787368, 0xe98d9998, 0x008d30a5},
{0x9dc292bb, 0xee29c02a, 0xc5b7e1c9, 0x9e7ea016, 0x9a908e5f, 0x62daf95d, 0x3e98eae9, 0x80a71c61, 0xfdda3bba,
0x2d514723, 0x068ef829, 0x00f65844},
{0x185b1ad6, 0xf62fdfa4, 0xf90ccbe6, 0x2ae7f104, 0x972ce78e, 0xfa435fb6, 0x45e59f91, 0x53a75d3c, 0x2f320b7a,
0x7290cac2, 0xe7cb5108, 0x01a2022a},
{0xd59dda24, 0xcf0a15be, 0xf2ec72b4, 0xbc77f6d4, 0x96c31202, 0xa8df0caf, 0xbb4f8842, 0xb95429c0, 0xd0087306,
0xb989b210, 0x5571e9f0, 0x002b1694},
{0x67ae536e, 0x7e84d4b5, 0xc8fb9b80, 0x3a920871, 0x1948ee86, 0x1a82df2b, 0xb3c66ed3, 0xdef79467, 0xef64d05a,
0x58fd84f2, 0xd999f400, 0x00c6d5b7},
{0x81ee0d53, 0x7639f9a2, 0xb5747565, 0x8ade807d, 0xe6235609, 0xfd9d6266, 0x53730f18, 0xea1948a3, 0xd890142e,
0xa356108a, 0xe3e8a723, 0x00a48ac6},
{0xd0ca5e04, 0x531c4b83, 0x2ba0a328, 0xff35ced6, 0xa4e563aa, 0x01613079, 0x1442dcd1, 0x6f52b3a3, 0x9e19b0a6,
0x813b4616, 0x9536db26, 0x004828c5},
{0x0bce1b4e, 0x8a9321a9, 0xae85d6ff, 0xb9759dbe, 0x5cb206e0, 0x1ce1d522, 0x35a1607a, 0x87df044f, 0x94e1329a,
0x2ebabee7, 0x73586cc9, 0x01a73170},
{0x3dd667f3, 0x69824754, 0x28fd63a2, 0x61a081a7, 0x99499385, 0x0b9f6d2e, 0x5c253e16, 0x6d45622b, 0x765a7f5f,
0xcd672e4d, 0x7150d847, 0x01182798},
{0x2742d2f6, 0x0af0bfd2, 0x3a02631d, 0x93616956, 0xac8a2203, 0x32dae751, 0x85cf4e2d, 0xea4ffbe7, 0x7dba6eb9,
0x673424f4, 0x61f4060d, 0x002ec230},
{0x5a5b5c2b, 0x226293ca, 0x0684dbc9, 0xbc0ca23e, 0x7d637c4f, 0x4510cf3a, 0x9b2f4a52, 0x7869c488, 0x2fd73a53,
0xec009b90, 0xa8c99cca, 0x003499d6},
{0xfd745afc, 0x9da60b0a, 0x41c5362e, 0xff0769ec, 0xfa9fd8ee, 0x487621e9, 0xab04558f, 0x138910d1, 0xc1ed03ce,
0x870903cf, 0xed3ffb51, 0x002c1cfa},
{0x42870c46, 0x271b1ff3, 0x13b4b491, 0x1e0a9cd1, 0x3c55c65e, 0x2d58cb1a, 0x74756f6e, 0xa6e12c32, 0x2e313bc4,
0xf774a43d, 0xcc386ffc, 0x00ca156d},
{0x4a67741c, 0x588f79b6, 0xc3590b63, 0xc0ae78b5, 0xc3576385, 0xad0bb97d, 0xb8473137, 0x0583dd49, 0x515d8604,
0xb31d9631, 0xd3ba3b12, 0x015337bc},
{0x8a458e8c, 0x976a14f5, 0xc3a26ae8, 0xc90809b4, 0x089acf15, 0x270a1575, 0x5013d4b1, 0x614a0d25, 0x6d09901e,
0x1314e076, 0xf208945e, 0x0022f414},
{0xc563b9a1, 0x7eca603c, 0x06fe0bc3, 0x06df0a43, 0x0ddff8c6, 0xb44d994a, 0x4512a3d4, 0x40fbe05b, 0x8aeffc9b,
0x30f15248, 0x05198a80, 0x0036a92e}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x00000000, 0x8508c000, 0x30000000, 0x170b5d44, 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3, 0x6ca1493b,
0xc63b05c0, 0x17c510ea, 0x01ae3a46},
{0x0ec6e39e, 0x1691ea13, 0x700d8272, 0x7db2d8ea, 0x769e389d, 0x620d1860, 0xf62334cd, 0xda1f40fd, 0x52278a89,
0x0575d0e5, 0x9e5fd920, 0x00463005},
{0x93997f11, 0x9403412c, 0xdfb2323f, 0x845557b3, 0x2d50c7fc, 0x66f2eaaa, 0xc103f92f, 0x992358fb, 0x5d7a3179,
0x01d60217, 0xd2af5da0, 0x0077b354},
{0xc1000ea4, 0x7ac2ca7a, 0x7f8d9495, 0x937db751, 0x0de62931, 0x401b3873, 0x980129ba, 0x59be7529, 0xa545a303,
0x2ba8f85d, 0xb6705512, 0x00573e3a},
{0x2c1b22e6, 0xb55712f9, 0x0f91cddd, 0x66cfc0f3, 0x8bb345d8, 0x8d5fcd42, 0x86c0abc3, 0x61e4cf98, 0x432fe8f3,
0x93556354, 0xad005fb6, 0x00ff87d5},
{0x7aba560e, 0x05065a97, 0x7918b9db, 0x333ff005, 0xdf6be708, 0x03938ae1, 0x7410a77b, 0x922d3376, 0x03a15063,
0xa5aeaa56, 0x4aea89e5, 0x01542cb6},
{0xe4d6a772, 0x61a6a2d6, 0x6e6239a7, 0xc18c9ef7, 0x04cac70f, 0x8772bb3f, 0x16c5916b, 0x8bbb4185, 0x46335dc0,
0x4aa656e2, 0x842c1664, 0x008187ac},
{0xdd4e93c5, 0xa002ea0a, 0x07458704, 0xb40a45e8, 0xbaa65f2a, 0xee9ee3ea, 0x8f3b8a87, 0xeffa4f9e, 0x95b5feba,
0xb6e03897, 0x81751c63, 0x003c41de},
{0x13043a4a, 0x50221a3b, 0xda73331a, 0x6537fca8, 0x8e85077c, 0x8b74cef4, 0x0e5bbe67, 0x65705341, 0xefa22d23,
0xf0f56caa, 0xd1865d98, 0x001f8eb5},
{0x3e26a605, 0xd9af8944, 0x6970166f, 0xad0efb6e, 0x2c7464ec, 0xc16d7972, 0xf788281b, 0xe0de4b04, 0xaa878b0e,
0x0c049e55, 0x63e2e7cd, 0x0135383a},
{0x6f6893f7, 0x6b12c42e, 0x44bbbf63, 0x831f38c0, 0x191be6c9, 0xa57797d4, 0x447475cb, 0x6af7f695, 0x4b8be189,
0x3295e9e7, 0x350d0aad, 0x00a9a32b},
{0x7656ef1d, 0xc2243f86, 0xf4211219, 0x3e4c3bc3, 0x3c9a3d21, 0xaa4db6e0, 0xe8a4c946, 0x29ac638a, 0xa4cf856e,
0x21449f8b, 0x7d4c9c67, 0x018cf097},
{0x6a8e0139, 0x18e472a2, 0xd6b1c835, 0xcc7c80fd, 0x6546fc0a, 0x1f760883, 0x4ea3417c, 0x5bcfc1fb, 0xe9acb8b0,
0x52c9a29b, 0xd9f265a2, 0x01a6d8b2},
{0xebb83ac0, 0x95eb1dc8, 0x9f390cf2, 0x1e8d70f5, 0xb0d85145, 0xf9e4955d, 0x89720ee1, 0xe9690d30, 0x50fc879f,
0x629972a5, 0x69ccd670, 0x00456e23},
{0x83f38be4, 0xfbfb11a1, 0x388e6726, 0xb90a19b9, 0xc860d62c, 0x3fc10bc7, 0xc3c4e575, 0xc9fe043e, 0x7396d780,
0x67aeff74, 0x01cadaee, 0x019059fa},
{0xfd581be8, 0x43506d6e, 0x018b1b76, 0xf09563e6, 0xe87f9d80, 0x5cd193b2, 0x0a933402, 0x18ba3260, 0x50524c77,
0x4de839d9, 0xd90315ce, 0x0018c2ed},
{0xa737701d, 0xf900eb81, 0x995e6672, 0x6874c90e, 0xa495900b, 0x69ade94a, 0xd07bd4b1, 0xd5f358e7, 0x6f88e8e4,
0xbd437e9d, 0x1d6b88cf, 0x0130d706},
{0xfc29b95f, 0x064629bd, 0xb546585c, 0x0a897bff, 0x54a80d9a, 0x856c8d4f, 0x944568ff, 0x85410cc4, 0x59fc4370,
0xc1978c65, 0xc668dc52, 0x017c86c8},
{0xf6109131, 0x65cecd55, 0x7d2f52e5, 0x6d7e892e, 0xb90b2403, 0xe9a09007, 0xae0a060d, 0x92ca9aac, 0xa22b1e96,
0x5ce1cc4f, 0x45201e6f, 0x012eb33c},
{0x20d1aac5, 0x9d2cb4cf, 0xded22997, 0x3e4a1e77, 0x07fae2e2, 0x09d692f7, 0xd49bdcbe, 0x6a6aa4f8, 0x09c01cab,
0xa8e21ead, 0x6b03b72e, 0x01a19e81},
{0x935650ca, 0xf3d94623, 0x2ffd937e, 0x4a688a46, 0xa622b139, 0xf55fd53a, 0x7a1a1e40, 0x227406aa, 0x9a3fea60,
0x40dd4504, 0x1edbb584, 0x00fc2332},
{0xf28db3fc, 0x9707402f, 0xc28593f1, 0x3d898bd7, 0xb30effcd, 0xcaee2dfd, 0x4fb6ec9d, 0xff1b0790, 0x09ed1120,
0x9cb0597e, 0xb78d15e9, 0x005c73a5},
{0xb0a8a3b9, 0x739a4c2e, 0xc57196ae, 0x083bde21, 0xba602f29, 0x247eb070, 0x1c2c7132, 0x4ba1dd6a, 0xe2187c6c,
0x4ce59fb6, 0x606880b1, 0x0014a7b5},
{0x484baf56, 0xdd0eccab, 0x4541b101, 0xe6c80eaf, 0xf7964f64, 0x35b8a558, 0xc50ccf94, 0xb3b824d4, 0x21c71aeb,
0xe1f6b4c8, 0x23031df0, 0x01a8a647},
{0x592a9620, 0x5338dc01, 0xd94a401b, 0xb217f96d, 0xf830b00e, 0xfefb6601, 0xafd3dee4, 0x1ec061b5, 0x05a199bd,
0x0d5d4d3c, 0xc8489913, 0x0196c768},
{0x1f980ca0, 0x4acb430e, 0x71c6821c, 0x8973a3cc, 0xb3e9aa75, 0x74414c20, 0x0c13f042, 0x79212a5f, 0x375c705b,
0x5c44d226, 0x29439af2, 0x000a2fdd},
{0xa387b60c, 0xf01901e6, 0x4561ff3d, 0xa7b1b7dc, 0x0558e085, 0x5d82d374, 0xf2bc1d29, 0x519298e5, 0x3d332207,
0x0ad719a8, 0xea19a807, 0x0150a138},
{0x9deb8e06, 0x7c6b3eb1, 0x28206b6c, 0x3a8f53c4, 0x7fed1065, 0x039f575f, 0x40c1f898, 0x31be74ba, 0x790ac003,
0x76db938e, 0x5508c5e4, 0x0096d5e1},
{0xb83f8358, 0x3e940e0e, 0x372a4b8b, 0x204d80e0, 0xa820b2ec, 0x956454b2, 0x2cc8078c, 0x8e2cb3d4, 0xc6f81363,
0xdd0d3e12, 0x49041a64, 0x0052f327},
{0x2aec0be2, 0x37ca2eb7, 0x555cc652, 0x05093570, 0xd2588d31, 0xe62f1adb, 0x798be240, 0x2fd2518e, 0x0ff6b579,
0x9302d4e3, 0x6ee95e5d, 0x0025ca57},
{0x233eed68, 0xcc664858, 0xece3a327, 0x600ca1ac, 0x93a2e34f, 0x330d1102, 0xdb5e3bb4, 0xc84ab55f, 0xe4d5576e,
0x5179c101, 0x0938f714, 0x00efb20e},
{0xfdddaf5c, 0x907f96e7, 0x1ffe49da, 0x348dab77, 0xc14ab779, 0x3eca44ad, 0x4cdc5d98, 0xe9b10b2e, 0xa95c5a36,
0x65a25d16, 0x6e616518, 0x00c9f759},
{0x7a5aff62, 0x9497d331, 0xb57cd01d, 0x21896195, 0x6c7ba745, 0xe09e22f7, 0x5a7acff0, 0xcc9f1064, 0xc93c46b0,
0x7b867cdf, 0x23eba5ae, 0x01a05dcb},
{0x4dcc71f4, 0xa56a8e33, 0xcbebdba2, 0xc480b083, 0x36ea43af, 0x748448fa, 0xe7859f3c, 0xee9b4b0e, 0x5af41919,
0x9ab2bb09, 0x65caa0ea, 0x0127262d},
{0x352a05cc, 0x77c7d12f, 0xdc7160c9, 0xb91ca5be, 0x5a3feda0, 0x245106da, 0x7669f7cd, 0xfd45012d, 0xdc5489fa,
0xc4774629, 0x2872daa0, 0x00241273},
{0x0d3e0b0b, 0x1838ae6f, 0xff67fc2c, 0x7fcc9b21, 0x23956100, 0xaedca59e, 0x1e79aa4b, 0x572ed634, 0xc7f0673c,
0xaeeda160, 0xc8047256, 0x00360e2c},
{0xe05044f9, 0xec5e4514, 0x7ec9b4ef, 0xe915b7e7, 0x9c4bec48, 0x9fb78cd8, 0xa38d95a3, 0xd7b84113, 0xb86fd119,
0x7be64440, 0xe4f9e70a, 0x009e3a60},
{0xc7435591, 0xc61cc546, 0xe5e94dc4, 0xea99a96f, 0xdb8ff17d, 0x5b10e2b4, 0x3dd0ff10, 0x13f8fb9d, 0xe118b9e9,
0xcbb1c0ce, 0x7ebf8a0d, 0x00b37258},
{0xce5943e7, 0xd44fdb9d, 0x79fa927a, 0xcb7d41ea, 0xdcee72ca, 0x9a4bcebf, 0x11634905, 0x2317799d, 0x584055ac,
0x3f1c302e, 0xdc2d0017, 0x013ef021},
{0xa78a1578, 0x345cb052, 0x5961b8fe, 0x1ed4d48a, 0x74a5e2af, 0x5858e93c, 0x0fd17e9f, 0xaf643f0a, 0x79d94009,
0x61530753, 0xde7b2f53, 0x010a3393},
{0x813925df, 0x548b1d28, 0xca3e79b6, 0xabab3a4e, 0x7e51071a, 0xb3c9c068, 0x6c5fcedb, 0x8014e879, 0x95d9facc,
0x3ba5db77, 0x7f5c3d2f, 0x0105c419},
{0x26bc1104, 0xbb9cbd28, 0xe03cc852, 0x27f09abb, 0x22e5be61, 0x02763b4a, 0xb94fa254, 0xa3940542, 0xff34c35f,
0xcf058850, 0x1482533c, 0x019f538f},
{0xb3f42de9, 0xf2126047, 0xbeb0a1b8, 0xdb0451c4, 0x9aabc291, 0x1a945bc0, 0x7fe3a6f2, 0x13d08312, 0x390e1c07,
0xd8fb13f1, 0x6b30562b, 0x005a41c4},
{0xe8b3d5dd, 0x1c60fcc5, 0x75b3a464, 0x5d7babba, 0xf3989910, 0x0d9f52c7, 0x9beec571, 0x464a2840, 0x79689d4b,
0x139c496f, 0x099e64c4, 0x0022c6a3},
{0x023e0cd1, 0x9df6c2d5, 0xa6b747de, 0x8e23def9, 0x90da6876, 0x7bc83eee, 0xc88bb007, 0xdaeac352, 0x68bb6a7f,
0x45cabb6f, 0x94697b34, 0x001e7154},
{0x0203d905, 0xffcee91d, 0xc99df56d, 0xd878ee01, 0x210d754c, 0xa0e882f9, 0x7d0aec6a, 0x26c96db8, 0x8ff7afe4,
0x46e2e145, 0x54749283, 0x015cd1b0}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x00000001, 0x42846000, 0x18000000, 0x0b85aea2, 0xdd04a400, 0x8f79b117, 0x807a89c7, 0x8d116cf9, 0x3650a49d,
0x631d82e0, 0x0be28875, 0x00d71d23},
{0x00000001, 0x63c69000, 0x24000000, 0x114885f3, 0xcb86f600, 0x573689a3, 0x40b7ceab, 0x539a2376, 0x5178f6ec,
0x14ac4450, 0x91d3ccb0, 0x0142abb4},
{0x00000001, 0x7467a800, 0xaa000000, 0x1429f19b, 0xc2c81f00, 0x3b14f5e9, 0xa0d6711d, 0xb6de7eb4, 0x5f0d2013,
0x6d73a508, 0x54cc6ecd, 0x017872fd},
{0x00000001, 0x7cb83400, 0xed000000, 0x159aa76f, 0xbe68b380, 0x2d042c0c, 0xd0e5c256, 0x6880ac53, 0x65d734a7,
0x19d75564, 0xb648bfdc, 0x019356a1},
{0x00000001, 0x80e07a00, 0x0e800000, 0x1653025a, 0x3c38fdc0, 0xa5fbc71e, 0x68ed6af2, 0x4151c323, 0x693c3ef1,
0x70092d92, 0xe706e863, 0x01a0c873},
{0x00000001, 0x82f49d00, 0x1f400000, 0x16af2fcf, 0xfb2122e0, 0xe27794a6, 0x34f13f40, 0x2dba4e8b, 0x6aeec416,
0x1b2219a9, 0xff65fca7, 0x01a7815c},
{0x00000001, 0x83feae80, 0xa7a00000, 0x16dd4689, 0x5a953570, 0x00b57b6b, 0x1af32968, 0xa3ee943f, 0xebc806a8,
0xf0ae8fb4, 0x8b9586c8, 0x01aaddd1},
{0x00000001, 0x8483b740, 0xebd00000, 0x16f451e6, 0x8a4f3eb8, 0x8fd46ecd, 0x0df41e7b, 0xdf08b719, 0xac34a7f1,
0xdb74caba, 0xd1ad4bd9, 0x01ac8c0b},
{0x00000001, 0x84c63ba0, 0x8de80000, 0x16ffd795, 0xa22c435c, 0x5763e87e, 0x07749905, 0x7c95c886, 0x8c6af896,
0x50d7e83d, 0xf4b92e62, 0x01ad6328},
{0x00000001, 0x84e77dd0, 0xdef40000, 0x17059a6c, 0x2e1ac5ae, 0x3b2ba557, 0x8434d64a, 0xcb5c513c, 0xfc8620e8,
0x8b8976fe, 0x863f1fa6, 0x01adceb7},
{0x00000001, 0x84f81ee8, 0x877a0000, 0x17087bd8, 0x741206d7, 0xad0f83c3, 0xc294f4ec, 0xf2bf9597, 0xb493b511,
0xa8e23e5f, 0xcf021848, 0x01ae047e},
{0x00000001, 0x85006f74, 0x5bbd0000, 0x9709ec8e, 0x970da76b, 0xe60172f9, 0x61c5043d, 0x867137c5, 0x109a7f26,
0xb78ea210, 0x73639499, 0x01ae1f62},
{0x00000001, 0x850497ba, 0x45de8000, 0xd70aa4e9, 0xa88b77b5, 0x827a6a94, 0x315d0be6, 0xd04a08dc, 0x3e9de430,
0x3ee4d3e8, 0x459452c2, 0x01ae2cd4},
{0x00000001, 0x8506abdd, 0xbaef4000, 0xf70b0116, 0x314a5fda, 0xd0b6e662, 0x99290fba, 0xf5367167, 0x559f96b5,
0x828fecd4, 0x2eacb1d6, 0x01ae338d},
{0x80000001, 0x8507b5ee, 0x7577a000, 0x870b2f2d, 0xf5a9d3ed, 0xf7d52448, 0x4d0f11a4, 0x87aca5ad, 0x61206ff8,
0xa465794a, 0xa338e160, 0x01ae36e9},
{0x40000001, 0x85083af7, 0xd2bbd000, 0xcf0b4638, 0x57d98df6, 0x0b64433c, 0x2702129a, 0xd0e7bfd0, 0x66e0dc99,
0xb5503f85, 0xdd7ef925, 0x01ae3897},
{0xa0000001, 0x85087d7b, 0x815de800, 0x730b51be, 0x08f16afb, 0x952bd2b6, 0x93fb9314, 0x75854ce1, 0xe9c112ea,
0x3dc5a2a2, 0xfaa20508, 0x01ae396e},
{0xd0000001, 0x85089ebd, 0x58aef400, 0xc50b5781, 0xe17d597d, 0xda0f9a72, 0x4a785351, 0xc7d4136a, 0xab312e12,
0x82005431, 0x89338af9, 0x01ae39da},
{0xe8000001, 0x8508af5e, 0xc4577a00, 0xee0b5a62, 0x4dc350be, 0x7c817e51, 0xa5b6b370, 0xf0fb76ae, 0x0be93ba6,
0x241dacf9, 0x507c4df2, 0x01ae3a10},
{0x74000001, 0x8508b7af, 0x7a2bbd00, 0x828b5bd3, 0x83e64c5f, 0xcdba7040, 0xd355e37f, 0x058f2850, 0xbc454271,
0x752c595c, 0x3420af6e, 0x01ae3a2b},
{0xba000001, 0x8508bbd7, 0xd515de80, 0xcccb5c8b, 0x1ef7ca2f, 0x7656e938, 0xea257b87, 0x0fd90121, 0x947345d6,
0x9db3af8e, 0xa5f2e02c, 0x01ae3a38},
{0xdd000001, 0x8508bdeb, 0x028aef40, 0xf1eb5ce8, 0xec808917, 0x4aa525b3, 0x758d478b, 0x94fded8a, 0x808a4788,
0xb1f75aa7, 0x5edbf88b, 0x01ae3a3f},
{0xee800001, 0x8508bef5, 0x194577a0, 0x047b5d16, 0xd344e88c, 0x34cc43f1, 0xbb412d8d, 0xd79063be, 0xf695c861,
0x3c193033, 0xbb5084bb, 0x01ae3a42},
{0xf7400001, 0x8508bf7a, 0x24a2bbd0, 0x0dc35d2d, 0xc6a71846, 0x29dfd310, 0xde1b208e, 0x78d99ed8, 0x319b88ce,
0x012a1afa, 0x698acad3, 0x01ae3a44},
{0x7ba00001, 0x8508bfbd, 0xaa515de8, 0x12675d38, 0x40583023, 0xa4699aa0, 0xef881a0e, 0xc97e3c65, 0x4f1e6904,
0xe3b2905d, 0x40a7edde, 0x01ae3a45},
{0xbdd00001, 0x8508bfde, 0x6d28aef4, 0x94b95d3e, 0xfd30bc11, 0xe1ae7e67, 0x783e96ce, 0xf1d08b2c, 0xdddfd91f,
0xd4f6cb0e, 0xac367f64, 0x01ae3a45},
{0x5ee80001, 0x8508bfef, 0x4e94577a, 0xd5e25d41, 0xdb9d0208, 0x0050f04b, 0xbc99d52f, 0x85f9b28f, 0xa540912d,
0xcd98e867, 0xe1fdc827, 0x01ae3a45},
{0xaf740001, 0x8508bff7, 0xbf4a2bbd, 0x7676dd42, 0xcad32504, 0x0fa2293d, 0x5ec7745f, 0x500e4641, 0x08f0ed34,
0x49e9f714, 0xfce16c89, 0x01ae3a45},
{0xd7ba0001, 0x0508bffb, 0x77a515df, 0x46c11d43, 0xc26e3682, 0x174ac5b6, 0x2fde43f7, 0xb518901a, 0x3ac91b37,
0x08127e6a, 0x0a533eba, 0x01ae3a46},
{0xebdd0001, 0xc508bffd, 0xd3d28aef, 0x2ee63d43, 0x3e3bbf41, 0x1b1f13f3, 0x9869abc3, 0x679db506, 0x53b53239,
0x6726c215, 0x110c27d2, 0x01ae3a46},
{0xf5ee8001, 0x2508bffe, 0x01e94578, 0xa2f8cd44, 0x7c2283a0, 0x1d093b11, 0xccaf5fa9, 0x40e0477c, 0xe02b3dba,
0x96b0e3ea, 0x14689c5e, 0x01ae3a46},
{0x7af74001, 0x5508bfff, 0x18f4a2bc, 0x5d021544, 0x9b15e5d0, 0x1dfe4ea0, 0xe6d2399c, 0xad8190b7, 0xa666437a,
0xae75f4d5, 0x1616d6a4, 0x01ae3a46},
{0xbd7ba001, 0x6d08bfff, 0x247a515e, 0x3a06b944, 0x2a8f96e8, 0x9e78d868, 0x73e3a695, 0xe3d23555, 0x0983c65a,
0xba587d4b, 0x16edf3c7, 0x01ae3a46},
{0xdebdd001, 0x7908bfff, 0x2a3d28af, 0x28890b44, 0xf24c6f74, 0x5eb61d4b, 0x3a6c5d12, 0xfefa87a4, 0xbb1287ca,
0x4049c185, 0x17598259, 0x01ae3a46},
{0xef5ee801, 0xff08bfff, 0x2d1e9457, 0x1fca3444, 0xd62adbba, 0xbed4bfbd, 0x9db0b850, 0x0c8eb0cb, 0x13d9e883,
0x034263a3, 0x178f49a2, 0x01ae3a46},
{0xf7af7401, 0x4208bfff, 0x2e8f4a2c, 0x1b6ac8c4, 0xc81a11dd, 0xeee410f6, 0x4f52e5ef, 0x1358c55f, 0xc03d98df,
0x64beb4b1, 0x17aa2d46, 0x01ae3a46},
{0xfbd7ba01, 0x6388bfff, 0x2f47a516, 0x993b1304, 0x4111acee, 0x86ebb993, 0x2823fcbf, 0x16bdcfa9, 0x166f710d,
0x957cdd39, 0x17b79f18, 0x01ae3a46},
{0xfdebdd01, 0x7448bfff, 0x2fa3d28b, 0x58233824, 0x7d8d7a77, 0x52ef8de1, 0x148c8827, 0x187054ce, 0xc1885d24,
0xaddbf17c, 0x17be5801, 0x01ae3a46},
{0xfef5ee81, 0xfca8bfff, 0x2fd1e945, 0xb7974ab4, 0x9bcb613b, 0x38f17808, 0x8ac0cddb, 0x99499760, 0x9714d32f,
0x3a0b7b9e, 0x17c1b476, 0x01ae3a46},
{0xff7af741, 0x40d8bfff, 0x2fe8f4a3, 0xe75153fc, 0x2aea549d, 0x2bf26d1c, 0xc5daf0b5, 0x59b638a9, 0x81db0e35,
0x802340af, 0x17c362b0, 0x01ae3a46},
{0xffbd7ba1, 0xe2f0bfff, 0x2ff47a51, 0xff2e58a0, 0xf279ce4e, 0x2572e7a5, 0x63680222, 0x39ec894e, 0xf73e2bb8,
0xa32f2337, 0x17c439cd, 0x01ae3a46},
{0xffdebdd1, 0x33fcbfff, 0x2ffa3d29, 0x8b1cdaf2, 0xd6418b27, 0xa23324ea, 0xb22e8ad8, 0xaa07b1a0, 0x31efba79,
0x34b5147c, 0x17c4a55c, 0x01ae3a46},
{0xffef5ee9, 0xdc82bfff, 0x2ffd1e94, 0xd1141c1b, 0x48256993, 0xe093438d, 0xd991cf33, 0x621545c9, 0x4f4881da,
0x7d780d1e, 0x17c4db23, 0x01ae3a46},
{0xfff7af75, 0xb0c5bfff, 0xaffe8f4a, 0xf40fbcaf, 0x811758c9, 0x7fc352de, 0x6d437161, 0xbe1c0fde, 0x5df4e58a,
0x21d9896f, 0x17c4f607, 0x01ae3a46},
{0xfffbd7bb, 0x9ae73fff, 0xefff47a5, 0x058d8cf9, 0x1d905065, 0x4f5b5a87, 0xb71c4278, 0xec1f74e8, 0xe54b1762,
0xf40a4797, 0x17c50378, 0x01ae3a46},
{0xfffdebde, 0x0ff7ffff, 0x0fffa3d3, 0x8e4c751f, 0x6bcccc32, 0xb7275e5b, 0xdc08ab03, 0x0321276d, 0x28f6304f,
0xdd22a6ac, 0x17c50a31, 0x01ae3a46}}};
static constexpr storage<12> rou = {0xc563b9a1, 0x7eca603c, 0x06fe0bc3, 0x06df0a43, 0x0ddff8c6, 0xb44d994a,
0x4512a3d4, 0x40fbe05b, 0x8aeffc9b, 0x30f15248, 0x05198a80, 0x0036a92e};
TWIDDLES(modulus, rou)
// nonresidue to generate the extension field
static constexpr uint32_t nonresidue = 5;
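The hunk above replaces the hand-maintained derived constants (modulus_2, modulus_4, neg_modulus, m, montgomery_r, ...) with a single PARAMS(modulus) invocation, so only the modulus, rou, and the nonresidue remain user-specified. As a rough illustration of that kind of compile-time derivation (not the actual macro from params_gen.cuh), doubling a little-endian limb array can be written as a constexpr function and checked against one of the constants that used to be spelled out:

// Sketch: deriving modulus_2 from the modulus limbs at compile time.
// Illustrative only; the real derivation is done by the PARAMS macro.
#include <array>
#include <cstdint>

template <unsigned N>
constexpr std::array<uint32_t, N> double_limbs(const std::array<uint32_t, N>& x)
{
  std::array<uint32_t, N> r{};
  uint64_t carry = 0;
  for (unsigned i = 0; i < N; i++) {
    uint64_t t = (uint64_t)x[i] * 2 + carry; // limb-by-limb shift left by one bit
    r[i] = (uint32_t)t;
    carry = t >> 32;
  }
  return r;
}

// bls12-377 Fq modulus limbs (little-endian), copied from the hunk above.
constexpr std::array<uint32_t, 12> fq_modulus = {0x00000001, 0x8508c000, 0x30000000, 0x170b5d44,
                                                 0xba094800, 0x1ef3622f, 0x00f5138f, 0x1a22d9f3,
                                                 0x6ca1493b, 0xc63b05c0, 0x17c510ea, 0x01ae3a46};
constexpr auto fq_modulus_2 = double_limbs(fq_modulus);
static_assert(fq_modulus_2[0] == 0x00000002 && fq_modulus_2[1] == 0x0a118000,
              "agrees with the modulus_2 table removed in this diff");

int main() { return 0; }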


@@ -4,193 +4,17 @@
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "fields/params_gen.cuh"
namespace bls12_377 {
struct fp_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned omegas_count = 47;
static constexpr unsigned modulus_bit_count = 253;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<8> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
PARAMS(modulus)
static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd,
0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901,
0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b,
0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f,
0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95,
0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
{0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
{0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
{0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
{0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
{0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
{0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
{0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
{0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
{0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
{0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
{0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
{0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
{0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
{0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
{0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
{0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
{0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
{0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
{0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
{0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
{0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
{0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
{0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
{0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
{0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
{0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
{0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
{0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
{0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
{0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
{0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
{0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
{0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
{0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
{0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
{0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
{0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
{0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
{0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
{0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
{0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
{0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
{0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
{0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
{0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
{0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
{0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
{0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
{0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
{0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
{0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
{0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
{0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
{0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
{0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
{0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
{0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
{0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
{0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
{0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
{0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
{0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
{0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
{0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
{0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
{0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
{0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
{0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
{0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
{0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
{0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
{0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
{0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
{0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
{0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
{0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
{0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
{0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
{0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
{0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
{0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
{0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
{0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
{0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
{0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
{0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
{0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
{0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
{0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
{0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
{0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
{0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
{0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06},
{0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2},
{0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08},
{0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33},
{0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9},
{0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93},
{0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9},
{0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab},
{0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85},
{0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1},
{0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8},
{0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003},
{0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1},
{0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007},
{0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3},
{0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08},
{0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3},
{0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309},
{0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433},
{0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9},
{0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513},
{0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539},
{0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b},
{0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555},
{0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559},
{0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c},
{0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d},
{0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
{0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
{0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
{0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
{0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
{0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
{0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
{0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
{0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
{0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
{0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
{0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
{0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
{0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
{0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
{0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
{0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
{0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
{0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
static constexpr storage<8> rou = {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3,
0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6};
TWIDDLES(modulus, rou)
};
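About the omega / omega_inv / inv tables that PARAMS and TWIDDLES now generate for this scalar field: the entries appear to be roots of unity of successive power-of-two orders (the first row is p - 1, a square root of unity, and the last row matches rou), together with the inverses of the corresponding powers of two used when scaling an inverse NTT. Below is a toy version of such a table over the NTT-friendly prime 257, offered only as an illustration of the structure, not as library code.

// Toy power-of-two root-of-unity table over p = 257 (multiplicative group order 256 = 2^8).
#include <cstdint>
#include <cstdio>

constexpr uint64_t P = 257;

uint64_t powp(uint64_t a, uint64_t e)
{
  uint64_t r = 1;
  a %= P;
  while (e) {
    if (e & 1) r = (r * a) % P;
    a = (a * a) % P;
    e >>= 1;
  }
  return r;
}

int main()
{
  const uint64_t rou = 3; // 3 generates the full group of order 256 mod 257
  for (int k = 1; k <= 8; k++) {
    uint64_t omega = powp(rou, 256 >> k);      // root of unity of order exactly 2^k
    uint64_t inv_2k = powp(powp(2, k), P - 2); // inverse of 2^k, as in the inv table
    printf("k=%d  omega=%3llu  omega^(2^k)=%llu  inv(2^k)=%3llu\n", k,
           (unsigned long long)omega, (unsigned long long)powp(omega, 1ull << k),
           (unsigned long long)inv_2k);
  }
  return 0;
}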
/**


@@ -3,54 +3,14 @@
#define BLS12_381_BASE_PARAMS_H
#include "fields/storage.cuh"
#include "fields/params_gen.cuh"
namespace bls12_381 {
struct fq_config {
static constexpr unsigned limbs_count = 12;
static constexpr unsigned modulus_bit_count = 381;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15};
static constexpr storage<2 * limbs_count> modulus_wide = {
0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
static constexpr storage<12> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0,
0xf38512bf, 0x64774b84, 0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
PARAMS(modulus)
// nonresidue to generate the extension field
static constexpr uint32_t nonresidue = 1;
// true if nonresidue is negative
@@ -58,4 +18,4 @@ namespace bls12_381 {
};
} // namespace bls12_381
#endif
#endif
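For bls12_381 the remaining user-specified data is nonresidue = 1 together with the "true if nonresidue is negative" flag commented above, which corresponds to the standard BLS12-381 tower Fq2 = Fq[u]/(u^2 + 1). In the earlier multiplication hunks, that flag is what selects subtraction instead of addition for the beta*b*d term; below is a small hedged sketch of that switch with made-up config parameters, not the library's CONFIG.

// How a nonresidue_is_negative flag typically changes quadratic-extension multiplication.
#include <cstdint>
#include <cstdio>

constexpr uint64_t P = 17;
uint64_t addp(uint64_t a, uint64_t b) { return (a + b) % P; }
uint64_t subp(uint64_t a, uint64_t b) { return (a + P - b) % P; }
uint64_t mulp(uint64_t a, uint64_t b) { return (a * b) % P; }

struct Ext2 { uint64_t re, im; };

template <uint32_t NONRESIDUE, bool NONRESIDUE_IS_NEGATIVE>
Ext2 mul(Ext2 x, Ext2 y)
{
  uint64_t bd = mulp(NONRESIDUE % P, mulp(x.im, y.im));
  // u^2 = +NONRESIDUE or -NONRESIDUE depending on the flag
  uint64_t re = NONRESIDUE_IS_NEGATIVE ? subp(mulp(x.re, y.re), bd) : addp(mulp(x.re, y.re), bd);
  uint64_t im = addp(mulp(x.re, y.im), mulp(x.im, y.re));
  return {re, im};
}

int main()
{
  // With nonresidue = 1 and the negative flag set, u^2 = -1: (0 + 1*u)^2 == -1 mod p.
  Ext2 u{0, 1};
  Ext2 u2 = mul<1, true>(u, u);
  printf("u^2 = (%llu, %llu), expected (%llu, 0)\n",
         (unsigned long long)u2.re, (unsigned long long)u2.im, (unsigned long long)(P - 1));
  return 0;
}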


@@ -4,148 +4,17 @@
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "fields/params_gen.cuh"
namespace bls12_381 {
struct fp_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned omegas_count = 32;
static constexpr unsigned modulus_bit_count = 255;
static constexpr unsigned num_of_reductions = 2;
static constexpr storage<8> modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402,
0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
PARAMS(modulus)
static constexpr storage<limbs_count> modulus = {0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402,
0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753};
static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0xfffffffe, 0xfffcb7fd, 0xa77b4805,
0x1343b00a, 0x6673b010, 0x533afa90, 0xe7db4ea6};
static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0xfffffffc, 0xfff96ffb, 0x4ef6900b,
0x26876015, 0xcce76020, 0xa675f520, 0xcfb69d4c};
static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0x00000000, 0x0001a401, 0xac425bfd,
0xf65e27fa, 0xccc627f7, 0xd66282b7, 0x8c1258ac};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x00000001, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00000001, 0xfffffffe, 0xfffcb7fe, 0xa77e9007, 0x1cdbb005, 0x698ae002, 0x5433f7b8, 0x48aa415e,
0x4aa9c661, 0xc2611f6f, 0x59934a1d, 0x0e9593f9, 0xef2cc20f, 0x520c13db, 0xf4bc2778, 0x347f60f3};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc,
0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00000004, 0xfffffff8, 0xfff2dffb, 0x9dfa401f, 0x736ec016, 0xa62b8008, 0x50cfdee1, 0x22a90579,
0x2aa71985, 0x09847dbd, 0x664d2877, 0x3a564fe5, 0xbcb3083c, 0x48304f6f, 0xd2f09de1, 0xd1fd83cf};
static constexpr storage<limbs_count> m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad,
0xc1f823b4, 0x0e2d772d, 0x7fb78ddf, 0x8d54253b};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xfffffffe, 0x00000001, 0x00034802, 0x5884b7fa,
0xecbc4ff5, 0x998c4fef, 0xacc5056f, 0x1824b159};
static constexpr storage<limbs_count> montgomery_r_inv = {0xfe75c040, 0x13f75b69, 0x09dc705f, 0xab6fca8f,
0x4f77266a, 0x7204078a, 0x30009d57, 0x1bbe8693};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753},
{0x00000000, 0x00010000, 0x76030000, 0xec030002, 0x760304d0, 0x8d51ccce, 0x00000000, 0x00000000},
{0x688bc087, 0x8dd702cb, 0x78eaa4fe, 0xa0328240, 0x98ca5b22, 0xa733b23a, 0x25a31660, 0x3f96405d},
{0x0411fe73, 0x95df4b36, 0xebc1e1bb, 0x1ef4e672, 0x60afca4a, 0x6e92a9c4, 0x753e4fcc, 0x4f2c596e},
{0xba60eaa6, 0x9733f3a6, 0x77487ae7, 0xbd7fdf9c, 0xc8b6cc00, 0xd84f8612, 0x6162ffab, 0x476fa2fb},
{0xac5db47f, 0xd2fc5e69, 0x15d0b8e4, 0xa12a70a6, 0xbc8de5d9, 0x293b1d67, 0x57f86f5e, 0x0e4840ac},
{0xab28e208, 0xb750da4c, 0x3be95635, 0x501dff64, 0xf0b4b276, 0x8cbe2437, 0xa94a946e, 0x07d0c802},
{0x2fe322b8, 0x2cabadec, 0x15412560, 0x752c84f3, 0x1a3b0aef, 0x32a732ae, 0xa33dcbf2, 0x2e95da59},
{0xfe0c65f4, 0x33811ea1, 0x687f28a2, 0x15c1ad4c, 0x42dee7f4, 0xecfbede3, 0x9a5d88b1, 0x1bb46667},
{0x2d010ff9, 0xd58a5af4, 0x570bf109, 0x79efd6b0, 0x6350721d, 0x3ed6d55a, 0x58f43cef, 0x2f27b098},
{0x8c130477, 0x74a1f671, 0xb61e0abe, 0xa534af14, 0x620890d7, 0xeb674a1a, 0xca252472, 0x43527a8b},
{0x7ea8ee05, 0x450d9f97, 0x37d56fc0, 0x565af171, 0x93f9e9ac, 0xe155cb48, 0xc8e9101b, 0x110cebd0},
{0x59a0be92, 0x23c91599, 0x7a027759, 0x87d188ce, 0xcab3c3cc, 0x70491431, 0xb3f7f8da, 0x0ac00eb8},
{0x69583404, 0x13e96ade, 0x5306243d, 0x82c05727, 0x29ca9f2a, 0x77e48bf5, 0x1fe19595, 0x50646ac8},
{0xa97eccd4, 0xe6a354dd, 0x88fbbc57, 0x39929d2e, 0xd6e7b1c8, 0xa22ba63d, 0xf5f07f43, 0x42c22911},
{0xcfc35f7a, 0x137b458a, 0x29c01b06, 0x0caba63a, 0x7a02402c, 0x0409ee98, 0x56aa725b, 0x6709c6cd},
{0x8831e03e, 0x10251f7d, 0x7ff858ec, 0x77d85a93, 0x4fb9ac5c, 0xebe905bd, 0xf8727901, 0x05deb333},
{0xb9009408, 0xbf87b689, 0xdd3ccc96, 0x4f730e7d, 0x4610300c, 0xfd7f05ba, 0x0b8ac903, 0x5ef5e8db},
{0x17cd0c14, 0x64996884, 0x68812f7f, 0xa6728673, 0x22cc3253, 0x2e1d9a19, 0xaa0a1d80, 0x3a689e83},
{0x41144dea, 0x20b53cbe, 0xc2f0fcbd, 0x870c46fa, 0x537d6971, 0x556c35f6, 0x5f686d91, 0x3436287f},
{0x436ba2e7, 0x007e082a, 0x9116e877, 0x67c6630f, 0xfb4460f7, 0x36f8f165, 0x7e7046e0, 0x6eee34d5},
{0xa53a56d1, 0xc5b670ee, 0x53037d7b, 0x127d1f42, 0xa722c2e2, 0x57d4257e, 0x33cbd838, 0x03ae26a3},
{0x76504cf8, 0x1e914848, 0xb63edd02, 0x55bbbf1e, 0x4e55aa02, 0xbcdafec8, 0x2dc0beb0, 0x5145c4cd},
{0x1ab70e2c, 0x5b90153a, 0x75fb0ab8, 0x8deffa31, 0x46900c95, 0xc553ae23, 0x6bd3118c, 0x1d31dcdc},
{0x59a2e8eb, 0x801c894c, 0xe12fc974, 0xbc535c5c, 0x47d39803, 0x95508d27, 0xac5d094f, 0x16d9d3cd},
{0xcca1d8be, 0x810fa372, 0x82e0bfa7, 0xc67b8c28, 0xe2d35bc2, 0xdbb4edf0, 0x5087c995, 0x712d1580},
{0xfd88f133, 0xeb162203, 0xf010ea74, 0xac96c38f, 0xe64cfc70, 0x4307987f, 0x37b7a114, 0x350fe98d},
{0x42f2a254, 0xaba2f518, 0xa71efc0c, 0x4d7f3c3a, 0xd274a80a, 0x97ae418d, 0x5e3e7682, 0x2967385d},
{0x575a0b79, 0x75c55c7b, 0x74a7ded1, 0x3ba4a157, 0xa04fccf3, 0xc3974d73, 0x4a939684, 0x705aba4f},
{0x14ebb608, 0x8409a9ea, 0x66bac611, 0xfad0084e, 0x811c1dfb, 0x04287254, 0x23b30c29, 0x086d072b},
{0x67e4756a, 0xb427c9b3, 0x02ebc38d, 0xc7537fb9, 0xcd6a205f, 0x51de21be, 0x7923597d, 0x6064ab72},
{0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2, 0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x00000000, 0xffffffff, 0xfffe5bfe, 0x53bda402, 0x09a1d805, 0x3339d808, 0x299d7d48, 0x73eda753},
{0x00000001, 0xfffeffff, 0x89fb5bfe, 0x67baa400, 0x939ed334, 0xa5e80b39, 0x299d7d47, 0x73eda753},
{0xae99502e, 0x6037fe81, 0x94b04fd8, 0x8e749036, 0xca86bf65, 0xbabc5aff, 0x5ce11044, 0x1333b22e},
{0x7dc08d74, 0x7f847ee4, 0x04eeaf5a, 0xbd433896, 0x1832fc60, 0xd66c91d6, 0x607e449b, 0x551115b4},
{0x4e7773cb, 0xee5bcecc, 0xf6dab086, 0x45593d6f, 0x4016e2bd, 0xa3a95d2d, 0xaf96816f, 0x047cb16c},
{0x982b68c5, 0xb891fa3f, 0x1d426b52, 0xa41e8501, 0x882952d6, 0x566009b5, 0x7b3c79d6, 0x199cdaee},
{0xcf28601b, 0x571ba2fc, 0xac74db12, 0x166fb582, 0x3501370b, 0x51420be4, 0x52f970ba, 0x1996fa8d},
{0x6a2f777a, 0xe9561c17, 0x2393991b, 0xc03cae03, 0x5a5bfd4f, 0x91b00023, 0x272e58ee, 0x6d64ed25},
{0xf02a116e, 0xfb350dbe, 0xb4543a3e, 0x1c510ebf, 0x37ad4eca, 0xf675522e, 0x80f82b2d, 0x1907a56e},
{0x4eb71aa6, 0xb0ad8003, 0xaa67e0be, 0x50a32c41, 0x19141f44, 0x105f0672, 0xa3dad316, 0x2bcd9508},
{0x0f6fb2ac, 0x3dc9e560, 0x9aa58ff5, 0x3cc5bb32, 0x36f376e1, 0xdeae67bc, 0x65ba213e, 0x394fda0d},
{0x60b82267, 0x09f239f7, 0x8b24f123, 0x14180e0e, 0x45625d95, 0xad5a5340, 0x6d174692, 0x58c3ba63},
{0x348b416f, 0x0acf21c2, 0xbc086439, 0x798b6bf6, 0xb1ca111d, 0x222d411f, 0x30ba1e0f, 0x044107b7},
{0x014abe84, 0xa3b861b8, 0x427ed008, 0x37c017e4, 0xae0ff4f5, 0xae51f613, 0xcb1218d3, 0x1a2d00e1},
{0x4de7eb2b, 0x48aaa3bf, 0x6772057d, 0x4a58d54d, 0x7093b551, 0xce25f16c, 0xd206337c, 0x242150ac},
{0x9ed57ae5, 0xdf3ec9ae, 0x7166577f, 0xea7df73a, 0x022fbbe4, 0x6ca8d281, 0x151e3f6b, 0x5850c003},
{0x645e1cfa, 0x903a0a0c, 0x34788c37, 0xfbac54cb, 0x8cf73d78, 0xdc127d11, 0x975d3c82, 0x6d0b5c7c},
{0x14b1ba04, 0xb49d6b05, 0xf00b84f2, 0x56e466b4, 0x0b904f22, 0x30c390cf, 0x3ee254cc, 0x3e11cfb7},
{0xbe8201ab, 0x84dfa547, 0x530715d2, 0x3887ce8b, 0x3eed4ed7, 0xa4c719c6, 0x8f8007b4, 0x18c44950},
{0x7d813cd1, 0xdaf0346d, 0xf755beb1, 0xeccf6f9a, 0xe08143e3, 0x167fce38, 0x6f5d6dfa, 0x545ad9b2},
{0x577605de, 0x973f5466, 0x974f953c, 0x0ce8986e, 0x074382f9, 0x8941cf4b, 0x6fa2672c, 0x156cd7f6},
{0x33b66141, 0x24315404, 0x1992f584, 0x5d1375ab, 0x8b20ca1a, 0xf193ffa6, 0x2701a503, 0x47880cd5},
{0xe9f7b9af, 0xf7b6847d, 0x62c83ce2, 0x9a339673, 0x6e5e6f79, 0xfabf4537, 0x35af33a3, 0x0975acd9},
{0x0eddd248, 0x4fb4204a, 0xc9e509b3, 0x8c98706a, 0x2bb27eb1, 0xd0be8987, 0xc831438b, 0x6ec5f960},
{0x20238f62, 0xa13c95b7, 0x83b476b9, 0x130aa097, 0x14860881, 0x758a04e0, 0x97066493, 0x58e2f8d6},
{0xe8bff41e, 0x65b09c73, 0x37f1c6a3, 0x8b3280e8, 0x2846fb21, 0xe17b82ce, 0xb1ae27df, 0x476534bf},
{0xd5fdb757, 0x8480c0e7, 0x365bf9fd, 0x3644eea0, 0xb776be86, 0x4ca116ca, 0x8b58390c, 0x17b6395f},
{0x252eb0db, 0x2c811e9a, 0x7479e161, 0x1b7d960d, 0xb0a89a26, 0xb3afc7c1, 0x32b5e793, 0x6a2f9533},
{0x08b8a7ad, 0xe877b2c4, 0x341652b4, 0x68b0e8f0, 0xe8b6a2d9, 0x2d44da3b, 0xfd09be59, 0x092778ff},
{0x7988f244, 0x84a1aa6f, 0x24faf63f, 0xa164b3d9, 0xc1bbb915, 0x7aae9724, 0xf386c0d2, 0x24e5d287},
{0x41a1b30c, 0xa70a7efd, 0x39f0e511, 0xc49c55a5, 0x033bb323, 0xab307a8f, 0x17acbd7f, 0x0158abd6},
{0x0f642025, 0x2c228b30, 0x01bd882b, 0xb0878e8d, 0xd7377fea, 0xd862b255, 0xf0490536, 0x18ac3666}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x80000001, 0x7fffffff, 0x7fff2dff, 0xa9ded201, 0x04d0ec02, 0x199cec04, 0x94cebea4, 0x39f6d3a9},
{0x40000001, 0x3fffffff, 0x3ffec4ff, 0xfece3b02, 0x07396203, 0x266b6206, 0x5f361df6, 0x56f23d7e},
{0x20000001, 0x1fffffff, 0x9ffe907f, 0xa945ef82, 0x086d9d04, 0x2cd29d07, 0xc469cd9f, 0x656ff268},
{0x10000001, 0x0fffffff, 0xcffe763f, 0xfe81c9c2, 0x8907ba84, 0xb0063a87, 0xf703a573, 0x6caeccdd},
{0x08000001, 0x07ffffff, 0xe7fe691f, 0x291fb6e2, 0xc954c945, 0xf1a00947, 0x9050915d, 0x704e3a18},
{0x04000001, 0x03ffffff, 0xf3fe628f, 0x3e6ead72, 0xe97b50a5, 0x126cf0a7, 0xdcf70753, 0x721df0b5},
{0x02000001, 0x01ffffff, 0xf9fe5f47, 0x491628ba, 0xf98e9455, 0xa2d36457, 0x834a424d, 0x7305cc04},
{0x01000001, 0x00ffffff, 0xfcfe5da3, 0x4e69e65e, 0x0198362d, 0xeb069e30, 0xd673dfca, 0x7379b9ab},
{0x00800001, 0x007fffff, 0xfe7e5cd1, 0x5113c530, 0x059d0719, 0x8f203b1c, 0x8008ae89, 0x73b3b07f},
{0x00400001, 0x003fffff, 0xff3e5c68, 0x5268b499, 0x079f6f8f, 0xe12d0992, 0x54d315e8, 0x73d0abe9},
{0x00200001, 0x801fffff, 0x7f9e5c33, 0x53132c4e, 0x08a0a3ca, 0x8a3370cd, 0x3f384998, 0x73df299e},
{0x00100001, 0x400fffff, 0xbfce5c19, 0xd3686828, 0x89213de7, 0x5eb6a46a, 0xb46ae370, 0x73e66878},
{0x00080001, 0x2007ffff, 0xdfe65c0c, 0x93930615, 0x49618af6, 0x48f83e39, 0xef04305c, 0x73ea07e5},
{0x00040001, 0x9003ffff, 0x6ff25c05, 0xf3a8550c, 0xa981b17d, 0x3e190b20, 0x8c50d6d2, 0x73ebd79c},
{0x00020001, 0x4801ffff, 0xb7f85c02, 0xa3b2fc87, 0x5991c4c1, 0x38a97194, 0xdaf72a0d, 0x73ecbf77},
{0x00010001, 0xa400ffff, 0x5bfb5c00, 0x7bb85045, 0x3199ce63, 0xb5f1a4ce, 0x824a53aa, 0x73ed3365},
{0x00008001, 0xd2007fff, 0x2dfcdbff, 0x67bafa24, 0x1d9dd334, 0x7495be6b, 0x55f3e879, 0x73ed6d5c},
{0x00004001, 0x69003fff, 0x96fd9bff, 0xddbc4f13, 0x939fd59c, 0xd3e7cb39, 0xbfc8b2e0, 0x73ed8a57},
{0x00002001, 0x34801fff, 0x4b7dfbff, 0x18bcf98b, 0xcea0d6d1, 0x8390d1a0, 0x74b31814, 0x73ed98d5},
{0x00001001, 0x1a400fff, 0x25be2bff, 0x363d4ec7, 0x6c21576b, 0x5b6554d4, 0x4f284aae, 0x73eda014},
{0x00000801, 0x0d2007ff, 0x12de43ff, 0x44fd7965, 0x3ae197b8, 0x474f966e, 0xbc62e3fb, 0x73eda3b3},
{0x00000401, 0x069003ff, 0x096e4fff, 0xcc5d8eb4, 0x2241b7de, 0xbd44b73b, 0x730030a1, 0x73eda583},
{0x00000201, 0x034801ff, 0x84b655ff, 0x100d995b, 0x95f1c7f2, 0xf83f47a1, 0x4e4ed6f4, 0x73eda66b},
{0x00000101, 0x01a400ff, 0x425a58ff, 0xb1e59eaf, 0xcfc9cffb, 0x95bc8fd4, 0x3bf62a1e, 0x73eda6df},
{0x00000081, 0x00d2007f, 0x212c5a7f, 0x82d1a159, 0x6cb5d400, 0x647b33ee, 0x32c9d3b3, 0x73eda719},
{0x00000041, 0x0069003f, 0x10955b3f, 0xeb47a2ae, 0x3b2bd602, 0xcbda85fb, 0x2e33a87d, 0x73eda736},
{0x00000021, 0x0034801f, 0x8849db9f, 0x1f82a358, 0xa266d704, 0xff8a2f01, 0xabe892e2, 0x73eda744},
{0x00000011, 0x001a400f, 0xc4241bcf, 0xb9a023ad, 0xd6045784, 0x99620384, 0xeac30815, 0x73eda74b},
{0x00000009, 0x000d2007, 0x62113be7, 0x06aee3d8, 0x6fd317c5, 0xe64dedc6, 0x8a3042ae, 0x73eda74f},
{0x00000005, 0x00069003, 0xb107cbf3, 0x2d3643ed, 0x3cba77e5, 0x8cc3e2e7, 0x59e6dffb, 0x73eda751},
{0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752},
{0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752}}};
static constexpr storage<8> rou = {0x0b912f1f, 0x1b788f50, 0x70b3e094, 0xc4024ff2,
0xd168d6c0, 0x0fd56dc8, 0x5b416b6f, 0x0212d79e};
TWIDDLES(modulus, rou)
};
/**


@@ -2,43 +2,15 @@
#ifndef BN254_BASE_PARAMS_H
#define BN254_BASE_PARAMS_H
#include "../storage.cuh"
#include "fields/storage.cuh"
#include "fields/params_gen.cuh"
namespace bn254 {
struct fq_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned modulus_bit_count = 254;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
static constexpr storage<2 * limbs_count> modulus_wide = {
0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
static constexpr storage<8> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
PARAMS(modulus)
// nonresidue to generate the extension field
static constexpr uint32_t nonresidue = 1;
// true if nonresidue is negative


@@ -2,138 +2,19 @@
#ifndef BN254_SCALAR_PARAMS_H
#define BN254_SCALAR_PARAMS_H
#include "../storage.cuh"
#include "../field.cuh"
#include "../quadratic_extension.cuh"
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/params_gen.cuh"
namespace bn254 {
struct fp_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned omegas_count = 28;
static constexpr unsigned modulus_bit_count = 254;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<8> modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848,
0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
PARAMS(modulus)
static constexpr storage<limbs_count> modulus = {0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848,
0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
static constexpr storage<limbs_count> modulus_2 = {0xe0000002, 0x87c3eb27, 0xf372e122, 0x5067d090,
0x0302b0ba, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
static constexpr storage<limbs_count> modulus_4 = {0xc0000004, 0x0f87d64f, 0xe6e5c245, 0xa0cfa121,
0x06056174, 0xe14116da, 0x84c680a6, 0xc19139cb};
static constexpr storage<limbs_count> neg_modulus = {0x0fffffff, 0xbc1e0a6c, 0x86468f6e, 0xd7cc17b7,
0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
static constexpr storage<2 * limbs_count> modulus_wide = {
0xf0000001, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0xe0000001, 0x08c3eb27, 0xdcb34000, 0xc7f26223, 0x68c9bb7f, 0xffe9a62c, 0xe821ddb0, 0xa6ce1975,
0x47b62fe7, 0x2c77527b, 0xd379d3df, 0x85f73bb0, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0xc0000002, 0x1187d64f, 0xb9668000, 0x8fe4c447, 0xd19376ff, 0xffd34c58, 0xd043bb61, 0x4d9c32eb,
0x8f6c5fcf, 0x58eea4f6, 0xa6f3a7be, 0x0bee7761, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x80000004, 0x230fac9f, 0x72cd0000, 0x1fc9888f, 0xa326edff, 0xffa698b1, 0xa08776c3, 0x9b3865d7,
0x1ed8bf9e, 0xb1dd49ed, 0x4de74f7c, 0x17dceec3, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
static constexpr storage<limbs_count> m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520,
0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695,
0x7879462e, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
static constexpr storage<limbs_count> montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x090ef5a9,
0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
{0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72},
{0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1},
{0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2},
{0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6},
{0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d},
{0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd},
{0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561},
{0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e},
{0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1},
{0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584},
{0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596},
{0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49},
{0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651},
{0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f},
{0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb},
{0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb},
{0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf},
{0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724},
{0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1},
{0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980},
{0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46},
{0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63},
{0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7},
{0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb},
{0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a},
{0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702},
{0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72},
{0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000},
{0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711},
{0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf},
{0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136},
{0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2},
{0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053},
{0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327},
{0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821},
{0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7},
{0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21},
{0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2},
{0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58},
{0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f},
{0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766},
{0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7},
{0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010},
{0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0},
{0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0},
{0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d},
{0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8},
{0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439},
{0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5},
{0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370},
{0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd},
{0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9},
{0x980db869, 0xa8b64ba8, 0xc9718f6c, 0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f},
{0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0xf8000001, 0xa1f0fac9, 0x3cdcb848, 0x9419f424, 0x40c0ac2e, 0xdc2822db, 0x7098d014, 0x18322739},
{0xf4000001, 0xf2e9782e, 0x5b4b146c, 0xde26ee36, 0xe1210245, 0x4a3c3448, 0x28e5381f, 0x244b3ad6},
{0x72000001, 0x1b65b6e1, 0x6a82427f, 0x832d6b3f, 0xb1512d51, 0x81463cff, 0x850b6c24, 0x2a57c4a4},
{0xb1000001, 0x2fa3d63a, 0xf21dd988, 0x55b0a9c3, 0x196942d7, 0x1ccb415b, 0xb31e8627, 0x2d5e098b},
{0x50800001, 0xb9c2e5e7, 0x35eba50c, 0x3ef24906, 0xcd754d9a, 0x6a8dc388, 0x4a281328, 0x2ee12bff},
{0xa0400001, 0xfed26dbd, 0x57d28ace, 0xb39318a7, 0xa77b52fb, 0x116f049f, 0x15acd9a9, 0x2fa2bd39},
{0xc8200001, 0x215a31a8, 0xe8c5fdb0, 0x6de38077, 0x147e55ac, 0x64dfa52b, 0xfb6f3ce9, 0x300385d5},
{0x5c100001, 0xb29e139e, 0x313fb720, 0xcb0bb460, 0xcaffd704, 0x8e97f570, 0x6e506e89, 0x3033ea24},
{0x26080001, 0xfb400499, 0x557c93d8, 0xf99fce54, 0xa64097b0, 0xa3741d93, 0xa7c10759, 0x304c1c4b},
{0x8b040001, 0x1f90fd16, 0x679b0235, 0x10e9db4e, 0x13e0f807, 0xade231a5, 0x447953c1, 0x3058355f},
{0x3d820001, 0x31b97955, 0x70aa3963, 0x1c8ee1cb, 0xcab12832, 0xb3193bad, 0x12d579f5, 0x305e41e9},
{0x96c10001, 0x3acdb774, 0xf531d4fa, 0xa2616509, 0x26194047, 0xb5b4c0b2, 0xfa038d0f, 0x3061482d},
{0x43608001, 0xbf57d684, 0x3775a2c5, 0x654aa6a9, 0x53cd4c52, 0xb7028334, 0x6d9a969c, 0x3062cb50},
{0x19b04001, 0x819ce60c, 0xd89789ab, 0xc6bf4778, 0x6aa75257, 0x37a96475, 0xa7661b63, 0x30638ce1},
{0x04d82001, 0x62bf6dd0, 0xa9287d1e, 0x777997e0, 0xf614555a, 0x77fcd515, 0x444bddc6, 0x3063edaa},
{0xfa6c1001, 0xd350b1b1, 0x9170f6d7, 0xcfd6c014, 0x3bcad6db, 0x18268d66, 0x92bebef8, 0x30641e0e},
{0xf5360801, 0x8b9953a2, 0x859533b4, 0x7c05542e, 0x5ea6179c, 0xe83b698e, 0xb9f82f90, 0x30643640},
{0x729b0401, 0xe7bda49b, 0x7fa75222, 0xd21c9e3b, 0x7013b7fc, 0x5045d7a2, 0xcd94e7dd, 0x30644259},
{0xb14d8201, 0x15cfcd17, 0xfcb0615a, 0xfd284341, 0x78ca882c, 0x844b0eac, 0x57634403, 0x30644866},
{0xd0a6c101, 0xacd8e155, 0x3b34e8f5, 0x12ae15c5, 0x7d25f045, 0x9e4daa31, 0x9c4a7216, 0x30644b6c},
{0xe0536081, 0x785d6b74, 0xda772cc3, 0x1d70ff06, 0xff53a451, 0x2b4ef7f3, 0xbebe0920, 0x30644cef},
{0x6829b041, 0x5e1fb084, 0xaa184eaa, 0x22d273a7, 0x406a7e57, 0xf1cf9ed5, 0x4ff7d4a4, 0x30644db1},
{0x2c14d821, 0xd100d30c, 0x11e8df9d, 0x25832df8, 0xe0f5eb5a, 0x550ff245, 0x1894ba67, 0x30644e12},
{0x0e0a6c11, 0x8a716450, 0x45d12817, 0xa6db8b20, 0x313ba1db, 0x86b01bfe, 0x7ce32d48, 0x30644e42},
{0xff053609, 0x6729acf1, 0x5fc54c54, 0x6787b9b4, 0x595e7d1c, 0x1f8030da, 0xaf0a66b9, 0x30644e5a},
{0xf7829b05, 0xd585d142, 0x6cbf5e72, 0xc7ddd0fe, 0x6d6feabc, 0x6be83b48, 0xc81e0371, 0x30644e66},
{0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c},
{0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}};
static constexpr storage<8> rou = {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e,
0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0};
TWIDDLES(modulus, rou)
};
/**


@@ -3,76 +3,15 @@
#define BW6_761_BASE_BASE_H
#include "fields/storage.cuh"
#include "fields/params_gen.cuh"
namespace bw6_761 {
struct fq_config {
static constexpr unsigned limbs_count = 24;
static constexpr unsigned modulus_bit_count = 761;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<limbs_count> modulus = {
0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
static constexpr storage<limbs_count> modulus_2 = {
0x00000116, 0xe93a0000, 0xe0000105, 0xcd227cd0, 0xd5e1486f, 0x2c19f15d, 0xaccf51f0, 0x31422d84,
0xe7d7fe5d, 0xe3b9a7b8, 0x25f3fb20, 0x0d1391da, 0x4b684609, 0x079d75fe, 0xcb09d232, 0xe0f74c71,
0x010f7c82, 0xa504ebdf, 0x03a28d10, 0x724c30d5, 0x09f5fe7d, 0xa30f9280, 0xf7079c15, 0x0245d049};
static constexpr storage<limbs_count> modulus_4 = {
0x0000022c, 0xd2740000, 0xc000020b, 0x9a44f9a1, 0xabc290df, 0x5833e2bb, 0x599ea3e0, 0x62845b09,
0xcfaffcba, 0xc7734f71, 0x4be7f641, 0x1a2723b4, 0x96d08c12, 0x0f3aebfc, 0x9613a464, 0xc1ee98e3,
0x021ef905, 0x4a09d7be, 0x07451a21, 0xe49861aa, 0x13ebfcfa, 0x461f2500, 0xee0f382b, 0x048ba093};
static constexpr storage<limbs_count> neg_modulus = {
0xffffff75, 0x0b62ffff, 0x8fffff7d, 0x196ec197, 0x150f5bc8, 0xe9f30751, 0xa9985707, 0x675ee93d,
0x8c1400d1, 0x8e232c23, 0xed06026f, 0x79763712, 0xda4bdcfb, 0xfc314500, 0x1a7b16e6, 0x8f8459c7,
0x7f7841be, 0xad7d8a10, 0x7e2eb977, 0x46d9e795, 0xfb0500c1, 0x2e7836bf, 0x047c31f5, 0xfedd17db};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00004b79, 0xa27e0000, 0xa0008e35, 0xbae96db2, 0x82ebf7b1, 0x4aaf1d22, 0x7224cb3d, 0x7908fd92,
0x29b17ed1, 0x6fe68290, 0xafc968db, 0xfe1b7282, 0x9028bbf0, 0xe1e548cb, 0x3a8ffc03, 0x09094ed6,
0x61e9cf95, 0xd63ea631, 0x54918abf, 0xe834ca62, 0x52aa651e, 0xe52594ed, 0xb4c46a4f, 0xe2423252,
0x6c09aae4, 0xa8cf17d8, 0xc5f5cee5, 0x2d80ffb0, 0x55bbc10d, 0x2dede100, 0xe2360382, 0x1f4e7a7c,
0xae2fe433, 0x586c3847, 0x78eadae1, 0x915c56e1, 0x69a5ce00, 0xa35b2945, 0x767c08ca, 0x9d66e7fe,
0xd8b88c77, 0x7e44cf6a, 0x67c9c873, 0xb29bfc93, 0xbbc80af9, 0x6a24005a, 0xc64ce3d5, 0x00014a92};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x000096f2, 0x44fc0000, 0x40011c6b, 0x75d2db65, 0x05d7ef63, 0x955e3a45, 0xe449967a, 0xf211fb24,
0x5362fda2, 0xdfcd0520, 0x5f92d1b6, 0xfc36e505, 0x205177e1, 0xc3ca9197, 0x751ff807, 0x12129dac,
0xc3d39f2a, 0xac7d4c62, 0xa923157f, 0xd06994c4, 0xa554ca3d, 0xca4b29da, 0x6988d49f, 0xc48464a5,
0xd81355c9, 0x519e2fb0, 0x8beb9dcb, 0x5b01ff61, 0xab77821a, 0x5bdbc200, 0xc46c0704, 0x3e9cf4f9,
0x5c5fc866, 0xb0d8708f, 0xf1d5b5c2, 0x22b8adc2, 0xd34b9c01, 0x46b6528a, 0xecf81195, 0x3acdcffc,
0xb17118ef, 0xfc899ed5, 0xcf9390e6, 0x6537f926, 0x779015f3, 0xd44800b5, 0x8c99c7aa, 0x00029525};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00012de4, 0x89f80000, 0x800238d6, 0xeba5b6ca, 0x0bafdec6, 0x2abc748a, 0xc8932cf5, 0xe423f649,
0xa6c5fb45, 0xbf9a0a40, 0xbf25a36d, 0xf86dca0a, 0x40a2efc3, 0x8795232e, 0xea3ff00f, 0x24253b58,
0x87a73e54, 0x58fa98c5, 0x52462aff, 0xa0d32989, 0x4aa9947b, 0x949653b5, 0xd311a93f, 0x8908c94a,
0xb026ab93, 0xa33c5f61, 0x17d73b96, 0xb603fec3, 0x56ef0434, 0xb7b78401, 0x88d80e08, 0x7d39e9f3,
0xb8bf90cc, 0x61b0e11e, 0xe3ab6b85, 0x45715b85, 0xa6973802, 0x8d6ca515, 0xd9f0232a, 0x759b9ff9,
0x62e231de, 0xf9133dab, 0x9f2721cd, 0xca6ff24d, 0xef202be6, 0xa890016a, 0x19338f55, 0x00052a4b};
static constexpr storage<limbs_count> m = {0x2507e899, 0x11629ccd, 0x2e4424dd, 0xab1eef5b, 0x481d2cfa, 0xb82146a9,
0x34e4227b, 0xf3182afa, 0xbeb25621, 0xf615fdb5, 0xccc261d6, 0xc4d8988c,
0xaaf4fab0, 0x3590d652, 0x2ab9ff30, 0x9c5d0a04, 0x6ec3f460, 0xf6e8534f,
0x88075ab4, 0xe8d78b06, 0x6f3fc8fe, 0xa8d3675b, 0x7bc5cd4b, 0x03852086};
static constexpr storage<limbs_count> one = {
0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {
0xffff85d5, 0x0202ffff, 0x8fff8ce7, 0x5a582635, 0x827faade, 0x9e996e43, 0x0ee47df4, 0xda6aff32,
0x1d94b80b, 0xece9cb3e, 0x5248240b, 0xc0e667a2, 0xdcad3905, 0xa74da5bf, 0x462f2103, 0x2352e7fe,
0x08b1c87c, 0x7b565880, 0xe711022f, 0x45848a63, 0x9f65a9df, 0xd7a81ebb, 0xf127e87d, 0x0051f77e};
static constexpr storage<limbs_count> montgomery_r_inv = {
0x181fa3f1, 0x27c2b2a0, 0x25a0e1b8, 0x7d9ca9f9, 0x0a004a5d, 0x35a910f0, 0xdb6b8539, 0x54655b3f,
0x7695ef18, 0x5e763565, 0x4fae56bb, 0x226022c2, 0xb70d7652, 0x80e7f067, 0x72116b89, 0x435a8b4a,
0x5d84e0d4, 0xac258fd6, 0x4427c7b2, 0x47ee8ac5, 0xd04e621b, 0x478c4048, 0x2add3e93, 0x00e0aa7d};
static constexpr storage<24> modulus = {0x0000008b, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae,
0x5667a8f8, 0x98a116c2, 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed,
0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, 0x8087be41, 0x528275ef,
0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
PARAMS(modulus)
};
} // namespace bw6_761


@@ -4,7 +4,6 @@
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quadratic_extension.cuh"
#include "fields/snark_fields/bls12_377_base.cuh"
namespace bw6_761 {


@@ -1,48 +1,17 @@
#pragma once
#include "../storage.cuh"
#include "../field.cuh"
#include "../quartic_extension.cuh"
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quartic_extension.cuh"
#include "fields/params_gen.cuh"
namespace babybear {
struct fp_config {
static constexpr unsigned limbs_count = 1;
static constexpr unsigned omegas_count = 28;
static constexpr unsigned modulus_bit_count = 31;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<1> modulus = {0x78000001};
PARAMS(modulus)
static constexpr storage<limbs_count> modulus = {0x78000001};
static constexpr storage<limbs_count> modulus_2 = {0xf0000002};
static constexpr storage<limbs_count> modulus_4 = {0x00000000};
static constexpr storage<limbs_count> neg_modulus = {0x87ffffff};
static constexpr storage<2 * limbs_count> modulus_wide = {0x78000001, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {0xf0000001, 0x38400000};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {0xe0000002, 0x70800001};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {0xc0000004, 0xe1000003};
static constexpr storage<limbs_count> m = {0x88888887};
static constexpr storage<limbs_count> one = {0x00000001};
static constexpr storage<limbs_count> zero = {0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0x0ffffffe};
static constexpr storage<limbs_count> montgomery_r_inv = {0x38400000};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x78000000}, {0x10faa3e0}, {0x6b615c47}, {0x21ceed5a}, {0x2c1c3348}, {0x36c54c86}, {0x701dd01c},
{0x56a9a28e}, {0x03e4cabf}, {0x5bacde79}, {0x1eb53838}, {0x1cd781af}, {0x0961a0b7}, {0x65098a87},
{0x77851a0b}, {0x5bcba331}, {0x053fc0f5}, {0x5bf816e5}, {0x4bb124ab}, {0x571e9d4e}, {0x313732cb},
{0x28aca172}, {0x4e319b52}, {0x45692d95}, {0x14ff4ba1}, {0x00004951}, {0x00000089}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x78000000}, {0x67055c21}, {0x5ee99486}, {0x0bb4c4e4}, {0x4ab33b27}, {0x044b4497}, {0x410e23aa},
{0x08a7ee2b}, {0x563cb93d}, {0x3d70b4b7}, {0x77d999f1}, {0x6ceb65b5}, {0x49e7f635}, {0x0eae3a8c},
{0x238b8a78}, {0x70d71b0a}, {0x0eaacc45}, {0x5af0f193}, {0x47303308}, {0x573cbfad}, {0x29ff72c0},
{0x05af9dac}, {0x00ef24df}, {0x26985530}, {0x22d1ce4b}, {0x08359375}, {0x2cabe994}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x3c000001}, {0x5a000001}, {0x69000001}, {0x70800001}, {0x74400001}, {0x76200001}, {0x77100001},
{0x77880001}, {0x77c40001}, {0x77e20001}, {0x77f10001}, {0x77f88001}, {0x77fc4001}, {0x77fe2001},
{0x77ff1001}, {0x77ff8801}, {0x77ffc401}, {0x77ffe201}, {0x77fff101}, {0x77fff881}, {0x77fffc41},
{0x77fffe21}, {0x77ffff11}, {0x77ffff89}, {0x77ffffc5}, {0x77ffffe3}, {0x77fffff2}}};
static constexpr storage<1> rou = {0x00000089};
TWIDDLES(modulus, rou)
// nonresidue to generate the extension field
static constexpr uint32_t nonresidue = 11;
@@ -58,5 +27,5 @@ namespace babybear {
/**
* Extension field of `scalar_t`, enabled if the `-DEXT_FIELD` env variable is set.
*/
typedef ExtensionField<fp_config> extension_t;
typedef ExtensionField<fp_config, scalar_t> extension_t;
} // namespace babybear
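
One relation worth noting in the tables this babybear hunk removes: each omega[] entry appears to be the square (mod p) of the entry after it, the first entry is p - 1 (i.e. -1), and rou equals the last entry, so a TWIDDLES-style macro needs only rou to regenerate the whole ladder. Whether TWIDDLES works exactly that way is not shown in this excerpt, but the squaring relation can be checked directly against the numbers above with a standalone host program:

#include <cassert>
#include <cstdint>

int main()
{
  const uint64_t p = 0x78000001; // babybear modulus from the hunk above
  uint64_t w = 0x00000089;       // rou, equal to the last omega[] entry
  w = (w * w) % p; assert(w == 0x00004951); // second-to-last omega[] entry
  w = (w * w) % p; assert(w == 0x14ff4ba1); // the entry before that
  w = (w * w) % p; assert(w == 0x45692d95); // and the one before that
  return 0;
}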


@@ -0,0 +1,224 @@
#pragma once
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/quartic_extension.cuh"
namespace m31 {
template <class CONFIG>
class MersenneField : public Field<CONFIG>
{
public:
HOST_DEVICE_INLINE MersenneField(const MersenneField& other) : Field<CONFIG>(other) {}
HOST_DEVICE_INLINE MersenneField(const uint32_t& x = 0) : Field<CONFIG>({x}) {}
HOST_DEVICE_INLINE MersenneField(storage<CONFIG::limbs_count> x) : Field<CONFIG>{x} {}
HOST_DEVICE_INLINE MersenneField(const Field<CONFIG>& other) : Field<CONFIG>(other) {}
static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero); }
static constexpr HOST_DEVICE_INLINE MersenneField one() { return MersenneField(CONFIG::one.limbs[0]); }
static constexpr HOST_DEVICE_INLINE MersenneField from(uint32_t value) { return MersenneField(value); }
static HOST_INLINE MersenneField rand_host() { return MersenneField(Field<CONFIG>::rand_host()); }
static void rand_host_many(MersenneField* out, int size)
{
for (int i = 0; i < size; i++)
out[i] = rand_host();
}
HOST_DEVICE_INLINE MersenneField& operator=(const Field<CONFIG>& other)
{
if (this != &other) { Field<CONFIG>::operator=(other); }
return *this;
}
HOST_DEVICE_INLINE uint32_t get_limb() const { return this->limbs_storage.limbs[0]; }
// The `Wide` struct represents a redundant 32-bit form of the Mersenne Field.
struct Wide {
uint32_t storage;
static constexpr HOST_DEVICE_INLINE Wide from_field(const MersenneField& xs)
{
Wide out{};
out.storage = xs.get_limb();
return out;
}
static constexpr HOST_DEVICE_INLINE Wide from_number(const uint32_t& xs)
{
Wide out{};
out.storage = xs;
return out;
}
friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys)
{
uint64_t tmp = (uint64_t)xs.storage + ys.storage; // max: 2^33 - 2 = 2^32(1) + (2^32 - 2)
tmp = ((tmp >> 32) << 1) + (uint32_t)(tmp); // 2(1)+(2^32-2) = 2^32(1)+(0)
return from_number((uint32_t)((tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(1) + 0 = 2
}
friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys)
{
uint64_t tmp = CONFIG::modulus_3 + xs.storage -
ys.storage; // max: 3(2^31-1) + 2^32-1 - 0 = 2^33 + 2^31-4 = 2^32(2) + (2^31-4)
return from_number(((uint32_t)(tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(2)+(2^31-4) = 2^31
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs)
{
uint64_t tmp = CONFIG::modulus_3 - xs.storage; // max: 3(2^31-1) - 0 = 2^32(1) + (2^31 - 3)
return from_number(((uint32_t)(tmp >> 32) << 1) + (uint32_t)(tmp)); // max: 2(1)+(2^31-3) = 2^31 - 1
}
friend HOST_DEVICE_INLINE Wide operator*(Wide xs, const Wide& ys)
{
uint64_t t1 = (uint64_t)xs.storage * ys.storage; // max: 2^64 - 2^33+1 = 2^32(2^32 - 2) + 1
t1 = ((t1 >> 32) << 1) + (uint32_t)(t1); // max: 2(2^32 - 2) + 1 = 2^32(1) + (2^32 - 3)
return from_number((((uint32_t)(t1 >> 32)) << 1) + (uint32_t)(t1)); // max: 2(1) + (2^32 - 3) = 2^32 - 1
}
};
static constexpr HOST_DEVICE_INLINE MersenneField div2(const MersenneField& xs, const uint32_t& power = 1)
{
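// Division by 2^power modulo p = 2^31 - 1 is a cyclic right-rotation of the 31-bit value,
// since 2^31 ≡ 1 (mod p).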
uint32_t t = xs.get_limb();
return MersenneField{{((t >> power) | (t << (31 - power))) & MersenneField::get_modulus().limbs[0]}};
}
static constexpr HOST_DEVICE_INLINE MersenneField neg(const MersenneField& xs)
{
uint32_t t = xs.get_limb();
return MersenneField{{t == 0 ? t : MersenneField::get_modulus().limbs[0] - t}};
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE MersenneField reduce(Wide xs)
{
const uint32_t modulus = MersenneField::get_modulus().limbs[0];
uint32_t tmp = (xs.storage >> 31) + (xs.storage & modulus); // max: 1 + 2^31-1 = 2^31
tmp = (tmp >> 31) + (tmp & modulus); // max: 1 + 0 = 1
return MersenneField{{tmp == modulus ? 0 : tmp}};
}
static constexpr HOST_DEVICE_INLINE MersenneField inverse(const MersenneField& x)
{
uint32_t xs = x.limbs_storage.limbs[0];
if (xs <= 1) return xs;
uint32_t a = 1, b = 0, y = xs, z = MersenneField::get_modulus().limbs[0], e, m = z;
while (1) {
#ifdef __CUDA_ARCH__
e = __ffs(y) - 1;
#else
e = __builtin_ctz(y);
#endif
y >>= e;
if (a >= m) {
a = (a & m) + (a >> 31);
if (a == m) a = 0;
}
a = ((a >> e) | (a << (31 - e))) & m;
if (y == 1) return a;
e = a + b;
b = a;
a = e;
e = y + z;
z = y;
y = e;
}
}
friend HOST_DEVICE_INLINE MersenneField operator+(MersenneField xs, const MersenneField& ys)
{
uint32_t m = MersenneField::get_modulus().limbs[0];
uint32_t t = xs.get_limb() + ys.get_limb();
if (t > m) t = (t & m) + (t >> 31);
if (t == m) t = 0;
return MersenneField{{t}};
}
friend HOST_DEVICE_INLINE MersenneField operator-(MersenneField xs, const MersenneField& ys)
{
return xs + neg(ys);
}
friend HOST_DEVICE_INLINE MersenneField operator*(MersenneField xs, const MersenneField& ys)
{
uint64_t x = (uint64_t)(xs.get_limb()) * ys.get_limb();
uint32_t t = ((x >> 31) + (x & MersenneField::get_modulus().limbs[0]));
uint32_t m = MersenneField::get_modulus().limbs[0];
if (t > m) t = (t & m) + (t >> 31);
if (t > m) t = (t & m) + (t >> 31);
if (t == m) t = 0;
return MersenneField{{t}};
}
static constexpr HOST_DEVICE_INLINE Wide mul_wide(const MersenneField& xs, const MersenneField& ys)
{
return Wide::from_field(xs) * Wide::from_field(ys);
}
template <unsigned MODULUS_MULTIPLE = 1>
static constexpr HOST_DEVICE_INLINE Wide sqr_wide(const MersenneField& xs)
{
return mul_wide(xs, xs);
}
static constexpr HOST_DEVICE_INLINE MersenneField sqr(const MersenneField& xs) { return xs * xs; }
static constexpr HOST_DEVICE_INLINE MersenneField to_montgomery(const MersenneField& xs) { return xs; }
static constexpr HOST_DEVICE_INLINE MersenneField from_montgomery(const MersenneField& xs) { return xs; }
static constexpr HOST_DEVICE_INLINE MersenneField pow(MersenneField base, int exp)
{
MersenneField res = one();
while (exp > 0) {
if (exp & 1) res = res * base;
base = base * base;
exp >>= 1;
}
return res;
}
};
struct fp_config {
static constexpr unsigned limbs_count = 1;
static constexpr unsigned omegas_count = 1;
static constexpr unsigned modulus_bit_count = 31;
static constexpr unsigned num_of_reductions = 1;
static constexpr storage<limbs_count> modulus = {0x7fffffff};
static constexpr storage<limbs_count> modulus_2 = {0xfffffffe};
static constexpr uint64_t modulus_3 = 0x17ffffffd;
static constexpr storage<limbs_count> modulus_4 = {0xfffffffc};
static constexpr storage<limbs_count> neg_modulus = {0x80000001};
static constexpr storage<2 * limbs_count> modulus_wide = {0x7fffffff, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {0x00000001, 0x3fffffff};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {0x00000002, 0x7ffffffe};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {0x00000004, 0xfffffffc};
static constexpr storage<limbs_count> m = {0x80000001};
static constexpr storage<limbs_count> one = {0x00000001};
static constexpr storage<limbs_count> zero = {0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0x00000001};
static constexpr storage<limbs_count> montgomery_r_inv = {0x00000001};
static constexpr storage_array<omegas_count, limbs_count> omega = {{{0x7ffffffe}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {{{0x7ffffffe}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {{{0x40000000}}};
// nonresidue to generate the extension field
static constexpr uint32_t nonresidue = 11;
// true if nonresidue is negative.
static constexpr bool nonresidue_is_negative = false;
};
/**
* Scalar field. Is always a prime field.
*/
typedef MersenneField<fp_config> scalar_t;
/**
* Extension field of `scalar_t`, enabled if the `-DEXT_FIELD` env variable is set.
*/
typedef ExtensionField<fp_config, scalar_t> extension_t;
} // namespace m31
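
The Wide arithmetic and reduce above all lean on the identity 2^31 ≡ 1 (mod 2^31 - 1): a 62-bit product is brought back into range by folding its high bits onto the low bits, twice, followed by one conditional subtraction. A self-contained host check of that folding step (the values of a and b are arbitrary field elements; nothing here calls into the header above):

#include <cassert>
#include <cstdint>

int main()
{
  const uint32_t p = 0x7fffffff;                 // 2^31 - 1
  const uint32_t a = 0x7ffffffe, b = 0x7ffffffd; // two large field elements
  const uint64_t x = (uint64_t)a * b;            // 62-bit product

  uint32_t t = (uint32_t)((x >> 31) + (x & p)); // first fold, may still exceed p
  t = (t >> 31) + (t & p);                      // second fold, now t <= p
  if (t == p) t = 0;                            // map p to 0

  assert(t == (uint32_t)(x % p)); // agrees with the naive reduction
  return 0;
}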


@@ -2,626 +2,18 @@
#include "fields/storage.cuh"
#include "fields/field.cuh"
#include "fields/params_gen.cuh"
// modulus = 3618502788666131213697322783095070105623107215331596699973092056135872020481 (2^251+17*2^192+1)
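// in little-endian storage<8> limbs: limb 0 = 1, limb 6 = 0x11 = 17 (weight 2^192), limb 7 = 0x08000000 = 2^27 (weight 2^224, giving the 2^251 term)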
namespace stark252 {
struct fp_config {
static constexpr unsigned limbs_count = 8;
static constexpr unsigned modulus_bit_count = 252;
static constexpr unsigned num_of_reductions = 1;
static constexpr unsigned omegas_count = 192;
static constexpr storage<8> modulus = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000011, 0x08000000};
PARAMS(modulus)
static constexpr storage<limbs_count> modulus = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000011, 0x08000000};
static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000022, 0x10000000};
static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000044, 0x20000000};
static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffee, 0xf7ffffff};
static constexpr storage<2 * limbs_count> modulus_wide = {
0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<2 * limbs_count> modulus_squared = {
0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000022, 0x10000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000121, 0x10000000, 0x00000001, 0x00400000};
static constexpr storage<2 * limbs_count> modulus_squared_2 = {
0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000044, 0x20000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000242, 0x20000000, 0x00000002, 0x00800000};
static constexpr storage<2 * limbs_count> modulus_squared_4 = {
0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000088, 0x40000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000484, 0x40000000, 0x00000004, 0x01000000};
static constexpr storage<limbs_count> m = {0x8c81fffb, 0x00000002, 0xfeccf000, 0xffffffff,
0x0000907f, 0x00000000, 0xffffffbc, 0x1fffffff};
static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000};
static constexpr storage<limbs_count> montgomery_r = {0xffffffe1, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xfffffdf0, 0x07ffffff};
static constexpr storage<limbs_count> montgomery_r_inv = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000121, 0x10000000, 0x00000001, 0x00400000};
static constexpr storage_array<omegas_count, limbs_count> omega = {
{{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000},
{0xf41337e3, 0x2a616626, 0xac8320da, 0xc5268e56, 0x4329f8c7, 0x53312066, 0x29a2995b, 0x06250239},
{0xee6feebb, 0x3ada5e1d, 0xe4412e87, 0x98c62155, 0x2f9c676e, 0xc90adb1e, 0x0de874d9, 0x063365fe},
{0x6021e539, 0x8337c45f, 0xbbf30245, 0xb0bdf467, 0x514425f3, 0x4537602d, 0x88826aba, 0x05ec467b},
{0x9b48a8ab, 0x2225638f, 0x1a8e7981, 0x26da375d, 0xce6246af, 0xfcdca219, 0x9ecd5c85, 0x0789ad45},
{0xb2703765, 0xd6871506, 0xf9e225ec, 0xd09bd064, 0x10826800, 0x5e869a07, 0xe82b2bb5, 0x0128f0fe},
{0xdd4af20f, 0xfdab65db, 0x56f9ddbc, 0xefa66822, 0x1b03a097, 0x587781ce, 0x9556f9b8, 0x000fcad1},
{0xff0cb347, 0x9f1bc8d7, 0xd0e87cd5, 0xc4d78992, 0xdd51a717, 0xbc7924d5, 0xfd121b58, 0x00c92ecb},
{0xc13a1d0b, 0xcc4074a0, 0xe3bc8e32, 0xa1f811a9, 0x6d4b9bd4, 0x0234b46e, 0x7880b4dc, 0x011d07d9},
{0xec89c4f1, 0xa206c054, 0xdc125289, 0x653d9e35, 0x711825f5, 0x72406af6, 0x46a03edd, 0x0659d839},
{0x0fa30710, 0x45391692, 0x11b54c6c, 0xd439f572, 0xa3492c1e, 0xed5ebbf4, 0xb5d9a6de, 0x010f4d91},
{0x7afd187f, 0x9273dbbc, 0x91ee171f, 0xdb5375bc, 0x6749ae3d, 0xc061f425, 0x6ec477cf, 0x003d14df},
{0x3112b02d, 0x8171e1da, 0xadf9bf78, 0x5c4564eb, 0x5689b232, 0x68c34184, 0x6538624f, 0x0363d70a},
{0x606082e1, 0x3e5a42f0, 0x76fc314a, 0x5edd09f0, 0x0f673d7c, 0xd650df25, 0x34832dba, 0x0393a32b},
{0x13a77460, 0xe3efc75d, 0x62ef8a01, 0x93898bc8, 0x8bdbd9b3, 0x1c3a6e5c, 0x611b7206, 0x034b5d5d},
{0x309d9da9, 0x80ee9837, 0xf51eddbc, 0x1646d633, 0x4901fab8, 0xb9d2cd85, 0x9978ee09, 0x01eb6d84},
{0x2755bfac, 0xa7b1f98c, 0xeb7aa1c1, 0x9ec8116c, 0x3109e611, 0x0eeadedd, 0xc9761a8a, 0x06a6f98d},
{0x9745a046, 0xce7b0a8b, 0xe411ee63, 0x7ff61841, 0x635f8799, 0x34f67453, 0xef852560, 0x04768803},
{0xbffaa9db, 0x1727fce0, 0xf973dc22, 0x858f5918, 0x223f6558, 0x3e277fa0, 0xf71614e3, 0x02d25658},
{0x8574e81f, 0xe3d47b99, 0x7fc4c648, 0xc727c9af, 0xee93dc85, 0x581d81ca, 0xca8a00d9, 0x0594beaf},
{0x0e5ffcb8, 0x00654744, 0xe7c1b2fd, 0x030530a6, 0xecbf157b, 0x27e46d76, 0xbeea04f1, 0x01f4c2bf},
{0x3e3a2f4b, 0xead33145, 0xd6482f17, 0xd841544d, 0x8d24a344, 0x9822fb10, 0x31eeac7c, 0x03e43835},
{0xb40bdbe8, 0x01af11c3, 0xb32a3b23, 0xd7c9c0a1, 0xcd0be360, 0x81cb2e43, 0xafb3df1a, 0x01054544},
{0x77156db2, 0xf6b13488, 0xddc0f211, 0x1ad6f3be, 0xd664f4da, 0xe643d3ea, 0x174a8e80, 0x071a47b8},
{0x4ca88ffc, 0xb86b03a4, 0x8ef9a25a, 0x6e3398e6, 0xf5fa4665, 0xce9a0d37, 0x5c437763, 0x06e8e769},
{0x4586dbc3, 0x32609f1d, 0xaa2da684, 0x03148f22, 0x4795d346, 0xa679e36b, 0x9e51225c, 0x03d8d2c7},
{0xea5f81cf, 0xeac5be9e, 0x64c12e72, 0x102e16b2, 0xfee282e4, 0xce0bc0d9, 0xa93b28f3, 0x01f05206},
{0xbb6422f9, 0x258e96d2, 0x617c5468, 0x751615d8, 0x6056f032, 0x27145cb6, 0x81c06d84, 0x057a7971},
{0xb030713c, 0xf42231bb, 0x3a96c59e, 0xae9c3f9a, 0xf1ee840c, 0x5397e8e2, 0xf2b87657, 0x05e7deca},
{0xf81f58b4, 0x209745aa, 0x91af248d, 0x74a64310, 0xc04b00b7, 0xe566a8e1, 0x80fb4cea, 0x022bde40},
{0x5de74517, 0x8265b62b, 0xb9b9f2c9, 0x6a788149, 0xa9565d98, 0x6fec2239, 0x573f0c28, 0x060ac0c4},
{0xd3ce8992, 0xc129d0f1, 0x81c43de5, 0x719252eb, 0x48221e1a, 0xfea566de, 0x0be8ced2, 0x050732ed},
{0x2216f1c8, 0x9aae0db3, 0xd7220015, 0x95e231ac, 0x6340df6f, 0xbd6ae160, 0x16a6e39c, 0x0166c8e2},
{0x76b0a92e, 0x3ccd9d2b, 0x7d671a9d, 0x1feb39d7, 0x2109fd56, 0x3c49a630, 0x5d4ec292, 0x07badc4b},
{0x5dd8c4c3, 0x081c3166, 0xec14ba21, 0x9dca12d8, 0xcf93b2e5, 0xf58069e2, 0x571ddc34, 0x02399005},
{0x08a616fc, 0x65a19cf4, 0x8aea6ff7, 0x860d442c, 0x6896a559, 0x4f24ab19, 0x3d7f5ae6, 0x0685db92},
{0x622478c4, 0x051093f0, 0x3fab8962, 0x5c200627, 0x21254c39, 0x2aa7ae1b, 0x7b116fb9, 0x0100fff9},
{0x00637050, 0x2693b834, 0x22440235, 0x3fef7c1b, 0x3481c4fe, 0x31150ac1, 0xf261b6de, 0x0772cb7a},
{0xd990d491, 0x6966804c, 0xc7505f35, 0x46aba1bc, 0xaceeb7f7, 0x4f696cba, 0x6474b8f0, 0x02b73cad},
{0xf39cd3e8, 0x7d13e948, 0x62a1db76, 0xd5c33593, 0x4d1be159, 0x7fd3b59b, 0x3676644e, 0x066d3f61},
{0xb3bd8b7e, 0x5a896ef3, 0xba5762ab, 0x2319450a, 0x1a545f8b, 0x226f0a07, 0x55446d35, 0x02760973},
{0x140e5623, 0x38eaa186, 0x94be15ba, 0x5a48d469, 0xad75d32a, 0xe4f1f15b, 0x2f14e2f1, 0x039ccdaa},
{0xe6fcfdb2, 0xad7108d3, 0x9c9f7f04, 0xfadfc050, 0x9df95366, 0xdbb20071, 0xe555c739, 0x02c4d3fa},
{0xc3111bcb, 0xb640956f, 0xbb11fb86, 0xcd942bbd, 0xa3db81cd, 0xa4b4eb09, 0x684fdb65, 0x041ed5ed},
{0xdd5ca525, 0x462b41fa, 0x153c3d28, 0xbcc17ccd, 0x6b06db5c, 0x8a81d137, 0x4a050358, 0x05f5cf39},
{0xcc60fb85, 0x374012a6, 0x34d1905d, 0x978f9785, 0x4e17ff38, 0x713383d4, 0x1055c25d, 0x07f3796f},
{0x0643771f, 0x852ba56e, 0x86781a31, 0xadfa956c, 0xb26a3811, 0x2ee2fccf, 0xdbd56ba7, 0x009214ce},
{0x68bc148c, 0xe2bf6c4b, 0x01c203ce, 0xd38dbf38, 0x97923b55, 0x27f73df4, 0x5081f7d9, 0x030a2e81},
{0xf11422a0, 0xbe23b78f, 0x99cdc2e0, 0xd4f3510d, 0xaa13ffe5, 0xcb05b3da, 0xc724e0c5, 0x028d98a5},
{0x96934000, 0x15277271, 0x588c8a51, 0x8013dd5e, 0x9ed55af8, 0x77772f7c, 0x03549e60, 0x020895f8},
{0x34db29f8, 0xc0cc8556, 0x67455b5d, 0x5582a9ff, 0x8a9a38b5, 0x12862a43, 0xa59fd242, 0x059655bc},
{0x94ceaf98, 0x39bc5131, 0xc71ccc0d, 0x99f4d1a0, 0x54acb87c, 0xc565794d, 0xc33590ef, 0x0593fcef},
{0xe97bf51c, 0xa2922d09, 0x3200d367, 0xdbb866a2, 0x4ad9302d, 0x05849ed8, 0xdf93f2b5, 0x000c447e},
{0x850fb317, 0x2755d6c2, 0xd45eb3f5, 0x36feeeea, 0xdfbc1d97, 0x4f4471d7, 0x4e3003f8, 0x07ec8926},
{0xb6a791f1, 0x38b8dc2a, 0x27a1bbb1, 0x79d6de48, 0xcad54cf2, 0x78c40b06, 0xa43bc898, 0x036dd150},
{0x1cc4133c, 0xefa72477, 0x477d39be, 0x5327d617, 0x2c5db3a4, 0xfd1de1f9, 0xc9a18a1c, 0x0147819b},
{0xf8133966, 0x275e6b02, 0x87969b48, 0x82bc79b9, 0x5d1e2f0e, 0x85b1f9bd, 0xc819531b, 0x00f9ea29},
{0x120edfab, 0x9e0392a5, 0xe3681a15, 0x07403ad4, 0x8a1c3817, 0xa8d469d8, 0x89f15c6f, 0x0395e7fc},
{0x641826ac, 0x7f405a9f, 0x6861e2ce, 0xa566e755, 0xba82a050, 0x8a3a08ba, 0xea63598d, 0x071dd923},
{0x5f65c188, 0x1d2b7538, 0xd6fc9625, 0xcb704d0f, 0xf59deccc, 0x18729111, 0x52fe1979, 0x07595020},
{0x8a08756f, 0x0175aa1c, 0x7fa7c6c4, 0x9a76a312, 0x6e93f6f3, 0x0bfa523a, 0x258c2f23, 0x03d70de4},
{0x8229376d, 0x8a0b9d02, 0x2c65c94e, 0x08421430, 0xd34b0aa6, 0x1160b441, 0xbbfb9491, 0x03b9eb75},
{0x827caf53, 0x91874856, 0x37e8a006, 0xdfdcae7a, 0x04e3af6b, 0x6dcfc3f2, 0xba66ff37, 0x0592823d},
{0x72fb8b0d, 0xb0a6628d, 0xa72b1f03, 0x7d3eef8b, 0x8dd54dbe, 0x5be965ba, 0x96d1fe4c, 0x0114a278},
{0x06051d55, 0x0256d8e6, 0xb9fa9dcc, 0xbf152353, 0x44140d6e, 0x6ef2c68c, 0xc9c0fea6, 0x015f291a},
{0xed992efc, 0xa1826724, 0x771da991, 0x9a58fd99, 0xd0b370a1, 0xce51a153, 0x826df846, 0x03c53bf5},
{0xcc7bf8c3, 0x3909aad7, 0xb08ddfa2, 0xd408ae7d, 0xff94d9fc, 0x2e9ab5d6, 0xf11cbcf6, 0x0020a1b2},
{0x3e257b43, 0x448fff07, 0x5fd9edca, 0x00f4a128, 0x7b429f71, 0x6f8987e3, 0x0fc8b522, 0x013336c1},
{0x062bd860, 0xef78ac4c, 0xf5d787d2, 0x6539ee52, 0xbb65576e, 0x113b6071, 0x9f3d7f85, 0x0160e952},
{0xf966d24e, 0x0c4e7c07, 0x318277e8, 0x011853d8, 0x7c287f58, 0x93bae650, 0xf64289f7, 0x00b974a1},
{0x30408cb9, 0x66d19420, 0x0430b017, 0x709ca6c6, 0x23d95951, 0xb174ad46, 0x111f4192, 0x030762f8},
{0xf246c901, 0xb9d70015, 0x57a1cdec, 0xd3616cb1, 0x0d732fdb, 0x61aab25e, 0x12d620d8, 0x0712858b},
{0x16334e1a, 0x8ec7e113, 0xa96aeeab, 0x0021a55b, 0xfd639175, 0x8f4c1366, 0x69bc866a, 0x07acdde9},
{0x23088fc7, 0x1fb24e5e, 0x92a88089, 0xcacd65df, 0x17343c48, 0x103ec3c8, 0xc387a3b5, 0x03d296b9},
{0xcd9fedee, 0xae703c5b, 0x7853b30d, 0xd0c3e0c6, 0x12abaef5, 0xc1e326b3, 0x5d57bb23, 0x04f42d7f},
{0x1824b92c, 0x19cd1b4e, 0x81ebc117, 0xc5daaff4, 0xb8183a1d, 0xeeedaa59, 0xe28baf8a, 0x069d8f0c},
{0x9dc50729, 0x9733e8df, 0xf1b9f411, 0xd7e0dbb9, 0x50edf7ea, 0x59e4dbd2, 0x4059cb5f, 0x002259fe},
{0xb79a92b1, 0x5e3197fc, 0x59086db1, 0xbfddf5c5, 0xdbea4a69, 0x234d8639, 0x4d0a367d, 0x05dd79b0},
{0xa86eec0c, 0x8cc1d845, 0x573b44d7, 0x3cac8839, 0x7b0de880, 0x8b8d8735, 0x68c99722, 0x01c5ef12},
{0xc2ba0f23, 0x12680395, 0x471f947e, 0xd43bcf85, 0xcc9d9b24, 0x19935b68, 0x108eec6a, 0x06263e1e},
{0x5b7be972, 0x29617bad, 0xc55b1b68, 0x0ab73eef, 0x2544381e, 0x07f12359, 0x63a080a0, 0x0161444d},
{0x312f9080, 0x07a4b921, 0x2f530413, 0x64c25a07, 0x7d71ca2f, 0x3f6903d7, 0x04838ba1, 0x06917cab},
{0x10bdb6cc, 0xec7cfc1f, 0x3bcf85c7, 0x7046910d, 0x7bc3ff5f, 0x7ef09e22, 0x385306d4, 0x004b0b60},
{0x3a41158a, 0x82d06d78, 0xaa690d1f, 0x37c4a361, 0x7117c44a, 0x700766e1, 0xab40d7e4, 0x031261d0},
{0x91b88258, 0x384c5e8b, 0x009b84dc, 0xd777abd5, 0xe7eed224, 0x02102b55, 0xdbefe5e9, 0x03b22830},
{0x8770a4be, 0xec982f60, 0x961f56ad, 0x4b92533d, 0xf428c4b9, 0x7df85fbb, 0x2d9291a4, 0x057e4876},
{0xf4910a60, 0x6ace9477, 0x9fc63b7f, 0xdb5a705f, 0x72328369, 0x4cc157b4, 0xc282db6f, 0x05b8acbc},
{0x57269216, 0x4c69edd9, 0xbfee24ac, 0xd04f1eeb, 0x2a069b18, 0xacda8418, 0x5990b523, 0x03761a4f},
{0xc608d246, 0x7f2e2048, 0x4664959b, 0xd4f52ed2, 0x11c1d565, 0x354e3bf7, 0x457eabd3, 0x0156d837},
{0xd455f483, 0xea8cbefd, 0x5d940684, 0x33cd5725, 0x8091a287, 0x2d89a777, 0x939b3ef3, 0x06159e4a},
{0x4fa405aa, 0xe43439f1, 0xdbe5763d, 0xa258cfc7, 0x78d7b607, 0x9491173a, 0x9ad23eac, 0x01775d66},
{0xd772d637, 0x2413e92c, 0x5eac4588, 0x22c99c9f, 0x71a0cdd2, 0xa2bd1d06, 0xfdd73a36, 0x05e88acb},
{0xb2bfa1ad, 0x68886b35, 0x35d2dfb6, 0x7a969b62, 0x9767a44a, 0x359ddb45, 0x52e5da6d, 0x00f1a46e},
{0x1c5a4861, 0x4ef9fe94, 0x1c841a89, 0x1540cf67, 0xa9bed4f5, 0x8b51336f, 0xf63c32ab, 0x0240fc41},
{0x87086e50, 0x7f5c626d, 0x049c46e2, 0x38ec0386, 0x0c597ea7, 0x30b003fd, 0x6660a912, 0x07a8faa1},
{0x7dac5d19, 0x2810d2b4, 0x80339f39, 0x040470c4, 0xc946ab30, 0x30d97769, 0x52667151, 0x019fa1f9},
{0x5e7c57a2, 0x00e13c8e, 0x2a0fb7bd, 0x95490ca0, 0x08451e35, 0x6af2b76d, 0xcf78c579, 0x04c3a3a1},
{0x55e39071, 0xa848b2f2, 0xf132ce21, 0x6831da1d, 0xe080e2ec, 0x439bdda4, 0xadd19a7d, 0x06680f09},
{0x6be27786, 0xfebd2a8b, 0x093a5a7f, 0x2cdd8f78, 0xdcb004b3, 0xbc0746a1, 0xd12450ed, 0x005f950a},
{0x39759f39, 0xe1462ca6, 0x7bbe087d, 0x0c37dca2, 0x0c8661cb, 0x198de347, 0x7e531b52, 0x03602655},
{0x66d7eb25, 0xaf24ead2, 0x5ee6eb03, 0x27cea560, 0x4f6267c7, 0xe9aa6d50, 0xe5dd28e0, 0x00c962b1},
{0xb11706c9, 0x3c3407a5, 0xcf0e1b88, 0x44370686, 0x9fbda5e3, 0x5d0e7af0, 0x41cf0a6b, 0x010d235f},
{0x358cfcc2, 0x1fbc42a3, 0xc78f7dac, 0x5a2e6ea2, 0xa12773f2, 0x33e089ca, 0xed7788c1, 0x04bef156},
{0xbea42f88, 0xdb150649, 0x5f3fb72a, 0x71329f69, 0x86b82de7, 0x7aa46ad0, 0xc6093912, 0x07913b17},
{0xb3b67067, 0xb2b074ae, 0xc55f4455, 0x4f17674d, 0xdeb0740d, 0x9a112816, 0x316cc0d3, 0x06bd0cde},
{0x1a264ab3, 0x962ceb6b, 0xd99f7159, 0xd5930255, 0x24a4096e, 0x7db961b0, 0x3e50dfed, 0x050c8e5c},
{0x443af109, 0xc3eebe54, 0x86946633, 0x2ca03fcb, 0x04badff6, 0x6e6eef04, 0x82210754, 0x05d92ab7},
{0xa5c0dca4, 0xcbadd8ad, 0x5ac103a0, 0x4cf688cf, 0x26e5d435, 0x571dbdb9, 0x220fc7db, 0x074ffc4d},
{0x88740c3e, 0x70b80432, 0x03821aa8, 0x4a959d50, 0xe4df06d8, 0x3eb8c3a0, 0xcac57496, 0x025a425b},
{0x55205413, 0xdcadfd29, 0x90b17b01, 0xda7456d2, 0x73696a28, 0x437c2fda, 0x329f6855, 0x00a8a188},
{0xa828431e, 0x3cde2cdd, 0x9ed29340, 0x60e6c362, 0x7c13e145, 0xef00dfa9, 0xba288c0b, 0x04159bec},
{0x9065f8ee, 0x41d351cd, 0xa4845868, 0x4e2e298f, 0xbdb3834a, 0xbcba6ac1, 0xea85f2ec, 0x042c8871},
{0x1fda880f, 0xc4dc0d20, 0x26fc2d5c, 0x4f0f9dc4, 0x86839de7, 0x2c555343, 0xf698dd8f, 0x04d12da8},
{0x21bd655a, 0x3a6299bd, 0x8cfd772f, 0x2e4aea22, 0xd2c2590d, 0x09716ad9, 0xb298587d, 0x053b143c},
{0xa95e3cbf, 0xd35f3e32, 0x04eac3cf, 0xe380dee7, 0x0f7e3e6b, 0x27e6570a, 0xbed46774, 0x008cd288},
{0x9583f023, 0xe42676b0, 0x75cfaa7e, 0x39d57dd6, 0x4f0bb727, 0x10d4a8d0, 0x27c81bdd, 0x016b03c9},
{0x4decc603, 0x89b394f7, 0xd24690f4, 0xd7322ee9, 0x947a00fd, 0xbbc12961, 0x82e8fa75, 0x00886d23},
{0xeb0faad4, 0x7b48a33b, 0x60e0b0c8, 0x4c11ef26, 0x36f0f791, 0x4163a401, 0xa4074faf, 0x07986fea},
{0x31d9587e, 0x96044919, 0x9049fd2d, 0xb1cab341, 0x9c0eea09, 0xf28c83c9, 0x5c6620aa, 0x033b74dd},
{0x13ee028c, 0xde558d16, 0x5d4233b0, 0x4dcf3932, 0x2e422803, 0x7bd46887, 0xe1261bff, 0x04b4757d},
{0xd48e9b00, 0x6c80848f, 0x10b6a121, 0x937c1e6e, 0xe9f2008c, 0x7782f8b8, 0x2bc7171c, 0x00217358},
{0x324228d8, 0xba523265, 0x682ee17c, 0x4ebe5506, 0x3be009f9, 0x6c646fe8, 0x8594b924, 0x046de7bc},
{0x3b50645a, 0x270aa33a, 0x2a9c6282, 0x28fd23fd, 0xcfe96515, 0x5b2fa771, 0x3f812377, 0x063039de},
{0xaba4060a, 0xa1da52b0, 0x0374be67, 0x7f191fd6, 0x0d7d2126, 0x14c64d05, 0xf7f77381, 0x00419cb7},
{0xe4b19319, 0x07eda692, 0x0fef654e, 0x6190d3f6, 0x0b21ca7e, 0x893b0916, 0x073c48b4, 0x0367a3c7},
{0xc520e3ea, 0x8fd405b2, 0x487e93c9, 0x73b4f714, 0xd5142cff, 0x70b7ee88, 0xa320eca2, 0x058fb800},
{0x72ef3623, 0x3b5a8740, 0xaff370fd, 0xbff4af42, 0xe338258e, 0x64c137b0, 0xc7afafca, 0x05ac9917},
{0x82ccc89a, 0x99c46a0d, 0x9ff87868, 0x05ae3209, 0xa489481f, 0x6249b2a4, 0xbaead348, 0x0056c235},
{0xba0ea95e, 0x5a0640f3, 0xc03af976, 0x518db5cd, 0x5a250a06, 0x1c3223aa, 0xbc3442eb, 0x0397b942},
{0xacf14a4f, 0x164f0705, 0x33eb6c0e, 0x386c2325, 0xd7264573, 0xdfaceff6, 0xd1e22f80, 0x00e94509},
{0x9ff51bc7, 0x8964ee48, 0x57bbca04, 0x3e0f5037, 0x6510630c, 0xe78d6c8d, 0xdf0a61c1, 0x041d6351},
{0x45aa1b58, 0x47892f3b, 0x915c1c70, 0x5a1787ba, 0x67f20d25, 0xbaa23359, 0x0c4bc4be, 0x00e1919f},
{0xb9975332, 0x2a87c37a, 0xcdecebc9, 0x95db523f, 0x1d0db226, 0x703949ee, 0x4c3842dd, 0x03152c1d},
{0xecfb6f72, 0x0eff7e6a, 0x9493a628, 0xb3a83455, 0xd596cd51, 0xced58dd1, 0x25ee51ff, 0x033dee78},
{0x72a30547, 0x1f4047ca, 0xd40b6d0f, 0x9feefa06, 0x94db1b38, 0x836ffd80, 0xa0992ed5, 0x037c79f6},
{0xceb3dffd, 0x7ffa095d, 0x768e2cb3, 0x23097a65, 0x373f6222, 0xd228b1f9, 0xc57feea2, 0x06309a6b},
{0xecd4c6f7, 0x7a5bead4, 0x7e70f7de, 0xab92043c, 0x220db8d8, 0xf78f890e, 0x2865a07e, 0x052eeb98},
{0xdf253531, 0x8e9a6336, 0xbafa937b, 0xb24b664a, 0x303b1f5a, 0xc89f660e, 0x876bd8c7, 0x07ea9749},
{0x1d4c3fec, 0xd958e726, 0x06fbef31, 0xa5eb368f, 0xba6a027d, 0x0c911679, 0x5f80f992, 0x06321b51},
{0x046b49b2, 0x3ca61d9e, 0x6aa9c29a, 0x616a47d6, 0x9e9462dc, 0x27a7ffeb, 0x8971b70e, 0x0794ed38},
{0x9f47496f, 0xdb259a57, 0xa6b0481c, 0x7f3e3f90, 0x4afab47a, 0x76f42726, 0xc5a79505, 0x07b9da96},
{0x57e7aeed, 0x908e6450, 0x81648127, 0xe86db2fb, 0x8dd76882, 0x53f3c573, 0x72327da6, 0x02b37324},
{0x73a220ec, 0x82a941c9, 0x7f25beea, 0xb4cbecb7, 0xbfb061d6, 0x746ded71, 0x641b3f3d, 0x00f7af27},
{0xcbd4ba67, 0x69b8f4df, 0x3d526981, 0x5ee3ac6f, 0x145cef8c, 0x9372af4e, 0x72a31ef1, 0x05cc1cc6},
{0x62d1ba57, 0xce898b0d, 0xee3fa47e, 0x86ba0504, 0x4395b70d, 0xc68233b1, 0x80eb8d60, 0x024cfa58},
{0x74d51c41, 0x8fa83850, 0x60f8f9da, 0x5824a285, 0xaf1bea48, 0xa7a2067e, 0x5455acc3, 0x04ba49f2},
{0x324c6039, 0x0a1e223e, 0x7b18a9d0, 0x28312228, 0x88b6ecda, 0xb60c1f93, 0x687ba365, 0x053097d8},
{0xa7dae551, 0x5604b398, 0xe2e11609, 0x51f02e33, 0xe58e2094, 0x0b51a085, 0x3a3ecc28, 0x078679d6},
{0x92d52444, 0xe24b5528, 0x33d0fa70, 0xf77e35ad, 0x9bcbfb57, 0x8af5a7b7, 0x022748d2, 0x015c5f15},
{0xc993b168, 0xc002185c, 0x293ad856, 0x5586addb, 0x8ec50726, 0x69c1bfcf, 0x5fd97ea1, 0x00d514fc},
{0x8866c747, 0x52d7a9a2, 0x01d6ee05, 0x9bd77465, 0xc3a87a88, 0x576adf96, 0xfa69f0ec, 0x0693e89a},
{0x05903be3, 0xcfe50d90, 0xcf739179, 0xbe651dd1, 0x2ae70678, 0xba80ffda, 0xb55b06cc, 0x051dbe40},
{0x5585a6f0, 0x4adb5947, 0x9fa37e68, 0x14634b99, 0xa2a910a8, 0x27da5fbf, 0xa99c704d, 0x022a91ce},
{0xe2ddaacd, 0xfabab7b8, 0x60cf9603, 0x1edf6a83, 0xbfadddd3, 0x20b04218, 0xa81dbffa, 0x03e0ddb6},
{0xda25c9fd, 0xf9c1e3a3, 0xac57ece3, 0x41ff4e1e, 0xdd684055, 0x9ba50868, 0x46d8156a, 0x01b30314},
{0xab76a462, 0x30e067cc, 0x08f1b99b, 0x2d84c4c2, 0x73edc56f, 0x6b399ae0, 0x62cfacb2, 0x02f187e1},
{0x34fc5356, 0xb085758e, 0xf805fedf, 0xbafe9a1c, 0x95272d01, 0x0bcf423c, 0x1feca651, 0x01df4a81},
{0x4c264e97, 0xd3bd9833, 0xc08b1798, 0xc0b192be, 0xdc3ed49e, 0x42724e80, 0xbaee9a58, 0x04100303},
{0xe49749c9, 0xb653c919, 0x09f8e2fc, 0x07dbe557, 0xca71e551, 0xbb172d28, 0x7989c8fd, 0x07f5f801},
{0xdf1d9004, 0x9412a9f3, 0xbe90d67e, 0xddcf6d66, 0x4692f803, 0x1dbfd679, 0x524c2944, 0x04f4fae1},
{0x5707d134, 0xd413afdf, 0x887fd7e9, 0xf8a339cf, 0x84883580, 0xf74544f4, 0x851739e0, 0x0554f72a},
{0x59824907, 0xe3827564, 0x421182c9, 0x352eab2a, 0x8f8530f2, 0x19138257, 0x20275950, 0x04e3bf44},
{0x33f928b7, 0xef7660f9, 0xf5952362, 0xb7cb0619, 0xf17eb8d7, 0x5b24913b, 0x8e8b8082, 0x00f4804c},
{0x5bd84f3e, 0xe7020613, 0x736a1659, 0x7ee777e1, 0x0795844b, 0x34ca7cb6, 0x7503ddc3, 0x07ce12e4},
{0x6d8408a5, 0xbbbafb3f, 0x519dadca, 0xe0f02915, 0x0670f5d4, 0x5acba199, 0x4a93340f, 0x0056db45},
{0xe404f6c5, 0x73f8a435, 0x01731858, 0x68cd3f7a, 0xd01f3de9, 0x214d3134, 0xd5d75a88, 0x05fb76be},
{0xf976eb41, 0x3a66ad86, 0xcd08787a, 0x6401b6d3, 0x7d1e82a8, 0x575950f3, 0x55ee9d49, 0x00e34b33},
{0x0cc5cbf4, 0xbff2f4e6, 0xec205dcd, 0x5a6b430d, 0xc94862af, 0xa8114ab3, 0x2fe8be1f, 0x0247ecf5},
{0x8b98bf40, 0xded3bc57, 0xe26b66b3, 0xb658c8c4, 0x8d4220db, 0x8bd91c55, 0x94d2adea, 0x00d109f2},
{0xedeaec42, 0x0fbfd336, 0x5d407ae8, 0xd94f928d, 0x727e74b5, 0xe5e4a16b, 0xc8c22dd8, 0x06a550df},
{0x135e0ee9, 0xe378a012, 0x856a1aef, 0x5be86512, 0xd8febe77, 0x7de04ce2, 0xea43d59b, 0x03ddeed6},
{0x005a1d86, 0xc04dc48c, 0x6f29053d, 0x64f4bbd2, 0x9be0aef5, 0x10b1b3db, 0xcc625a0b, 0x03745ca5},
{0x1f4f0e85, 0x6c72bd40, 0xc2069cba, 0x4234afd0, 0xb99395f4, 0xc25b262f, 0xae0874e2, 0x0605f6a2},
{0xdd756b6d, 0x9513e0d4, 0xf0c137cd, 0x5127a167, 0x7f01c538, 0x1a12a425, 0x00a4483b, 0x068b3aaf},
{0x79bc6c86, 0x7a5b3e70, 0x375dc240, 0x5a337909, 0xe111d6ce, 0x46d6fe3c, 0x2ff2ca50, 0x02708b05},
{0x1524ad8c, 0x1181eb95, 0x52294490, 0xd0744ddc, 0x848605cf, 0x88ed5b7b, 0xb478c12a, 0x04b9cb49},
{0x27105dae, 0x98cb2411, 0xed5c1361, 0x3efa8fae, 0xd498e337, 0x6fa736a5, 0x1e369b4f, 0x038e3b07},
{0x98c8db7f, 0xbc5915ae, 0x50425ae8, 0x1f3c8f96, 0xfa86658a, 0x77d60416, 0x28ec2dda, 0x02bc8b30},
{0xb94bc10e, 0xad6794f2, 0x7e80093a, 0x7463b3f3, 0x90db4c79, 0x7bf5af53, 0x965c0cc4, 0x031531c6},
{0x7cc1083d, 0x66425289, 0xa45d785f, 0x778ba471, 0xbbc94c16, 0xe3f5c599, 0x9b92e036, 0x02606413},
{0xcf287faf, 0x191a2ea9, 0x823ddf07, 0xe6406a78, 0xaabe912b, 0xabcf2825, 0x7c48649a, 0x021dab44},
{0x65375f6c, 0x9465d77c, 0x65370520, 0x924e189c, 0x918f0105, 0x8be0ca5f, 0xb1925509, 0x07586d27},
{0x9302ac44, 0xe4fa93cb, 0xbf87d840, 0xf381ebbd, 0x44793049, 0x5027e7d9, 0xd3f09392, 0x0230b5c3},
{0x31d48a82, 0x123e992e, 0x729d40e2, 0xef2990c6, 0x0f331903, 0x946813e3, 0x112a2c4d, 0x022f575e},
{0xd4ee8cf7, 0x4b44764e, 0xdb576ebc, 0x4d44cff8, 0x0ab93ba1, 0xc6185d3a, 0x7e3f1e78, 0x0520c2d3},
{0xbc46b8b4, 0xd9446736, 0x91e2ede1, 0xc7776293, 0x87689930, 0x0323845f, 0x379293ae, 0x061e359f},
{0xb49b3a0a, 0x767a1747, 0x2b58f45e, 0x17e69346, 0x1425ad98, 0x10820519, 0x1b487ae5, 0x0367f384},
{0x92f8ac25, 0xe0407696, 0x2beb71a6, 0x9ca9d269, 0x2f0c2471, 0x914017ea, 0xf421a10d, 0x07709cc3},
{0xc3bb6a8f, 0x2c8ed622, 0xa2a1a8f2, 0x31c57cb6, 0x4bf6c316, 0x053924d5, 0x09563089, 0x0727b76a},
{0x09dc6b5c, 0x567be37f, 0x9476eb5d, 0x57e36f45, 0xee5be5b6, 0xf68488dd, 0x2884c2d7, 0x05ac1ff1},
{0x04173760, 0x0fc5b934, 0xda828f00, 0xe43272df, 0x2fad6e9c, 0x7e2ab5fe, 0x0a4995b3, 0x00e0a5eb},
{0x42f8ef94, 0x6070024f, 0xe11a6161, 0xad187148, 0x9c8b0fa5, 0x3f046451, 0x87529cfa, 0x005282db}}};
static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
{{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000011, 0x08000000},
{0x0becc81e, 0xd59e99d9, 0x537cdf25, 0x3ad971a9, 0xbcd60738, 0xaccedf99, 0xd65d66b5, 0x01dafdc6},
{0x4bc9ca34, 0xc8e6df6f, 0x5397aaca, 0xab8bfbc5, 0x94813e6e, 0xb5ea6773, 0xe295dda2, 0x0446ed3c},
{0x8145aa75, 0xd7981c5b, 0x3d174c52, 0xb14011ea, 0xe4721c1e, 0x647c9ba3, 0x6f6ac6dd, 0x05c3ed0c},
{0x6e0bef41, 0x9de8c5cf, 0xcee1b9b0, 0xec349cbb, 0x2121589c, 0xfe72ab05, 0x24c7669c, 0x03b1c96a},
{0x246766d8, 0xb878549e, 0xb5a03ab4, 0x8c5d8531, 0x7f1ec75e, 0x334a83ab, 0x46b146d7, 0x01342b29},
{0x31055652, 0x8c71bd50, 0x6081f8c3, 0x2eedac49, 0xab013740, 0x25164a76, 0xbca84bf7, 0x05c0a717},
{0xd0a6b4f5, 0x1ad37af3, 0x8ca50294, 0x6dc49fe3, 0x5d9529c3, 0x8357a7ff, 0xcefe8efe, 0x02c161bc},
{0x296fbf1c, 0x90a5fa7f, 0xc977b113, 0x18226a39, 0xc178262e, 0x9362d5c9, 0x40d28de5, 0x03a362d3},
{0x125ca33a, 0x04eeb1c0, 0x8437c604, 0xaa47a4c0, 0xa4d6bafe, 0x064426a2, 0xb8cc76db, 0x00ffbb44},
{0x179e2ebe, 0xecf0daf8, 0x2574403b, 0x942e643e, 0x6bf06f7c, 0x684d31aa, 0x244c675c, 0x003b2bde},
{0xfeccfccc, 0x96bc19dc, 0x269130b4, 0xbb26f74e, 0xd511649f, 0x15d57a9f, 0x7dcde3c3, 0x02d852a4},
{0x44ad0610, 0xb4a47f4c, 0x06fa1b55, 0xdc2f028f, 0xd25979ac, 0xd73ddcd4, 0x076e7f5d, 0x06ba7cbe},
{0x349eea63, 0xb0f43dd2, 0x3e64660d, 0x5e64466c, 0xc3bb94ce, 0x7206f426, 0xed4327aa, 0x036cb7c6},
{0xf248b36c, 0x6503e80b, 0xe36060ec, 0xb93dd56f, 0x95c2c067, 0x6d3b2763, 0x155023a7, 0x038e7d59},
{0xcdf92351, 0x140437ad, 0x2a5ab630, 0xb7a6e1b4, 0xd48175a5, 0xaa80b742, 0xd4afae89, 0x06a50046},
{0xaea51997, 0xe8cde2cd, 0x417e3754, 0x612806f6, 0xb940adf4, 0xe40a4a07, 0xa33929b2, 0x063f5efa},
{0x0c07573f, 0x0c0926df, 0xd8d4bee3, 0xa84e9027, 0x6bcd79ea, 0xf3776dfa, 0x523f55a8, 0x043a8517},
{0x66984d05, 0x5b7e4e45, 0xdb8c30c4, 0xb9381de7, 0xae86e4f6, 0xd7c15128, 0x809daae7, 0x0718f1ad},
{0xc1eae1a6, 0xe4fb0a7d, 0xa90a0813, 0xe5484134, 0x895df525, 0x24cca8f9, 0x1cedd2ee, 0x035fd390},
{0x82e87775, 0x0a87a942, 0x971f450b, 0x9f2b4b62, 0x8eae6f09, 0x1dc5aecd, 0x1c5686a6, 0x07547fa3},
{0x2e35511a, 0x785975cc, 0xa085c456, 0x4266bc82, 0x3abd5bfd, 0x45cf52e1, 0x7bd95ece, 0x019e8e43},
{0xae580194, 0xfad72a75, 0x2989ac16, 0xf2bb5a00, 0x55f2b4d0, 0x53fee728, 0x9c7a91e5, 0x02b9f95d},
{0x71200963, 0xb0062d2c, 0x1ac57a23, 0xe16e9f91, 0xc4bd9d3e, 0xaae7b169, 0x7f505f35, 0x07462151},
{0x57e31913, 0xcf7bd10e, 0x6a4d0ee4, 0x1a360a91, 0x31869e35, 0xb2ba4914, 0x18005db4, 0x07a62d5c},
{0xb4344711, 0x431f11e2, 0x6192c47e, 0x0cc3049c, 0xeb9c1bc3, 0x375dff93, 0x42071ee8, 0x03a75790},
{0x9ed81498, 0x4eb14251, 0x98b804ef, 0x5852dbc5, 0x56d7f20c, 0xe0c1be13, 0x20d69181, 0x023e7f68},
{0xe34f2d55, 0xf2eeb9b5, 0x2aad6f84, 0x63459f16, 0xbe37dbea, 0xf12099e7, 0x11b1a0fd, 0x06e45493},
{0x0d6c93ed, 0x63032f6a, 0x5a04829f, 0xd99cbcc8, 0x89608b5e, 0x80f20416, 0x9df329f4, 0x00bf4231},
{0x2710f927, 0xc7fc3d1b, 0x90d8503e, 0xc72d19af, 0x9940e689, 0xa9dcd3b8, 0x2da77ac9, 0x06fd386e},
{0x08b27bc2, 0xc800035f, 0x4dfacc03, 0xd98987cf, 0x1256e525, 0x24f8fdbf, 0x1f104273, 0x04c575f1},
{0x256c604a, 0x68b16e90, 0x6eba097d, 0x7f51023a, 0x1aeba9c8, 0x52c7629c, 0x4809d8da, 0x0575e850},
{0x4ac81249, 0x7439d2f9, 0x4fc31ff2, 0x351e4a62, 0xb3906ded, 0x68fb8313, 0x08507a35, 0x007d43d8},
{0x98859a12, 0xa87902b8, 0x73af55b3, 0x2f0d13e0, 0x1b9783c2, 0x5a46c66a, 0x2f5f71d4, 0x01045b06},
{0x604fce1e, 0x0c379595, 0x7fccc2b4, 0x20ab6eb8, 0xf1820ae7, 0xac0bc709, 0x93fb2b07, 0x07e7654f},
{0x246c4bf0, 0xa0e40811, 0x816b15e0, 0xe12accf5, 0x17938138, 0xee417239, 0x2c9a34fb, 0x004e092e},
{0xad2cd984, 0x6304351b, 0x4bf1aafc, 0x38546ca6, 0xf310e99f, 0x1fb81192, 0xb5376275, 0x07e89896},
{0x7b2d141d, 0xe4376a0b, 0x6dac220c, 0xea1795e5, 0xb19e1901, 0xd778ab50, 0xa94c274f, 0x077df905},
{0x16fcd6c7, 0x7039bab1, 0xa6ea1c94, 0x8eececb7, 0x0f122046, 0x84d26ab5, 0x22fd55a1, 0x053c5d48},
{0x72f11f65, 0xd43eb7bb, 0xb2a566d6, 0xfb538785, 0x3f35cbf5, 0xccc2cdc6, 0x7112504a, 0x06df5a9e},
{0x60ce9c30, 0x75efb55c, 0x3c541437, 0x991873ed, 0xdf0cbb3b, 0x37eaedcb, 0xb04c2858, 0x0278d7f0},
{0x1a06866b, 0x5757dd4e, 0x6570fa7f, 0x15c176b1, 0xafe89a1d, 0x9981b57f, 0xee0cb14c, 0x03c57f4d},
{0x503c31cd, 0x3438cd66, 0xc0736d4b, 0x34437e52, 0x2a9d1b28, 0xe825b769, 0x73c06ee7, 0x06955a3a},
{0x5c5e530e, 0xbbf0995a, 0x6569a2f9, 0xdee304b3, 0x5bd1a886, 0x3b9c993c, 0xc9cd050a, 0x00f66017},
{0xee755737, 0x3666e752, 0x74d0e317, 0xa13bfafc, 0x01d2f1bf, 0x17ab672a, 0x0778f525, 0x079dde3a},
{0xed8a25e9, 0x96a003c2, 0x8f347cec, 0x45d258fe, 0x96ea14ac, 0x68ff148d, 0xe148eda9, 0x058f4ec7},
{0xe2a700ab, 0x23baf732, 0x5202a945, 0x6434725a, 0x2e693363, 0xa19a338d, 0xbf2f39c6, 0x01d0ea7a},
{0x3ab52589, 0x5e571cad, 0x92240361, 0xe2916bb2, 0xdff5e354, 0xe6f8897b, 0x2ffa4707, 0x02a62880},
{0xef649a85, 0xaf446c62, 0xed4e461f, 0x14d8072f, 0x59993efa, 0x5a07f4e5, 0x72a3a652, 0x00dc28b6},
{0xf21511df, 0x139299d7, 0x4854ebc3, 0x8914e707, 0xbfd102a9, 0x9f3b5913, 0x3a5af894, 0x009dc24f},
{0x1f4ba4fa, 0x650e1d91, 0x1977bff0, 0x6ba67806, 0xaa9bbc1b, 0xffbdc531, 0x997408aa, 0x057b69b2},
{0x65fb1a91, 0x25c03e81, 0x7fd22618, 0x8682f98b, 0xf46cb453, 0xcad67f13, 0x5a80e5c6, 0x060ca599},
{0x94188f2a, 0xa7978a90, 0xdbb9338e, 0xd5fc8f0b, 0xcbdd84f0, 0xf8387e6d, 0xbbc743a3, 0x073ae131},
{0x0415bbcc, 0xafd00c46, 0x0df4a52a, 0x1a00eb6c, 0x0b96b594, 0x1ec67c64, 0x8e26b699, 0x01cb82a5},
{0x7f740f93, 0xf56319fb, 0x2e2f6ed7, 0xb40d559b, 0x75e19784, 0x63f96f04, 0xc31ba061, 0x06406929},
{0xfa5a3239, 0x22349e8b, 0xb9ca6bf9, 0xe1236395, 0x9b0017a4, 0x76ae5a8b, 0x17b7af03, 0x06cfb4ce},
{0xb51abfe6, 0x34938785, 0x1249edb6, 0x21f54c80, 0xab038972, 0x3bd1cc16, 0xa4a57a81, 0x0636b37f},
{0xf88717cf, 0xfda4a9a1, 0xee19d402, 0xf8fcba35, 0x47c9ba1b, 0x1ac940f6, 0xdd991440, 0x013c0ab3},
{0x3743adf4, 0x5082318a, 0x22440f94, 0x3293bae1, 0x8dd2d761, 0x4c2e6d7f, 0xcdc38c82, 0x07124118},
{0x76198779, 0xb031f8b7, 0x1b6c1944, 0x6742f602, 0x894a6134, 0xa18290db, 0xaba037dc, 0x035289d8},
{0x9f8a9b07, 0x4579e855, 0x4dca3764, 0x1e580662, 0xb8c8ef49, 0xda92152e, 0x8b54508a, 0x0444085a},
{0x34696648, 0x7f670ce1, 0xc05768d9, 0x2f00108f, 0x390fb519, 0x2d00a444, 0x1cd6f914, 0x015c468b},
{0xfe46c5f2, 0x00666cbf, 0x9f7174d6, 0xca4051c5, 0x8e4277f4, 0x1629882a, 0x6ee002a3, 0x00b3f261},
{0xc1dbb4f6, 0x418a2b86, 0x9a6ca270, 0x9f453ccc, 0x1d457b20, 0x1966471f, 0x80fd1319, 0x00b4d831},
{0x1c76c8b1, 0xa12f86a8, 0xc0125e48, 0x2772e424, 0x1459dfb8, 0x8d650644, 0xad06d01c, 0x02128e5c},
{0x3472799c, 0xcc8cc7f6, 0x2f511cae, 0xfbd97f95, 0x5ebbff71, 0xadd8818b, 0x09af0983, 0x00520540},
{0x8ec654cc, 0xcaab5dd4, 0x17ba15a9, 0xc05ad0a7, 0x36300a00, 0x4bda7469, 0x41bb0610, 0x02e486cd},
{0x2d6be8b5, 0x077ba983, 0xfe89eb7d, 0xdd5e728f, 0x63f9c51f, 0xe3c872fb, 0xce639995, 0x01f2f7a8},
{0xaa2ea7eb, 0xd82b1599, 0xa16489e0, 0x1be5d254, 0x173d3219, 0x19cb236a, 0x1fe63b23, 0x007dd45f},
{0x19dba628, 0xa27cc4d3, 0x5fd2e061, 0xf04ac441, 0x9307a758, 0xc7405333, 0x28c40fe4, 0x0103c707},
{0x54662aab, 0xb5129fd1, 0x59158f32, 0x2ec5b69b, 0x12c44eec, 0x6c7e6492, 0xe527abb2, 0x046e7c11},
{0xe32d46fe, 0xb9bf4936, 0xb08ef006, 0xf23ae18c, 0xe6a5179e, 0x5352cc59, 0x5bf7c0b8, 0x0753a621},
{0x9318db3a, 0x19f65bc2, 0x7e3d0014, 0x93ff3f79, 0x6beb580d, 0xf7f93c7f, 0xddd72603, 0x04fdb898},
{0xe184a935, 0xf7e1f88f, 0x1ad510f0, 0x82a0f047, 0x4c9ab6ca, 0xce0f7c44, 0x5104a95a, 0x0552304e},
{0x985bba5c, 0x06615580, 0xf487a1fb, 0x8ccd29a8, 0xeecf758d, 0xb3e15ed0, 0x857ce648, 0x05328783},
{0x6cb042b0, 0x5d1d5a22, 0x0277083c, 0x64375cf4, 0x5fa82215, 0xe8947dab, 0x86932495, 0x05e72829},
{0x8c3e2849, 0x5bf6f46a, 0x4924c8f4, 0x7e40314c, 0xdffd6118, 0x3c74a4ba, 0x2f8de20a, 0x05247cdd},
{0xd0042d11, 0x25a418c5, 0x2f7da60c, 0x1b60ee9f, 0x02c0b69f, 0x61c041ad, 0x15670214, 0x0632d33a},
{0x90e05a92, 0x32b03a5e, 0x78d1e8d6, 0xfb12a1b1, 0x5bc2f5d5, 0xb8af534e, 0xa032918a, 0x05ab4772},
{0x0a711a9d, 0x096878a8, 0x6b083c8c, 0x87d070da, 0x87d06afb, 0x77931578, 0xf3104057, 0x03705277},
{0xdf993e46, 0x502d2374, 0x35baf646, 0xc1cd2868, 0xe30aa213, 0xa61b54b6, 0xbce34b74, 0x02511017},
{0x90a6b9b9, 0xcfb6c51a, 0x8be6ade8, 0x4e0b29ef, 0xd3832d74, 0xa8292467, 0x41ca1e45, 0x02ce7977},
{0x3e672d5b, 0x25ee10aa, 0x28597504, 0xb0e60c63, 0xe263c827, 0x4a8d0567, 0xfadefeba, 0x01f4ec42},
{0xa5a26158, 0x8b4b15e0, 0x88a71cf2, 0xa59b2df9, 0x5d734341, 0xde44f2e7, 0x4db8d2e8, 0x007a18a0},
{0xb4d18100, 0x30fcf001, 0xf8ae0b4f, 0xcdaa5334, 0xe325615a, 0x67017b2b, 0xf0ccbf57, 0x016c6d47},
{0xba937732, 0x66afc115, 0xc20be386, 0x917d4890, 0xa017c59d, 0x5dadccff, 0x986c39c1, 0x043fa44e},
{0x08baa72a, 0xc57ec886, 0x052364ed, 0xe65a4680, 0x85f9a523, 0x0536b505, 0xfe744ee2, 0x03580609},
{0x1bab1ab8, 0x88109415, 0x62f0fa74, 0x02244b19, 0x915618e0, 0x837fcd10, 0x942f12d2, 0x061b83d0},
{0x687b7798, 0x823d0bba, 0x84a49784, 0x5f93174a, 0x2574af37, 0xcfd64159, 0xe108057c, 0x0290722e},
{0x58a66036, 0x900a7031, 0x6153c2ae, 0xcb443378, 0xa6ccdffe, 0x4c48b8dd, 0xa06e955a, 0x049a9211},
{0xea0b9dd9, 0x1b034532, 0x638c79ec, 0x11cba08f, 0x7c5b2d15, 0x16d00728, 0xbb9a759c, 0x05abcbcd},
{0x1552d6af, 0x21b4f60e, 0xbed54865, 0x2f7ea9d2, 0x738befdb, 0x39378802, 0x97845360, 0x02adf76c},
{0x4026bb92, 0x6e5eb2ca, 0xcbed5570, 0x18f3d8bf, 0xb655ac26, 0x2a5fc8cd, 0x3809a1c5, 0x0031cd25},
{0x0ef5e011, 0x2d698950, 0xc018b82d, 0xc0668c45, 0xf520d325, 0xd180ff47, 0xa38122b1, 0x046714c7},
{0x12df2cc7, 0x8dec8a4b, 0x963031f8, 0x5eb84a1b, 0x88525708, 0xb75ad701, 0x07df57bd, 0x02054a99},
{0x82b2f616, 0xe0013d43, 0x7b385914, 0x2ad34c97, 0x11108f4b, 0xc9969223, 0x9c9fad59, 0x0183f639},
{0x06b4dc38, 0xaca9dfbc, 0x962d5774, 0x85596bbc, 0x22f1cd7d, 0xd7023923, 0x2067b180, 0x04d3c939},
{0xe4004173, 0x6d13e6ab, 0xaafe8726, 0x3495d095, 0x33dc3303, 0xa22d3e4a, 0x776d2e14, 0x0276dbb2},
{0x68c539b6, 0xa03f83cb, 0x7b42a06e, 0xfd3fa839, 0xe8d45ac3, 0xea0f1f15, 0xa414b012, 0x061adb94},
{0xb33fb188, 0xd22fc6e3, 0xf723dc18, 0xbebc7978, 0xf6c99f34, 0xa874b584, 0xf67ff454, 0x049beb53},
{0x754bed16, 0x7c247948, 0xe50eac10, 0x4a84bcfb, 0xade97580, 0xc00d65df, 0xca79c5ae, 0x0763d73c},
{0x7aadbe1a, 0x696e27af, 0x9d8e2a1f, 0x113535e0, 0x4c011766, 0x6953003f, 0xbb52558c, 0x0498a75f},
{0x6e09cee7, 0xcf26e897, 0x299b63c7, 0x813a76f2, 0x0939904c, 0x67c02fa7, 0x7e0b9483, 0x045c41a9},
{0x4af5adcc, 0xad979914, 0xc2c7c068, 0x7d9267f9, 0x21b4a0a7, 0xda4fa3f8, 0x3386c423, 0x03f4bcc9},
{0xd1228595, 0xe5fcd634, 0x12fc8b7c, 0x5571b994, 0x244857f8, 0xd50dcd33, 0x263b93f0, 0x060dc1d6},
{0xfee59c89, 0x7040a236, 0x78ceb168, 0x91a4301b, 0x19cdb36a, 0x973b55bd, 0x71008400, 0x06a1c58e},
{0x6af1f351, 0x1d3c7ad7, 0xe8ad24dc, 0x8493c0c1, 0x48d5ffd9, 0x076f9dea, 0x5931555f, 0x00b9b2bf},
{0xeaa5731c, 0xa3d54d89, 0xba84ee02, 0xfcc41a45, 0xcc1cdac8, 0x7c828f73, 0x5bfe9d23, 0x009c426b},
{0x3f1f352c, 0x36fb314c, 0x9feb1120, 0x750a2a5f, 0xd7b06171, 0x3a2f19e8, 0x3b550cd9, 0x06de1885},
{0xb69183f6, 0xefc03237, 0x979ee075, 0xb5a14fc3, 0x2dcb1d51, 0xbf114125, 0xb8eca2d3, 0x062364f7},
{0x95375861, 0x575f1ea7, 0x80cc8dba, 0x30608586, 0xcf7a8f9f, 0x2beca9f5, 0x5fe60da4, 0x00dfc078},
{0x0f86ded5, 0x312928eb, 0xb9c4f0cc, 0x646f5d3e, 0x2fbf14dd, 0x23c69382, 0xc44caa0e, 0x023aae90},
{0x13e16243, 0xa7c92faf, 0x92efd5fc, 0x035a3e75, 0x86a744ea, 0x32f44d08, 0x1ea28333, 0x05b45217},
{0xc41fdf22, 0xb557d203, 0x4bbc8f76, 0x9697570c, 0x81eaf742, 0x3a6a2cb5, 0xb0d03a0f, 0x07f2c08a},
{0x2a18b73a, 0xca806385, 0xdb6a953d, 0xf2015d6d, 0xba5f67b9, 0x51d21a8e, 0x14807dd6, 0x051439d5},
{0xf75051de, 0x7b6e0c13, 0x14dd1aa0, 0x114681fb, 0x0fd95a37, 0x72a1cccc, 0xa39e5bb8, 0x02f29d4c},
{0x116529cd, 0x4808a0de, 0x5b941d1c, 0x1cf38580, 0xd70796f7, 0xc96a451e, 0x3f24e64f, 0x016d083f},
{0x3cf155ee, 0xc71b78d0, 0x0c361b67, 0x0c04a134, 0x7756e4a9, 0xdb546edc, 0x2988eb2c, 0x03474404},
{0xf30cef17, 0x1a0b3585, 0x864abd80, 0x63c1de29, 0xc0687c8e, 0x0c171d6e, 0xc9763a97, 0x0353aec8},
{0x94192fb8, 0x0a2c9cff, 0x1a7f5bbf, 0x27320b93, 0xe5ceeb75, 0x465d2f9f, 0xd78f1cc3, 0x07ce6f99},
{0xe8d1b26d, 0x0f899233, 0xb87a2984, 0xed4b44d2, 0x0bd6354a, 0x0c0712c6, 0xc7032f5c, 0x01eb2a31},
{0x46b03b57, 0xc4c03fbd, 0x785ebbe8, 0x989b0ff3, 0x7f0bcb19, 0x5cada62a, 0xa97557c9, 0x01426410},
{0x96fb0a26, 0xf1d2e82b, 0x1edb9ce3, 0xe270bc10, 0xfc7aaed8, 0x9549cfd0, 0xd90d7c9c, 0x03e8256c},
{0x43ac9984, 0x14eef0ee, 0xa16d6770, 0x2903ff22, 0xa38fbfc0, 0xc66c2690, 0x8755440e, 0x0032a202},
{0xf3601782, 0x46a07cf2, 0xaa71d137, 0x79f410f9, 0x8bcabc59, 0xc320c6f1, 0xf8ab64d8, 0x00a706cf},
{0x8dbd8d4f, 0x8848a9f0, 0x0085061d, 0xeff89e69, 0xfee62fbe, 0x90e634a7, 0x2ffb456b, 0x03983046},
{0xb272ed5c, 0x91ec28a8, 0xdc0cbb77, 0xf8529918, 0x3648d2c5, 0x8f896ddb, 0x74edaf19, 0x0668a86c},
{0x128c9bd9, 0x341d5fc8, 0x6b3241c5, 0x592f87d8, 0xb2cc3c97, 0xf8cba6f2, 0x03f396ed, 0x03463bf1},
{0xafd9d239, 0xcf3ae525, 0xea20b753, 0x06b8b7b9, 0x3408a993, 0xb2be1e49, 0x9f47063f, 0x02bcb200},
{0xa0bd0bc8, 0x7ca02722, 0xb862774d, 0xce8b32ee, 0x5f8da059, 0x424ba5f0, 0x3bb422a0, 0x05c81961},
{0x32fd8907, 0x137dad8c, 0xc95a3a5d, 0x301d5119, 0x8937ac08, 0x144b38c3, 0x39338de7, 0x00e66f0e},
{0xcfc10885, 0xe68b8875, 0x96147e68, 0x4f24d49a, 0x43032c15, 0x5da9e6fd, 0x9bf25e12, 0x061ab0e6},
{0x455c65ad, 0xeab29bbd, 0x2448be64, 0x1c7da0e7, 0x8eedfa1f, 0x8c2c1bcd, 0x698c1197, 0x0400e2d2},
{0x04549c13, 0x335d3e9e, 0xd31585cc, 0x546f0d82, 0xe16dbbac, 0x350d5ed5, 0x113c53fd, 0x05f77544},
{0x7d8f3b7e, 0x6aa75c04, 0x10a641ae, 0xc70851dd, 0x9a0750fe, 0x4d33edd4, 0xcd1b230f, 0x022802cf},
{0xef8170e3, 0x59fa1903, 0x62995788, 0x464a73ef, 0x13369717, 0x338be7fd, 0x52d21278, 0x02e97589},
{0x4856ddd5, 0x3f2deca8, 0xfced10e2, 0x969b10e2, 0x52860ee7, 0x09620dde, 0xb620fa3f, 0x04a169bf},
{0xa03b49f1, 0xd9beb712, 0xe9af606e, 0x0798af09, 0x63e70b9a, 0xe37f9aea, 0xb35abd7c, 0x02542a44},
{0xf6e78973, 0x335d4000, 0x76f1bb23, 0x7bc28fde, 0x1b30e9ca, 0x6cfdc907, 0x0400b651, 0x03ff88aa},
{0x36433eaf, 0xfb862981, 0x4111cfa3, 0x15fdc659, 0xeab2909d, 0x569574b9, 0x3cd80f84, 0x01442360},
{0xe85c4af3, 0xa8ed8f31, 0xe6aaf3da, 0xf7680fee, 0xc5c1772c, 0x2240e931, 0xaebeeb70, 0x04f44f6f},
{0x8846e0af, 0x29de323f, 0x42c25319, 0x33f91593, 0x6cbadd58, 0x863099c1, 0xfd83e5b3, 0x06a603cf},
{0x86c77703, 0x1bdd17f3, 0xe02db671, 0x8cee8e78, 0x0b6dffce, 0xed1627af, 0xa0d9b3cc, 0x04491984},
{0xcb583661, 0x177f8f9c, 0x73d05bfc, 0x54122d0c, 0xebe37b4a, 0xa9231660, 0xd4826038, 0x06e885db},
{0x13c253b9, 0x64cde875, 0x2fbc98a9, 0x8484bccb, 0x4885a9af, 0xbad877c5, 0x0cbc33b6, 0x03007c90},
{0x47cfa357, 0x41eb9173, 0x325309ad, 0xb3f06289, 0xaa85421b, 0x029da7c1, 0x84de4bd4, 0x07b7eb0d},
{0x56b831e2, 0x2c459a80, 0x321aba19, 0x2b99d098, 0xea73c0e1, 0x96237364, 0xe25ed0ed, 0x02f2c638},
{0x9b388bf4, 0xfc8c3228, 0x82cd081d, 0xa4c371e4, 0xc85f75df, 0x11239026, 0x8892896e, 0x01f01c5e},
{0x73457917, 0xce1dde59, 0x16dd8b49, 0xdfdaeb19, 0xbfd17b1e, 0x4289a976, 0xc842870a, 0x05e2cf7e},
{0xc7705532, 0x72faa825, 0x8f7fe8c2, 0xd24bf942, 0xb695e31b, 0xb7403e13, 0xfc85a0c6, 0x02eac9e7},
{0x1ddb2dff, 0xc47638e3, 0x799bb649, 0x78b91a13, 0x552588ed, 0x001800de, 0x9cd9425c, 0x01d0640c},
{0xfb431e10, 0x159891e7, 0xa012b461, 0x2f2fb29a, 0xb3333e5d, 0xc1dca804, 0x9a47200d, 0x05b918ec},
{0x2d5ce760, 0x379119b5, 0xda2ccdab, 0xf9911f75, 0x47b5c054, 0x92b09490, 0x7298d065, 0x0742a31e},
{0x4a73d1f1, 0xe2a1046b, 0xc6ab4d9c, 0xbc85a747, 0xba0701f8, 0x79b0e699, 0xeebc6762, 0x05e5c2cb},
{0xe0c0db50, 0xdc644b37, 0x2b8444d2, 0x26f7f083, 0x63479a84, 0x90acf2e7, 0x90ffe372, 0x0590d880},
{0x83c0fc9c, 0x3dd1aba4, 0xcfb43020, 0x30a1051f, 0xaf5be716, 0x7d1ca380, 0x1ed8aed9, 0x01d56947},
{0x0fa23690, 0x657df8c4, 0x32111be3, 0x61a12fe4, 0xe78236c9, 0xd6cc9942, 0x85e66191, 0x01709635},
{0xc6a054f0, 0x96bf35ed, 0x004113cc, 0x9d1e411a, 0x1ac7a3ec, 0xccdb9bc3, 0xd08016b8, 0x07362425},
{0x9721b035, 0x72744cce, 0x0beb72e3, 0xb87eb606, 0x60870c2e, 0x00c5e70c, 0x685d7c14, 0x029fa4d3},
{0x86e52af4, 0x06d3a7a3, 0x70020878, 0x7b1c814a, 0x52e68007, 0x44373cb7, 0xe403540f, 0x041cf8c0},
{0x76a27949, 0xd5dbc8bf, 0x27d9cd12, 0xb41449bc, 0xa7a667a1, 0x93740020, 0x0fbb4e77, 0x000bf807},
{0x9969cfe9, 0x274ce281, 0x259ec27c, 0x3234d283, 0xe0b44f04, 0x9ff85b71, 0xffcc1006, 0x0298d060},
{0x68ab54f8, 0x5cd8b289, 0x437eaab8, 0x42e3877f, 0x9318bd3e, 0x6490dc61, 0x4e54d968, 0x075b01f3},
{0x7b64243c, 0x73100d65, 0x5c802f82, 0x692378be, 0x88184c0c, 0x00283dbb, 0xab6f4f0e, 0x0442efad},
{0x72015722, 0xbe83b708, 0xe1cdcf0e, 0x2035319f, 0x398347da, 0x2b1b3351, 0x1a14b8dc, 0x061823d8},
{0x378d9803, 0x1090948c, 0x4725c64b, 0x61a558cc, 0x7d7fcd91, 0x9e5bd3b5, 0x57ebda25, 0x061e02a0},
{0xf8324dc8, 0x166b4a3c, 0x38133fda, 0xa25b9d11, 0x917171a5, 0x9d602950, 0x417d104e, 0x0632e48b},
{0x6a61d5e0, 0x03b9f1b9, 0xe59cfbb7, 0xd906b740, 0x7892fbe4, 0x99a93267, 0xad1b8171, 0x06ddc2a6},
{0x67fc3874, 0x6ae4355d, 0xb1ada695, 0x4fa456d8, 0x9f91ac43, 0x4e234065, 0x829d173e, 0x028da309},
{0xfc695c2c, 0x1e08dd18, 0xfa687112, 0x1c0a2fad, 0xffd6302a, 0xeb5ebf01, 0xfd1d10f5, 0x012fd387},
{0x236e65c9, 0x0b907f2e, 0xb1281d54, 0x92ba7a15, 0xc13f1d75, 0x07f0a6ad, 0xcd6d1e9c, 0x05dfe4e3},
{0xc45f33f8, 0xd99cc41a, 0xd373165c, 0xc1c10a71, 0x2ce2936a, 0x6c809230, 0xa0498cf5, 0x018dc832},
{0x7b222ad8, 0x8e881eab, 0xb6194efb, 0xc8b48774, 0x963c6b6b, 0x38452dfd, 0xe4c4e0f8, 0x02847f5a},
{0x2bf4ad95, 0x2950bb4a, 0xdc39ffb0, 0x37f42c9b, 0x101253a8, 0x3814fa42, 0xb67f2ca5, 0x04d4a34c},
{0xa9684ba0, 0x6c40fece, 0x3b13bca4, 0xc7108aad, 0xe7bff9be, 0x98ccc7ea, 0xe9b3b316, 0x048b3a6a},
{0x08390a2b, 0x4d908260, 0x74b070bc, 0xd5a641d0, 0x910015c5, 0xc3b19274, 0xd5a998a7, 0x02ac8e74},
{0x9698d605, 0x8de03acc, 0xa4c9137f, 0x3b8b720c, 0x354faf46, 0x5bbad6e4, 0xfd9e842f, 0x0054c120},
{0xd65aead5, 0x305fa33f, 0x0fe296f9, 0xba02b164, 0x708efc94, 0x64cba43c, 0x8ad7f0ef, 0x034b9ffe},
{0x13c2e8f4, 0x59e1179e, 0xc572f8a8, 0x5d823d59, 0x74003bce, 0x0cfdb6ee, 0x011c179e, 0x00763941},
{0xa47999a8, 0x29b692ee, 0xbfcd80d8, 0x6436c3f1, 0x959768d7, 0x553444f3, 0x583896d4, 0x01d45a26},
{0xc150b3f8, 0x0ce0791d, 0xf493c135, 0x7d3a0c1f, 0x5ede0712, 0x4d37cc23, 0x34fbae9c, 0x036a6a38},
{0x2ca1eb78, 0xa8ee8204, 0x66d8b759, 0xc713a1dc, 0xac061800, 0x1813508d, 0x3b1f0da2, 0x05725ca0},
{0xf2f391c1, 0xbe6826df, 0x232878f0, 0xeb85b046, 0xf7e1d662, 0xf5a96510, 0xe38c2b64, 0x0419a43b},
{0xe69e791b, 0x4b54889b, 0xb5c95ea5, 0xb371eeb0, 0x0b2f26a3, 0x9f53ccca, 0x66f45f71, 0x0040592d},
{0xad2e5d5b, 0x4ced12db, 0x0987b849, 0x5f57b16d, 0xd9ec045b, 0xcab0e2e9, 0x6cfbf4df, 0x03e4e405},
{0x3ecb72a4, 0xd71a1eee, 0x03a13fb7, 0x6bd9f7ec, 0x5877c6c7, 0xb74a54c8, 0xa28236a5, 0x0377689b},
{0x74b3354c, 0x6f558a20, 0x3f776b18, 0xb67f6d10, 0x01165ed8, 0x8c447df2, 0xf3889308, 0x056b8991},
{0x0d306b7a, 0x9482eb10, 0xd441cd03, 0xdd738e0f, 0x2de5dfd7, 0x6d186de5, 0x75fd1833, 0x00781b3e},
{0x77ec28e5, 0xdbc14748, 0xd26e050c, 0x02ceee41, 0x18457c96, 0x8e5aef74, 0x1823c60f, 0x0461a6e2},
{0x2be17c8b, 0x172e551d, 0x49c6a7b8, 0x90e25fa2, 0xa1b3478f, 0x6219e63e, 0xd063a517, 0x00c412f8},
{0x65a9b68e, 0xb136b848, 0x673c6cbc, 0x9a9b7169, 0xf8ec7473, 0x15fa1875, 0x3033a5d6, 0x022d72f6}}};
static constexpr storage_array<omegas_count, limbs_count> inv = {
{{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000, 0x00000008, 0x04000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xc0000000, 0x0000000c, 0x06000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xe0000000, 0x0000000e, 0x07000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xf0000000, 0x0000000f, 0x07800000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x78000000, 0x00000010, 0x07c00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xbc000000, 0x00000010, 0x07e00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xde000000, 0x00000010, 0x07f00000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xef000000, 0x00000010, 0x07f80000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xf7800000, 0x00000010, 0x07fc0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfbc00000, 0x00000010, 0x07fe0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfde00000, 0x00000010, 0x07ff0000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfef00000, 0x00000010, 0x07ff8000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff780000, 0x00000010, 0x07ffc000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffbc0000, 0x00000010, 0x07ffe000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffde0000, 0x00000010, 0x07fff000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffef0000, 0x00000010, 0x07fff800},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfff78000, 0x00000010, 0x07fffc00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffbc000, 0x00000010, 0x07fffe00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffde000, 0x00000010, 0x07ffff00},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffef000, 0x00000010, 0x07ffff80},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffff7800, 0x00000010, 0x07ffffc0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffbc00, 0x00000010, 0x07ffffe0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffde00, 0x00000010, 0x07fffff0},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffef00, 0x00000010, 0x07fffff8},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffff780, 0x00000010, 0x07fffffc},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffbc0, 0x00000010, 0x07fffffe},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffde0, 0x00000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xfffffef0, 0x80000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffff78, 0xc0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffbc, 0xe0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffde, 0xf0000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffef, 0xf8000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x80000000, 0xfffffff7, 0xfc000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xc0000000, 0xfffffffb, 0xfe000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xe0000000, 0xfffffffd, 0xff000010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xf0000000, 0xfffffffe, 0xff800010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x78000000, 0xffffffff, 0xffc00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xbc000000, 0xffffffff, 0xffe00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xde000000, 0xffffffff, 0xfff00010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xef000000, 0xffffffff, 0xfff80010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xf7800000, 0xffffffff, 0xfffc0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffe0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfde00000, 0xffffffff, 0xffff0010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfef00000, 0xffffffff, 0xffff8010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xff780000, 0xffffffff, 0xffffc010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffe010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffde0000, 0xffffffff, 0xfffff010, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff810, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffc10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffe10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffde000, 0xffffffff, 0xffffff10, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff90, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffd0, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffbc00, 0xffffffff, 0xfffffff0, 0x07ffffff},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffde00, 0xffffffff, 0x00000000, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffef00, 0xffffffff, 0x00000008, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffff780, 0xffffffff, 0x0000000c, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffbc0, 0xffffffff, 0x0000000e, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffde0, 0xffffffff, 0x0000000f, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xfffffef0, 0x7fffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffff78, 0xbfffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffbc, 0xdfffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffde, 0xefffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffef, 0xf7ffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x80000000, 0xfffffff7, 0xfbffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xc0000000, 0xfffffffb, 0xfdffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xe0000000, 0xfffffffd, 0xfeffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xf0000000, 0xfffffffe, 0xff7fffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0x78000000, 0xffffffff, 0xffbfffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xbc000000, 0xffffffff, 0xffdfffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xde000000, 0xffffffff, 0xffefffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xef000000, 0xffffffff, 0xfff7ffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xf7800000, 0xffffffff, 0xfffbffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffdffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfde00000, 0xffffffff, 0xfffeffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfef00000, 0xffffffff, 0xffff7fff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xff780000, 0xffffffff, 0xffffbfff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffdfff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffde0000, 0xffffffff, 0xffffefff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff7ff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffbff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffdff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffde000, 0xffffffff, 0xfffffeff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff7f, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffbf, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffbc00, 0xffffffff, 0xffffffdf, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffde00, 0xffffffff, 0xffffffef, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffef00, 0xffffffff, 0xfffffff7, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffff780, 0xffffffff, 0xfffffffb, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffde0, 0xffffffff, 0xfffffffe, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xfffffef0, 0x7fffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffff78, 0xbfffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffbc, 0xdfffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffde, 0xefffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x00000000, 0xffffffef, 0xf7ffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x80000000, 0xfffffff7, 0xfbffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xc0000000, 0xfffffffb, 0xfdffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xe0000000, 0xfffffffd, 0xfeffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xf0000000, 0xfffffffe, 0xff7fffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0x78000000, 0xffffffff, 0xffbfffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xbc000000, 0xffffffff, 0xffdfffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xde000000, 0xffffffff, 0xffefffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xef000000, 0xffffffff, 0xfff7ffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xf7800000, 0xffffffff, 0xfffbffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfbc00000, 0xffffffff, 0xfffdffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfde00000, 0xffffffff, 0xfffeffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfef00000, 0xffffffff, 0xffff7fff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xff780000, 0xffffffff, 0xffffbfff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffbc0000, 0xffffffff, 0xffffdfff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffde0000, 0xffffffff, 0xffffefff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffef0000, 0xffffffff, 0xfffff7ff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfff78000, 0xffffffff, 0xfffffbff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffbc000, 0xffffffff, 0xfffffdff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffde000, 0xffffffff, 0xfffffeff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffef000, 0xffffffff, 0xffffff7f, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffff7800, 0xffffffff, 0xffffffbf, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffbc00, 0xffffffff, 0xffffffdf, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffde00, 0xffffffff, 0xffffffef, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffef00, 0xffffffff, 0xfffffff7, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffff780, 0xffffffff, 0xfffffffb, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffde0, 0xffffffff, 0xfffffffe, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xfffffef0, 0x7fffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffff78, 0xbfffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffbc, 0xdfffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffde, 0xefffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x00000000, 0xffffffef, 0xf7ffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x80000000, 0xfffffff7, 0xfbffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xc0000000, 0xfffffffb, 0xfdffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xe0000000, 0xfffffffd, 0xfeffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xf0000000, 0xfffffffe, 0xff7fffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0x78000000, 0xffffffff, 0xffbfffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xbc000000, 0xffffffff, 0xffdfffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xde000000, 0xffffffff, 0xffefffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xef000000, 0xffffffff, 0xfff7ffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xf7800000, 0xffffffff, 0xfffbffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfbc00000, 0xffffffff, 0xfffdffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfde00000, 0xffffffff, 0xfffeffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfef00000, 0xffffffff, 0xffff7fff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xff780000, 0xffffffff, 0xffffbfff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffbc0000, 0xffffffff, 0xffffdfff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffde0000, 0xffffffff, 0xffffefff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffef0000, 0xffffffff, 0xfffff7ff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfff78000, 0xffffffff, 0xfffffbff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffbc000, 0xffffffff, 0xfffffdff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffde000, 0xffffffff, 0xfffffeff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffef000, 0xffffffff, 0xffffff7f, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffff7800, 0xffffffff, 0xffffffbf, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffbc00, 0xffffffff, 0xffffffdf, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffde00, 0xffffffff, 0xffffffef, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffef00, 0xffffffff, 0xfffffff7, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffff780, 0xffffffff, 0xfffffffb, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffbc0, 0xffffffff, 0xfffffffd, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffde0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xfffffef0, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffff78, 0xbfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffbc, 0xdfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffde, 0xefffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x00000001, 0xffffffef, 0xf7ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x80000001, 0xfffffff7, 0xfbffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xc0000001, 0xfffffffb, 0xfdffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xe0000001, 0xfffffffd, 0xfeffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xf0000001, 0xfffffffe, 0xff7fffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0x78000001, 0xffffffff, 0xffbfffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xbc000001, 0xffffffff, 0xffdfffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xde000001, 0xffffffff, 0xffefffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xef000001, 0xffffffff, 0xfff7ffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xf7800001, 0xffffffff, 0xfffbffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfbc00001, 0xffffffff, 0xfffdffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfde00001, 0xffffffff, 0xfffeffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfef00001, 0xffffffff, 0xffff7fff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xff780001, 0xffffffff, 0xffffbfff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffbc0001, 0xffffffff, 0xffffdfff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffde0001, 0xffffffff, 0xffffefff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffef0001, 0xffffffff, 0xfffff7ff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfff78001, 0xffffffff, 0xfffffbff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffbc001, 0xffffffff, 0xfffffdff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffde001, 0xffffffff, 0xfffffeff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffef001, 0xffffffff, 0xffffff7f, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffff7801, 0xffffffff, 0xffffffbf, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffbc01, 0xffffffff, 0xffffffdf, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffde01, 0xffffffff, 0xffffffef, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffef01, 0xffffffff, 0xfffffff7, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffff781, 0xffffffff, 0xfffffffb, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffbc1, 0xffffffff, 0xfffffffd, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffde1, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffef1, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffff79, 0xbfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffffbd, 0xdfffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xffffffdf, 0xefffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000},
{0xfffffff0, 0xf7ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000010, 0x08000000}}};
static constexpr storage<8> rou = {0x42f8ef94, 0x6070024f, 0xe11a6161, 0xad187148,
0x9c8b0fa5, 0x3f046451, 0x87529cfa, 0x005282db};
TWIDDLES(modulus, rou)
};
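Judging by the table names and a spot check of the first entries, the generated constants above follow the usual NTT layout: omegas[k] appears to hold a primitive 2^(k+1)-th root of unity, omega_inv[k] its inverse, and inv[k] the inverse of 2^(k+1) used to normalize inverse transforms, with rou being the highest-order root consumed by the TWIDDLES macro. In modular notation (an inference from the data, not taken from the generator):

$$\omega_k^{\,2^{k+1}} \equiv 1, \qquad \omega_k \cdot \omega_k^{-1} \equiv 1, \qquad \mathrm{inv}_k \cdot 2^{\,k+1} \equiv 1 \pmod{p}$$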
/**


@@ -1,20 +1,21 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H
#include <cstddef>
#include <cuda_runtime.h>
#include "gpu-utils/error_handler.cuh"
namespace device_context {
size_t MAX_DEVICES = 32;
constexpr std::size_t MAX_DEVICES = 32;
/**
* Properties of the device used in icicle functions.
*/
struct DeviceContext {
int stream; /**< Stream to use. Default value: 0. */
cudaStream_t& stream; /**< Stream to use. Default value: 0. */
std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
int mempool; /**< Mempool to use. Default value: 0. */
cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
};
/**
@@ -22,14 +23,36 @@ namespace device_context {
*/
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
static int default_stream = 0;
static cudaStream_t default_stream = (cudaStream_t)0;
return DeviceContext{
default_stream, // stream
(cudaStream_t&)default_stream, // stream
0, // device_id
0, // mempool
};
}
} // namespace device_context
// Checks whether a pointer is on the host or the device, and asserts that the device matches the provided device_id
static bool is_host_ptr(const void* p, int device_id = 0)
{
cudaPointerAttributes attributes;
CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
const bool is_on_host = attributes.type == cudaMemoryTypeHost ||
attributes.type == cudaMemoryTypeUnregistered; // unregistered is host memory
const bool is_on_cur_device = !is_on_host && attributes.device == device_id;
const bool is_valid_ptr = is_on_host || is_on_cur_device;
if (!is_valid_ptr) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid ptr"); }
return is_on_host;
}
static int get_cuda_device(const void* p)
{
cudaPointerAttributes attributes;
CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
const bool is_on_host = attributes.type == cudaMemoryTypeHost ||
attributes.type == cudaMemoryTypeUnregistered; // unregistered is host memory
return is_on_host ? -1 : attributes.device;
}
} // namespace device_context
#endif
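The device-context hunk above swaps the integer placeholders for real cudaStream_t and cudaMemPool_t handles and adds the is_host_ptr / get_cuda_device helpers. A minimal caller-side sketch, assuming only the headers shown in this diff (the function and buffer names below are hypothetical):

// Hypothetical caller; DeviceContext, is_host_ptr and CHK_IF_RETURN come from the headers in this diff.
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"

cudaError_t stage_input(const int* data, int** d_data, size_t n)
{
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  if (device_context::is_host_ptr(data, (int)ctx.device_id)) {
    // Host pointer: copy the input into device memory on the context's stream.
    CHK_IF_RETURN(cudaMallocAsync(d_data, n * sizeof(int), ctx.stream));
    CHK_IF_RETURN(cudaMemcpyAsync(*d_data, data, n * sizeof(int), cudaMemcpyHostToDevice, ctx.stream));
  } else {
    // Already resident on the device reported by device_context::get_cuda_device(data).
    *d_data = const_cast<int*>(data);
  }
  return cudaSuccess;
}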


@@ -3,10 +3,12 @@
#define ERR_H
#include <iostream>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
enum IcicleError_t {
enum class IcicleError_t {
IcicleSuccess = 0,
InvalidArgument = 1,
MemoryAllocationError = 2,
@@ -36,14 +38,14 @@ private:
public:
// Constructor for cudaError_t with optional message
IcicleError(int cudaError, const std::string& msg = "")
: std::runtime_error("Error: " + msg),
IcicleError(cudaError_t cudaError, const std::string& msg = "")
: std::runtime_error("CUDA Error: " + std::string(cudaGetErrorString(cudaError)) + " " + msg),
errCode(static_cast<int>(cudaError))
{
}
// Constructor for cudaError_t with const char* message
IcicleError(int cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
IcicleError(cudaError_t cudaError, const char* msg) : IcicleError(cudaError, std::string(msg)) {}
// Constructor for IcicleError_t with optional message
IcicleError(IcicleError_t icicleError, const std::string& msg = "")
@@ -65,10 +67,11 @@ public:
#define CHK_LOG(val) check((val), #val, __FILE__, __LINE__)
#define CHK_VAL(val, file, line) check((val), #val, file, line)
int inline check(int err, const char* const func, const char* const file, const int line)
cudaError_t inline check(cudaError_t err, const char* const func, const char* const file, const int line)
{
if (err != 0) {
if (err != cudaSuccess) {
std::cerr << "CUDA Runtime Error by: " << func << " at: " << file << ":" << line << std::endl;
std::cerr << cudaGetErrorString(err) << std::endl << std::endl;
}
return err;
@@ -87,12 +90,12 @@ int inline check(int err, const char* const func, const char* const file, const
#define THROW_ICICLE_CUDA(val) throwIcicleCudaErr(val, __FUNCTION__, __FILE__, __LINE__)
#define THROW_ICICLE_CUDA_ERR(val, func, file, line) throwIcicleCudaErr(val, func, file, line)
void inline throwIcicleCudaErr(
int err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
cudaError_t err, const char* const func, const char* const file, const int line, bool isUnrecoverable = true)
{
// TODO: fmt::format introduced only in C++20
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "");
// + " : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
// "\nThe error is reported there and may be caused by prior calls.\n";
std::string err_msg = (isUnrecoverable ? "!!!Unrecoverable!!! : " : "") + std::string{cudaGetErrorString(err)} +
" : detected by: " + func + " at: " + file + ":" + std::to_string(line) +
"\nThe error is reported there and may be caused by prior calls.\n";
std::cerr << err_msg << std::endl; // TODO: Logging
throw IcicleError{err, err_msg};
}
@@ -108,14 +111,14 @@ void inline throwIcicleErr(
throw IcicleError{err, err_msg};
}
int inline checkCudaErrorIsSticky(
int err, const char* const func, const char* const file, const int line, bool isThrowing = true)
cudaError_t inline checkCudaErrorIsSticky(
cudaError_t err, const char* const func, const char* const file, const int line, bool isThrowing = true)
{
if (err != 0) {
if (err != cudaSuccess) {
// check for sticky (unrecoverable) error when the only option is to restart process
int err2 = 0;
cudaError_t err2 = cudaDeviceSynchronize();
bool is_logged;
if (err2 != 0) { // we suspect sticky error
if (err2 != cudaSuccess) { // we suspect sticky error
if (err != err2) {
is_logged = true;
CHK_ERR(err, func, file, line);
@@ -136,13 +139,13 @@ int inline checkCudaErrorIsSticky(
// most common macros to use
#define CHK_INIT_IF_RETURN() \
{ \
int err_result = CHK_LAST(); \
cudaError_t err_result = CHK_LAST(); \
if (err_result != cudaSuccess) return err_result; \
}
#define CHK_IF_RETURN(val) \
{ \
int err_result = CHK_STICKY(val); \
cudaError_t err_result = CHK_STICKY(val); \
if (err_result != cudaSuccess) return err_result; \
}
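The error-handler changes retype the checking helpers and macros on cudaError_t, so the usual pattern of guarding each runtime call reads the same as before. A hedged sketch of a routine built on these macros (the memset example is illustrative, not taken from the codebase); note that on sticky, unrecoverable errors CHK_STICKY may throw IcicleError rather than return:

// Illustrative only; CHK_INIT_IF_RETURN / CHK_IF_RETURN come from the error handler above.
#include <cuda_runtime.h>
#include "gpu-utils/error_handler.cuh"

cudaError_t zero_device_buffer(int* d_buf, size_t n, cudaStream_t stream)
{
  CHK_INIT_IF_RETURN();  // return early if a previous CUDA error is still pending
  CHK_IF_RETURN(cudaMemsetAsync(d_buf, 0, n * sizeof(int), stream));
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  return cudaSuccess;
}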


@@ -6,6 +6,6 @@
#define UNROLL #pragma unroll
#endif
// #define __host__ INLINE_MACRO
// #define INLINE_MACRO
// #define __host__ INLINE_MACRO
#define HOST_INLINE __host__ INLINE_MACRO
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO


@@ -24,7 +24,7 @@
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* function, just replace code like this:
* __device__ function, just replace code like this:
*
* <pre>
* template<class T>
@@ -32,7 +32,7 @@
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* T sdata[];
* extern __shared__ T sdata[];
* ...
* doStuff(sdata);
* ...
@@ -62,7 +62,7 @@
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code T s_type[] \endcode).
* (\code extern __shared__ T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
@@ -73,10 +73,11 @@ template <typename T>
struct SharedMemory {
//! @brief Return a pointer to the runtime-sized shared memory array.
//! @returns Pointer to runtime-sized shared memory array
T* getPointer()
__device__ T* getPointer()
{
T* a = nullptr; // Initialize pointer to nullptr or allocate memory as needed
return a;
// extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
// Error_UnsupportedType();
return (T*)0;
}
// TODO: Use operator overloading to make this class look like a regular array
};
@@ -87,128 +88,129 @@ struct SharedMemory {
template <>
struct SharedMemory<int> {
int* getPointer()
__device__ int* getPointer()
{
return 0;
extern __shared__ int s_int[];
return s_int;
}
};
template <>
struct SharedMemory<unsigned int> {
unsigned int* getPointer()
__device__ unsigned int* getPointer()
{
return 0;
extern __shared__ unsigned int s_uint[];
return s_uint;
}
};
template <>
struct SharedMemory<char> {
char* getPointer()
__device__ char* getPointer()
{
char *a = nullptr;
return a;
extern __shared__ char s_char[];
return s_char;
}
};
template <>
struct SharedMemory<unsigned char> {
unsigned char* getPointer()
__device__ unsigned char* getPointer()
{
unsigned char* a = nullptr;
return a;
extern __shared__ unsigned char s_uchar[];
return s_uchar;
}
};
template <>
struct SharedMemory<short> {
short* getPointer()
__device__ short* getPointer()
{
short* a = nullptr;
return a;
extern __shared__ short s_short[];
return s_short;
}
};
template <>
struct SharedMemory<unsigned short> {
unsigned short* getPointer()
__device__ unsigned short* getPointer()
{
unsigned short* a = nullptr;
return a;
extern __shared__ unsigned short s_ushort[];
return s_ushort;
}
};
template <>
struct SharedMemory<long> {
long* getPointer()
__device__ long* getPointer()
{
long *s_long = nullptr;
extern __shared__ long s_long[];
return s_long;
}
};
template <>
struct SharedMemory<unsigned long> {
unsigned long* getPointer()
__device__ unsigned long* getPointer()
{
unsigned long *s_ulong = nullptr;
extern __shared__ unsigned long s_ulong[];
return s_ulong;
}
};
template <>
struct SharedMemory<long long> {
long long* getPointer()
__device__ long long* getPointer()
{
long long *s_longlong;
extern __shared__ long long s_longlong[];
return s_longlong;
}
};
template <>
struct SharedMemory<unsigned long long> {
unsigned long long* getPointer()
__device__ unsigned long long* getPointer()
{
unsigned long long *s_ulonglong;
extern __shared__ unsigned long long s_ulonglong[];
return s_ulonglong;
}
};
template <>
struct SharedMemory<bool> {
bool* getPointer()
__device__ bool* getPointer()
{
bool *s_bool;
extern __shared__ bool s_bool[];
return s_bool;
}
};
template <>
struct SharedMemory<float> {
float* getPointer()
__device__ float* getPointer()
{
float *s_float;
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory<double> {
double* getPointer()
__device__ double* getPointer()
{
double *s_double;
extern __shared__ double s_double[];
return s_double;
}
};
// template <>
// struct SharedMemory<uchar4> {
// uchar4* getPointer()
// {
// uchar4 *s_uchar4;
// return s_uchar4;
// }
// };
template <>
struct SharedMemory<uchar4> {
__device__ uchar4* getPointer()
{
extern __shared__ uchar4 s_uchar4[];
return s_uchar4;
}
};
#endif //_SHAREDMEM_H_
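With the specializations above returning genuine extern __shared__ arrays instead of null pointers, the header's documented usage pattern now works as advertised. A minimal kernel sketch consistent with that pattern (kernel name and launch configuration are illustrative):

// Illustrative kernel; SharedMemory<T> is the helper from the header above.
template <typename T>
__global__ void reverse_kernel(T* d_out, const T* d_in, int n)
{
  SharedMemory<T> shared;
  T* sdata = shared.getPointer(); // runtime-sized dynamic shared memory array
  int tid = threadIdx.x;
  if (tid < n) sdata[tid] = d_in[tid];
  __syncthreads();
  if (tid < n) d_out[tid] = sdata[n - 1 - tid];
}
// The dynamic shared-memory size is the third launch parameter, e.g.
//   reverse_kernel<float><<<1, n, n * sizeof(float)>>>(d_out, d_in, n);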


@@ -0,0 +1,174 @@
#pragma once
#ifndef HASH_H
#define HASH_H
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "matrix/matrix.cuh"
#include <cassert>
using matrix::Matrix;
/**
* @namespace hash
* Includes classes and methods for describing hash functions.
*/
namespace hash {
/**
* @struct HashConfig
* Encodes hash operations parameters.
*/
struct HashConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool
are_outputs_on_device; /**< True if outputs are on device and false if they're on host. Default value: false. */
bool is_async; /**< Whether to run the hash operations asynchronously. If set to `true`, the functions will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false,
* functions will block the current CPU thread. */
};
/**
* A function that returns the default value of [HashConfig](@ref HashConfig) for the [Hasher](@ref
* Hasher) class.
* @return Default value of [HashConfig](@ref HashConfig).
*/
static HashConfig
default_hash_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
HashConfig config = {
ctx, // ctx
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // is_async
};
return config;
}
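// Usage sketch (illustrative, not part of this header): start from the defaults and
// override only what differs, e.g. when the buffers already live on the GPU:
//   HashConfig cfg = default_hash_config();
//   cfg.are_inputs_on_device = true;
//   cfg.are_outputs_on_device = true;
//   cfg.is_async = true; // caller synchronizes cfg.ctx.stream later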
/**
* @class Hasher
*
* An interface containing methods for hashing
*
* @tparam PreImage type of input elements
* @tparam Image type of state elements. Also used to describe the type of hash output
*/
template <typename PreImage, typename Image>
class Hasher
{
public:
/// @brief the width of permutation state
const unsigned int width;
/// @brief how many elements a state can fit per permutation. Used with domain separation.
const unsigned int preimage_max_length;
/// @brief portion of the state to absorb input into, or squeeze output from
const unsigned int rate;
/// @brief start squeezing from this offset. Used with domain separation.
const unsigned int offset;
Hasher(unsigned int width, unsigned int preimage_max_length, unsigned int rate, unsigned int offset)
: width(width), preimage_max_length(preimage_max_length), rate(rate), offset(offset)
{
assert(
rate * sizeof(PreImage) <= preimage_max_length * sizeof(Image) &&
"Input rate can not be bigger than preimage max length");
}
virtual cudaError_t hash_2d(
const Matrix<PreImage>* inputs,
Image* states,
unsigned int number_of_inputs,
unsigned int output_len,
uint64_t number_of_rows,
const device_context::DeviceContext& ctx) const
{
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Absorb 2d is not implemented for this hash");
return cudaError_t::cudaSuccess;
};
virtual cudaError_t compress_and_inject(
const Matrix<PreImage>* matrices_to_inject,
unsigned int number_of_inputs,
uint64_t number_of_rows,
const Image* prev_layer,
Image* next_layer,
unsigned int digest_elements,
const device_context::DeviceContext& ctx) const
{
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Compress and inject is not implemented for this hash");
return cudaError_t::cudaSuccess;
}
/// @param input pointer to input allocated on-device
/// @param out pointer to output allocated on-device
cudaError_t compress_many(
const Image* input,
Image* out,
unsigned int number_of_states,
unsigned int output_len,
const HashConfig& cfg) const
{
return hash_many((const PreImage*)input, out, number_of_states, width, output_len, cfg);
}
virtual cudaError_t run_hash_many_kernel(
const PreImage* input,
Image* output,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const device_context::DeviceContext& ctx) const
{
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Hash many kernel is not implemented for this hash");
return cudaError_t::cudaSuccess;
};
cudaError_t hash_many(
const PreImage* input,
Image* output,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const HashConfig& cfg) const
{
const PreImage* d_input;
PreImage* d_alloc_input;
Image* d_output;
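// Stage the inputs into device memory (on the config's stream) if the caller passed host pointers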
if (!cfg.are_inputs_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, number_of_states * input_len * sizeof(PreImage), cfg.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_alloc_input, input, number_of_states * input_len * sizeof(PreImage), cudaMemcpyHostToDevice,
cfg.ctx.stream));
d_input = d_alloc_input;
} else {
d_input = input;
}
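// Allocate a device-side output buffer unless the caller already provided one on the device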
if (!cfg.are_outputs_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_output, number_of_states * output_len * sizeof(Image), cfg.ctx.stream));
} else {
d_output = output;
}
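// Launch the hash-specific kernel; concrete hashers supply it via run_hash_many_kernel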
CHK_IF_RETURN(run_hash_many_kernel(d_input, d_output, number_of_states, input_len, output_len, cfg.ctx));
if (!cfg.are_inputs_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }
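// Copy the digests back to the host and release the temporary output buffer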
if (!cfg.are_outputs_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(
output, d_output, number_of_states * output_len * sizeof(Image), cudaMemcpyDeviceToHost, cfg.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
}
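// In blocking (non-async) mode, wait for all work queued on the stream to complete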
if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
return CHK_LAST();
};
};
} // namespace hash
#endif
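For orientation, a concrete hash plugs into this interface by passing its sponge geometry to the Hasher base constructor and overriding the kernels it supports. The class below is a hypothetical sketch (name and numbers are illustrative); the actual Keccak binding follows in the next file:

// Illustrative sketch of a hash implementation built on hash::Hasher.
class MyHash : public hash::Hasher<uint8_t, uint64_t>
{
public:
  cudaError_t run_hash_many_kernel(
    const uint8_t* input,
    uint64_t* output,
    unsigned int number_of_states,
    unsigned int input_len,
    unsigned int output_len,
    const device_context::DeviceContext& ctx) const override
  {
    // launch this hash's permutation kernel here
    return cudaError_t::cudaSuccess;
  }
  // base-constructor arguments: width, preimage_max_length, rate, offset
  MyHash() : hash::Hasher<uint8_t, uint64_t>(25, 25, 17, 0) {}
};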


@@ -3,54 +3,38 @@
#define KECCAK_H
#include <cstdint>
#include "../../gpu-utils/device_context.cuh"
#include "../../gpu-utils/error_handler.cuh"
typedef int cudaError_t;
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "hash/hash.cuh"
using namespace hash;
namespace keccak {
/**
* @struct KeccakConfig
* Struct that encodes various Keccak parameters.
*/
struct KeccakConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
* function will block the current CPU thread. */
};
// Input rate in bytes
const int KECCAK_256_RATE = 136;
const int KECCAK_512_RATE = 72;
KeccakConfig default_keccak_config()
// Digest size in u64
const int KECCAK_256_DIGEST = 4;
const int KECCAK_512_DIGEST = 8;
// Number of state elements in u64
const int KECCAK_STATE_SIZE = 25;
class Keccak : public Hasher<uint8_t, uint64_t>
{
device_context::DeviceContext ctx = device_context::get_default_device_context();
KeccakConfig config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // is_async
};
return config;
}
public:
cudaError_t run_hash_many_kernel(
const uint8_t* input,
uint64_t* output,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const device_context::DeviceContext& ctx) const override;
/**
* Compute the keccak hash over a sequence of preimages.
* Takes {number_of_blocks * input_block_size} u64s of input and computes {number_of_blocks} outputs, each of size {D
* / 64} u64
* @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
* @tparam D - number of bits of output. Only multiples of 64 are supported.
* @param input a pointer to the input data. May be allocated on device or on host, regulated
* by the config. Must be of size [input_block_size](@ref input_block_size) * [number_of_blocks](@ref
* number_of_blocks)}.
* @param input_block_size - size of each input block in bytes. Should be divisible by 8.
* @param number_of_blocks number of input and output blocks. One GPU thread processes one block
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be of size [output_block_size](@ref output_block_size) * [number_of_blocks](@ref
* number_of_blocks)}
*/
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config);
Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(KECCAK_STATE_SIZE, KECCAK_STATE_SIZE, rate, 0) {}
};
} // namespace keccak
#endif
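Putting the two headers together, a host-side sketch of batch hashing with the new class-based API (the function name, message layout, and include path are illustrative assumptions, not taken from this diff):

#include "hash/keccak/keccak.cuh" // illustrative path
using namespace keccak;

// Hash n messages of KECCAK_256_RATE (136) bytes each into n 256-bit (4 x u64) digests.
cudaError_t keccak256_batch(const uint8_t* messages, uint64_t* digests, unsigned int n)
{
  Keccak keccak(KECCAK_256_RATE);
  HashConfig cfg = default_hash_config(); // host buffers, blocking call
  return keccak.hash_many(messages, digests, n, KECCAK_256_RATE, KECCAK_256_DIGEST, cfg);
}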

Some files were not shown because too many files have changed in this diff.