Specify curve as bn254

integration with V2
Merge branch 'V2' into sumcheck-experiments
2026-01-12 08:58:09 -05:00 · 2024-08-13 17:59:48 +09:00 · 2024-04-17 11:19:38 +03:00 · 2024-04-15 17:43:10 +03:00 · 2024-04-14 16:31:49 +03:00 · 2024-04-14 18:23:43 +07:00
416 changed files with 284145 additions and 5486 deletions
--- a/.github/changed-files.yml
+++ b/.github/changed-files.yml
@@ -1,10 +1,10 @@
 golang:
-  - wrappers/golang/**/*.go'
-  - wrappers/golang/**/*.h'
-  - wrappers/golang/**/*.tmpl'
+  - wrappers/golang/**/*.go
+  - wrappers/golang/**/*.h
+  - wrappers/golang/**/*.tmpl
  - go.mod
 rust:
-  - wrappers/rust
+  - wrappers/rust/**/*
 cpp:
  - icicle/**/*.cu
  - icicle/**/*.cuh
--- a/.github/workflows/check-changed-files.yml
+++ b/.github/workflows/check-changed-files.yml
@@ -0,0 +1,39 @@
+name: Check Changed Files
+
+on:
+  workflow_call:
+    outputs:
+      golang:
+        description: "Flag for if GoLang files changed"
+        value: ${{ jobs.check-changed-files.outputs.golang }}
+      rust:
+        description: "Flag for if Rust files changed"
+        value: ${{ jobs.check-changed-files.outputs.rust }}
+      cpp_cuda:
+        description: "Flag for if C++/CUDA files changed"
+        value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
+
+jobs:
+  check-changed-files:
+    name: Check Changed Files
+    runs-on: ubuntu-22.04
+    outputs:  # job-level outputs; the workflow_call outputs of this file re-export them to callers
+      golang: ${{ steps.changed_files.outputs.golang }}
+      rust: ${{ steps.changed_files.outputs.rust }}
+      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}  # filled from the "cpp" group (cpp_any_modified) by the script step
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Get all changed files
+      id: changed-files-yaml
+      uses: tj-actions/changed-files@v39
+      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
+      with:
+        files_yaml_from_source_file: .github/changed-files.yml  # file that defines the golang / rust / cpp path groups
+    - name: Run Changed Files script  # copies each group's *_any_modified result into this job's step outputs
+      id: changed_files
+      # https://github.com/tj-actions/changed-files#outputs-
+      run: |
+        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -4,14 +4,14 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2

 jobs:
  spelling-checker:
    name: Check Spelling
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - uses: codespell-project/actions-codespell@v2
        with:
          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
--- a/.github/workflows/cpp_cuda.yml
+++ b/.github/workflows/cpp_cuda.yml
@@ -0,0 +1,74 @@
+name: C++/CUDA
+
+on:
+  pull_request:
+    branches:
+      - main
+      - V2
+  push:
+    branches:
+      - main
+      - V2
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-changed-files:
+    uses: ./.github/workflows/check-changed-files.yml
+
+  check-format:
+    name: Check Code Format
+    runs-on: ubuntu-22.04
+    needs: check-changed-files
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Check clang-format  # fails on any clang-format output for .h/.cuh/.cu/.c/.cpp files outside icicle/build, target dirs and examples
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'  # prune group and name tests are grouped separately so *.h is really checked; globs are quoted so the shell cannot expand them
+      run: if [[ $(find ./ \( -path ./icicle/build -o -path './**/target' -o -path ./examples \) -prune -o \( -iname '*.h' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.c' -o -iname '*.cpp' \) -print | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
+
+  test-linux-curve:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]  # starts only after the format check job has finished successfully
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]  # one job per curve; the value is passed to cmake as -DCURVE
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Build curve  # wipes any previous build directory contents, then configures and builds with tests and G2 enabled
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'  # skipped unless check-changed-files reported C++/CUDA changes
+      run: |
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ curve Tests
+      working-directory: ./icicle/build/tests  # ctest runs from the tests subdirectory of the build tree created by the previous step
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
+
+  test-linux-field:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]  # starts only after the format check job has finished successfully
+    strategy:
+      matrix:
+        field: [babybear]  # one job per field; the value is passed to cmake as -DFIELD
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Build field  # wipes any previous build directory contents, then configures and builds with tests and EXT_FIELD enabled
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'  # skipped unless check-changed-files reported C++/CUDA changes
+      run: |
+        mkdir -p build && rm -rf build/*
+        cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
+        cmake --build build -j
+    - name: Run C++ field Tests
+      working-directory: ./icicle/build/tests  # ctest runs from the tests subdirectory of the build tree created by the previous step
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -5,7 +5,7 @@ on:
    branches:
      - main
    paths:
-      - 'docs/*'
+      - 'docs/**'

 permissions:
  contents: write
@@ -40,7 +40,7 @@ jobs:
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: ./build
+          publish_dir: ./repo/docs/build
          user_name: github-actions[bot]
          user_email: 41898282+github-actions[bot]@users.noreply.github.com
          working-directory: ./repo/docs
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -11,24 +11,29 @@ on:
  pull_request:
    branches:
      - main
-      - dev
+      - V2
  push:
    branches:
      - main
-      - dev
+      - V2

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

-jobs:  
-  test-examples:
+jobs:
+  check-changed-files:
+    uses: ./.github/workflows/check-changed-files.yml
+
+  run-examples:
    runs-on: [self-hosted, Linux, X64, icicle, examples]
+    needs: check-changed-files
    steps:
    - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v4
    - name: c++ examples
      working-directory: ./examples/c++
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: |
        # loop over all directories in the current directory
        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -42,6 +47,7 @@ jobs:
        done    
    - name: Rust examples
      working-directory: ./examples/rust
+      if: needs.check-changed-files.outputs.rust == 'true'
      run: |
        # loop over all directories in the current directory
        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
--- a/.github/workflows/golang.yml
+++ b/.github/workflows/golang.yml
@@ -0,0 +1,121 @@
+name: GoLang
+
+on:
+  pull_request:
+    branches:
+      - main
+      - V2
+  push:
+    branches:
+      - main
+      - V2
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-changed-files:
+    uses: ./.github/workflows/check-changed-files.yml
+
+  check-format:
+    name: Check Code Format
+    runs-on: ubuntu-22.04
+    needs: check-changed-files
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Setup go
+      uses: actions/setup-go@v5
+      with:
+        go-version: '1.20.0'  # quoted so YAML keeps it a string instead of parsing it as a number
+    - name: Check gofmt  # runs go fmt over every package from `go list ./...`; any output fails the job
+      if: needs.check-changed-files.outputs.golang == 'true'  # skipped unless check-changed-files reported Go changes
+      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
+
+  build-linux:
+    name: Build on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]  # one build job (and one uploaded artifact) per curve
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Setup go
+      uses: actions/setup-go@v5
+      with:
+        go-version: '1.20.0'
+    - name: Build
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'  # Go or C++/CUDA changes both trigger a rebuild
+      run: ./build.sh ${{ matrix.curve }} -g2 -ecntt # builds a single curve with G2 and ECNTT enabled
+    - name: Upload ICICLE lib artifacts  # publishes the static libraries so the test-linux job can download them
+      uses: actions/upload-artifact@v4
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      with:
+        name: icicle-builds-${{ matrix.curve }}-${{ github.workflow }}-${{ github.sha }}  # curve is part of the name, so each matrix job uploads a distinct artifact
+        path: |
+          icicle/build/src/curves/libingo_curve_${{ matrix.curve }}.a
+          icicle/build/src/fields/libingo_field_${{ matrix.curve }}.a
+        retention-days: 1
+  
+  test-linux:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, build-linux]  # waits for every build-linux matrix job, whose artifacts are downloaded below
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Setup go
+      uses: actions/setup-go@v5
+      with:
+        go-version: '1.20.0'
+    - name: Download ICICLE lib artifacts  # no artifact name is given; NOTE(review): download-artifact v4 then fetches all artifacts of the run — confirm
+      uses: actions/download-artifact@v4
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      with:
+        path: ./icicle/build/src  # same directory the libraries were uploaded from in build-linux
+        merge-multiple: true
+    - name: Run Tests
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # -count ensures the test results are not cached
+      # -p controls the number of programs that can be run in parallel
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        go test --tags=g2 ./... -count=1 -failfast -p 2 -timeout 60m
+  
+  # TODO: bw6 on windows requires more memory than the standard runner has
+  # Add a large runner and then enable this job
+  # build-windows:
+  #   name: Build on Windows
+  #   runs-on: windows-2022
+  #   needs: [check-changed-files, check-format]
+  #   strategy:
+  #     matrix:
+  #       curve: [bn254, bls12_381, bls12_377, bw6_761]
+  #   steps:     
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v4
+  #   - name: Setup go
+  #     uses: actions/setup-go@v5
+  #     with:
+  #       go-version: '1.20.0'
+  #   - name: Download and Install Cuda
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     id: cuda-toolkit
+  #     uses: Jimver/cuda-toolkit@v0.2.11
+  #     with:
+  #       cuda: '12.0.0'
+  #       method: 'network'
+  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+  #       sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
+  #   - name: Build libs
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     working-directory: ./wrappers/golang
+  #     env:
+  #       CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+  #     shell: pwsh
+  #     run: ./build.ps1 ${{ matrix.curve }} ON # builds a single curve with G2 enabled
--- a/.github/workflows/main-build.yml
+++ b/.github/workflows/main-build.yml
@@ -1,119 +0,0 @@
-name: Build
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-  push:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  ARCH_TYPE: native
-
-jobs:
-  check-changed-files:
-    name: Check Changed Files
-    runs-on: ubuntu-22.04
-    outputs:
-      golang: ${{ steps.changed_files.outputs.golang }}
-      rust: ${{ steps.changed_files.outputs.rust }}
-      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Get all changed files
-      id: changed-files-yaml
-      uses: tj-actions/changed-files@v39
-      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
-      with:
-        files_yaml_from_source_file: .github/changed-files.yml
-    - name: Run Changed Files script
-      id: changed_files
-      # https://github.com/tj-actions/changed-files#outputs-
-      run: |
-        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
-
-  build-rust-linux:
-    name: Build Rust on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: check-changed-files
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build Rust
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
-
-  build-rust-windows:
-    name: Build Rust on Windows
-    runs-on: windows-2022
-    needs: check-changed-files
-    steps:     
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Download and Install Cuda
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.11
-      with:
-        cuda: '12.0.0'
-        method: 'network'
-        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
-    - name: Build Rust Targets
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      env:
-        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
-
-  build-golang-linux:
-    name: Build Golang on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: check-changed-files
-    strategy:
-      matrix:
-        curve: [bn254, bls12_381, bls12_377, bw6_761]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build CUDA libs
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      working-directory: ./wrappers/golang
-      run: |
-        export CPATH=$CPATH:/usr/local/cuda/include
-        ./build.sh ${{ matrix.curve }} ON
-
-  # TODO: Add once Golang make file supports building for Windows
-  # build-golang-windows:
-  #   name: Build Golang on Windows
-  #   runs-on: windows-2022
-  #   needs: check-changed-files
-  #   steps:     
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v3
-  #   - name: Download and Install Cuda
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     uses: Jimver/cuda-toolkit@v0.2.11
-  #     with:
-  #       cuda: '12.0.0'
-  #       method: 'network'
-  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-  #       sub-packages: '["cudart", "nvcc", "thrust"]'
-  #   - name: Build cpp libs
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     run: make all
-  #     working-directory: ./goicicle
--- a/.github/workflows/main-format.yml
+++ b/.github/workflows/main-format.yml
@@ -1,47 +0,0 @@
-name: Format
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  formatting-rust:
-    name: Check Rust Code Formatting
-    runs-on: ubuntu-22.04
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Check rustfmt
-      working-directory: ./wrappers/rust
-      # "-name tagret -prune" removes searching in any directory named "target"
-      # Formatting by single file is necessary due to generated files not being present
-      # before building the project.
-      # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
-      # causing rustfmt to fail.
-      run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
-    # - name: Check clippy
-    #   run: cargo clippy --no-deps --all-features --all-targets
-
-  formatting-golang:
-    name: Check Golang Code Formatting
-    runs-on: ubuntu-22.04
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Check gofmt
-      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
-
-  formatting-cpp-cuda:
-    name: Check C++/CUDA Code Formatting
-    runs-on: ubuntu-22.04
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-    - name: Check clang-format
-      run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -1,99 +0,0 @@
-name: Test
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  CARGO_TERM_COLOR: always
-  ARCH_TYPE: native
-
-jobs:
-  check-changed-files:
-    name: Check Changed Files
-    runs-on: ubuntu-22.04
-    outputs:
-      golang: ${{ steps.changed_files.outputs.golang }}
-      rust: ${{ steps.changed_files.outputs.rust }}
-      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Get all changed files
-      id: changed-files-yaml
-      uses: tj-actions/changed-files@v39
-      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
-      with:
-        files_yaml_from_source_file: .github/changed-files.yml
-    - name: Run Changed Files script
-      id: changed_files
-      # https://github.com/tj-actions/changed-files#outputs-
-      run: |
-        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
-
-  test-rust-linux:
-    name: Test Rust on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: check-changed-files
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Run Rust Tests
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # Running tests from the root workspace will run all workspace members' tests by default
-      # We need to limit the number of threads to avoid running out of memory on weaker machines
-      run: cargo test --release --verbose --features=g2 -- --test-threads=2
-
-  test-cpp-linux:
-    name: Test C++ on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: check-changed-files
-    strategy:
-      matrix:
-        curve: [bn254, bls12_381, bls12_377, bw6_761]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build C++
-      working-directory: ./icicle
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: |
-        mkdir -p build
-        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
-        cmake --build build
-    - name: Run C++ Tests
-      working-directory: ./icicle/build
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: ctest
-  
-  test-golang-linux:
-    name: Test Golang on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: check-changed-files
-    # strategy:
-    #   matrix:
-    #     curve: [bn254, bls12_381, bls12_377, bw6_761]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v3
-    - name: Build CUDA libs
-      working-directory: ./wrappers/golang
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # builds all curves with g2 ON
-      run: |
-        export CPATH=$CPATH:/usr/local/cuda/include
-        ./build.sh all ON
-    - name: Run Golang Tests
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: |
-        export CPATH=$CPATH:/usr/local/cuda/include
-        go test --tags=g2 ./... -count=1 -timeout 60m
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -0,0 +1,50 @@
+name: Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      releaseType:
+        description: 'Release type'
+        required: true
+        default: 'minor'
+        type: choice
+        options:
+          - patch
+          - minor
+          - major
+
+jobs:
+  release:
+    name: Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ssh-key: ${{ secrets.DEPLOY_KEY }}  # NOTE(review): presumably a deploy key so the version-bump commit and tag can be pushed — confirm
+      - name: Setup Cache  # caches cargo's bin/registry/git directories so cargo-workspaces is installed only on a cache miss
+        id: cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/bin/
+            ~/.cargo/registry/index/
+            ~/.cargo/registry/cache/
+            ~/.cargo/git/db/
+          key: ${{ runner.os }}-cargo-workspaces  # static key: hashFiles() only sees files inside GITHUB_WORKSPACE, so hashing ~/.cargo/... always produced ''
+      - name: Install cargo-workspaces
+        if: steps.cache.outputs.cache-hit != 'true'  # skip the install when the cache step restored an exact key match
+        run: cargo install cargo-workspaces
+      - name: Bump rust crate versions, commit, and tag  # releaseType is the workflow_dispatch choice input (patch / minor / major)
+        working-directory: wrappers/rust
+        # https://github.com/pksunkara/cargo-workspaces?tab=readme-ov-file#version
+        run: |
+          git config user.name release-bot
+          git config user.email release-bot@ingonyama.com
+          cargo workspaces version ${{ inputs.releaseType }} -y --no-individual-tags -m "Bump rust crates' version"
+      - name: Create draft release  # drafts a GitHub release for the most recent tag reachable from HEAD
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          LATEST_TAG=$(git describe --tags --abbrev=0)
+          gh release create "$LATEST_TAG" --generate-notes -d --verify-tag -t "Release $LATEST_TAG"
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -0,0 +1,91 @@
+name: Rust
+
+on:
+  pull_request:
+    branches:
+      - main
+      - V2
+  push:
+    branches:
+      - main
+      - V2
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-changed-files:
+    uses: ./.github/workflows/check-changed-files.yml
+
+  check-format:
+    name: Check Code Format
+    runs-on: ubuntu-22.04
+    needs: check-changed-files
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Check rustfmt
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      working-directory: ./wrappers/rust
+      # "-name target -prune" skips any directory named "target"; the curve template directory is pruned as well.
+      # Formatting by single file is necessary due to generated files not being present
+      # before building the project.
+      # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
+      # causing rustfmt to fail. The '*.rs' pattern is quoted so the shell passes it to find unexpanded.
+      run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname '*.rs' -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
+
+  build-linux:
+    name: Build on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, check-format]  # starts only after the format check job has finished successfully
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Build
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'  # Rust or C++/CUDA changes both trigger a build
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
+  
+  test-linux:
+    name: Test on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: [check-changed-files, build-linux]  # tests start only after the Linux build job succeeded
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Run tests  # icicle-babybear is excluded here and tested by the next step without the g2 feature flag
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # Running tests from the root workspace will run all workspace members' tests by default
+      # We need to limit the number of threads to avoid running out of memory on weaker machines
+      run: cargo test --workspace --exclude icicle-babybear --release --verbose --features=g2 -- --test-threads=2
+    - name: Run baby bear tests
+      working-directory: ./wrappers/rust/icicle-fields/icicle-babybear  # run from the crate's own directory rather than the workspace root
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: cargo test --release --verbose
+
+  build-windows:
+    name: Build on Windows
+    runs-on: windows-2022
+    needs: check-changed-files  # depends only on the changed-files check, not on check-format
+    steps:     
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Download and Install Cuda
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      id: cuda-toolkit  # referenced by the build step to read the CUDA_PATH output
+      uses: Jimver/cuda-toolkit@v0.2.11
+      with:
+        cuda: '12.0.0'
+        method: 'network'
+        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'  # a JSON array written as a single string
+    - name: Build targets
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      env:
+        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}  # exposes the toolkit location from the install step to the build
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
--- a/.github/workflows/test-deploy-docs.yml
+++ b/.github/workflows/test-deploy-docs.yml
@@ -9,7 +9,7 @@ on:

 jobs:
  test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,6 @@
 **/Cargo.lock
 **/icicle/build/
 **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
-**/build
+**/build*
 **/icicle/appUtils/large_ntt/work
 icicle/appUtils/large_ntt/work/test_ntt
--- a/2
+++ b/2
@@ -15,7 +15,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"

 # Install Golang
 ENV GOLANG_VERSION 1.21.1
-RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
+RUN curl -L https://go.dev/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
 ENV PATH="/usr/local/go/bin:${PATH}"

 # Set the working directory in the container
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # ICICLE

-**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
+<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>

 <p align="center">
  <img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
@@ -11,10 +11,12 @@
  </a>
  <a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
+  </a><a href="https://github.com/ingonyama-zk/icicle/releases">
+    <img src="https://img.shields.io/github/v/release/ingonyama-zk/icicle" alt="GitHub Release">
  </a>
-  <img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
 </p>

+
 ## Background

 Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
@@ -113,8 +115,10 @@ This will ensure our custom hooks are run and will make it easier to follow our

 - [Robik](https://github.com/robik75), for his ongoing support and mentorship
 - [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
-- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
+- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab
 - [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
+- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
+- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile

 ## Help & Support

--- a/docs/docs/icicle/golang-bindings.md
+++ b/docs/docs/icicle/golang-bindings.md
@@ -1,3 +1,105 @@
 # Golang bindings

-Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.
+Golang bindings allow you to use ICICLE as a golang library.
+The source code for all Golang libraries can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
+
+The Golang bindings are comprised of multiple packages.
+
+[`core`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/core) which defines all shared methods and structures, such as configuration structures, or memory slices.
+
+[`cuda_runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
+
+Each curve has its own package which you can find [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves). If your project uses BN254 you only need to install that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
+
+## Using ICICLE Golang bindings in your project
+
+To add ICICLE to your `go.mod` file, run:
+
+```bash
+go get github.com/ingonyama-zk/icicle
+```
+
+If you want to use a specific branch:
+
+```bash
+go get github.com/ingonyama-zk/icicle@<branch_name>
+```
+
+For a specific commit:
+
+```bash
+go get github.com/ingonyama-zk/icicle@<commit_id>
+```
+
+To build the shared libraries you can run this script:
+
+```
+./build.sh <curve> [G2_enabled]
+
+curve - The name of the curve to build or "all" to build all curves
+G2_enabled - Optional - To build with G2 enabled 
+```
+
+For example if you want to build all curves with G2 enabled you would run:
+
+```bash
+./build.sh all ON
+```
+
+If you are interested in building a specific curve you would run:
+
+```bash
+./build.sh bls12_381 ON
+```
+
+Now you can import ICICLE into your project
+
+```golang
+import (
+    "github.com/stretchr/testify/assert"
+    "testing"
+
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+...
+```
+
+## Running tests
+
+To run all tests, for all curves:
+
+```bash
+go test --tags=g2 ./... -count=1
+```
+
+If you don't want to include g2 tests, drop `--tags=g2`.
+
+If you wish to run tests for a specific curve:
+
+```bash
+go test <path_to_curve> -count=1
+```
+
+## How do Golang bindings work?
+
+The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
+
+1. These libraries (named `libingo_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
+
+2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:
+
+```go
+/*
+#cgo LDFLAGS: -L/path/to/shared/libs -lingo_bn254
+#include "icicle.h" // make sure you use the correct header file(s)
+*/
+import "C"
+
+func main() {
+    // Now you can call the C functions from the ICICLE libraries.
+    // Note that C function calls are prefixed with 'C.' in Go code.
+}
+```
+
+Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
--- a/docs/docs/icicle/golang-bindings/msm-pre-computation.md
+++ b/docs/docs/icicle/golang-bindings/msm-pre-computation.md
@@ -0,0 +1,92 @@
+# MSM Pre computation
+
+To understand the theory behind the MSM pre-computation technique, refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
+
+## Core package
+
+## MSM `PrecomputeBases`
+
+`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
+
+#### Description
+
+This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, \ldots, 2^{(\text{precompute\_factor} - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
+
+The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
+
+#### `PrecomputeBases`
+
+Precomputes bases for MSM by extending each base point with its multiples.
+
+```go
+func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+```
+
+##### Parameters
+
+- **`points`**: A slice of the original affine points to be extended with their multiples.
+- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
+- **`c`**: Currently unused; reserved for future compatibility.
+- **`ctx`**: CUDA device context specifying the execution environment.
+- **`outputBases`**: The device slice allocated for storing the extended bases.
+
+##### Example
+
+```go
+cfg := GetDefaultMSMConfig()
+points := GenerateAffinePoints(1024)
+precomputeFactor := 8
+var precomputeOut core.DeviceSlice
+_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+
+err := PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+if err != cr.CudaSuccess {
+    log.Fatalf("PrecomputeBases failed: %v", err)
+}
+```
+
+#### `G2PrecomputeBases`
+
+This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
+
+```go
+func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+```
+
+##### Parameters
+
+- **`points`**: A slice of G2 curve points to be extended.
+- **`precomputeFactor`**: The total number of points to precompute for each base.
+- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
+- **`ctx`**: Specifies the CUDA device context for execution.
+- **`outputBases`**: Allocated device slice for the extended bases.
+
+##### Example
+
+```go
+cfg := G2GetDefaultMSMConfig()
+points := G2GenerateAffinePoints(1024)
+precomputeFactor := 8
+var precomputeOut core.DeviceSlice
+_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
+
+err := G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+if err != cr.CudaSuccess {
+    log.Fatalf("G2PrecomputeBases failed: %v", err)
+}
+```
+
+### Benchmarks
+
+Benchmarks were performed on an Nvidia RTX 3090Ti.
+
+| Pre-computation factor | bn254 size `2^20` MSM, ms.  | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
+| ------------- | ------------- | ------------- | ------------- | ------------- |
+| 1  | 14.1  | 82.8  | 25.5  | 136.7  |
+| 2  | 11.8  | 76.6  | 20.3  | 123.8  |
+| 4  | 10.9  | 73.8  | 18.1  | 117.8  |
+| 8  | 10.6  | 73.7  | 17.2  | 116.0  |
--- a/docs/docs/icicle/golang-bindings/msm.md
+++ b/docs/docs/icicle/golang-bindings/msm.md
@@ -0,0 +1,200 @@
+# MSM
+
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
+
+## MSM Example
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+
+func main() {
+    // Obtain the default MSM configuration.
+    cfg := GetDefaultMSMConfig()
+    
+    // Define the size of the problem, here 2^18.
+    size := 1 << 18
+
+    // Generate scalars and points for the MSM operation.
+    scalars := GenerateScalars(size)
+    points := GenerateAffinePoints(size)
+
+    // Create a CUDA stream for asynchronous operations.
+    stream, _ := cr.CreateStream()
+    var p Projective
+    
+    // Allocate memory on the device for the result of the MSM operation.
+    var out core.DeviceSlice
+    _, e := out.MallocAsync(p.Size(), p.Size(), stream)
+
+    if e != cr.CudaSuccess {
+        panic(e)
+    }
+    
+    // Set the CUDA stream in the MSM configuration.
+    cfg.Ctx.Stream = &stream
+    cfg.IsAsync = true
+    
+    // Perform the MSM operation.
+    e = Msm(scalars, points, &cfg, out)
+    
+    if e != cr.CudaSuccess {
+        panic(e)
+    }
+    
+    // Allocate host memory for the results and copy the results from the device.
+    outHost := make(core.HostSlice[Projective], 1)
+    cr.SynchronizeStream(&stream)
+    outHost.CopyFromDevice(&out)
+    
+    // Free the device memory allocated for the results.
+    out.Free()
+}
+```
+
+## MSM Method
+
+```go
+func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) cr.CudaError
+```
+
+### Parameters
+
+- **scalars**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
+- **points**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
+- **cfg**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
+- **results**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.
+
+### Return Value
+
+- **CudaError**: Returns a CUDA error code indicating the success or failure of the MSM operation.
+
+## MSMConfig
+
+The `MSMConfig` structure holds configuration parameters for the MSM operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
+
+```go
+type MSMConfig struct {
+    Ctx cr.DeviceContext
+    PrecomputeFactor int32
+    C int32
+    Bitsize int32
+    LargeBucketFactor int32
+    batchSize int32
+    areScalarsOnDevice bool
+    AreScalarsMontgomeryForm bool
+    arePointsOnDevice bool
+    ArePointsMontgomeryForm bool
+    areResultsOnDevice bool
+    IsBigTriangle bool
+    IsAsync bool
+}
+```
+
+### Fields
+
+- **Ctx**: Device context containing details like device id and stream.
+- **PrecomputeFactor**: Controls the number of extra points to pre-compute.
+- **C**: Window bitsize, a key parameter in the "bucket method" for MSM.
+- **Bitsize**: Number of bits of the largest scalar.
+- **LargeBucketFactor**: Sensitivity to frequently occurring buckets.
+- **batchSize**: Number of results to compute in one batch.
+- **areScalarsOnDevice**: Indicates if scalars are located on the device.
+- **AreScalarsMontgomeryForm**: True if scalars are in Montgomery form.
+- **arePointsOnDevice**: Indicates if points are located on the device.
+- **ArePointsMontgomeryForm**: True if point coordinates are in Montgomery form.
+- **areResultsOnDevice**: Indicates if results are stored on the device.
+- **IsBigTriangle**: If `true`, MSM will use Large triangle accumulation; if `false`, Bucket accumulation will be used. Default value: `false`.
+- **IsAsync**: If true, runs MSM asynchronously.
+
+### Default Configuration
+
+Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be customized as needed.
+
+```go
+func GetDefaultMSMConfig() MSMConfig
+```
+
+
+## How do I toggle between the supported algorithms?
+
+When creating your MSM Config you may state which algorithm you wish to use. `cfg.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.IsBigTriangle = false` will activate Bucket accumulation.
+
+```go
+...
+
+// Obtain the default MSM configuration.
+cfg := GetDefaultMSMConfig()
+
+cfg.IsBigTriangle = true
+
+...
+```
+
+## How do I toggle between MSM modes?
+
+Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `MSM` function.
+
+The number of results is interpreted from the size of `var out core.DeviceSlice`. Thus it's important when allocating memory for `var out core.DeviceSlice` to make sure that you are allocating `<number of results> X <size of a single point>`.
+
+```go
+... 
+
+batchSize := 3
+var p G2Projective
+var out core.DeviceSlice
+out.Malloc(batchSize*p.Size(), p.Size())
+
+...
+```
+
+## Support for G2 group
+
+To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled.
+
+```bash
+./build.sh bls12_381 ON
+```
+
+Now when importing `icicle`, you should have access to G2 features.
+
+```go
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+)
+```
+
+These features include `G2Projective` and `G2Affine` points as well as a `G2Msm` method.
+
+```go
+...
+
+cfg := GetDefaultMSMConfig()
+size := 1 << 12
+batchSize := 3
+totalSize := size * batchSize
+scalars := GenerateScalars(totalSize)
+points := G2GenerateAffinePoints(totalSize)
+
+var p G2Projective
+var out core.DeviceSlice
+out.Malloc(batchSize*p.Size(), p.Size())
+G2Msm(scalars, points, &cfg, out)
+
+...
+```
+
+`G2Msm` works the same way as normal MSM; the difference is that it uses G2 points.
+
+Additionally, when you are building your application, make sure to use the g2 feature flag:
+
+```bash
+go build -tags=g2
+```
--- a/docs/docs/icicle/golang-bindings/multi-gpu.md
+++ b/docs/docs/icicle/golang-bindings/multi-gpu.md
@@ -0,0 +1,139 @@
+# Multi GPU APIs
+
+To learn more about the theory of Multi GPU programming refer to [this part](../multi-gpu.md) of the documentation.
+
+Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example).
+
+
+## A Multi GPU example
+
+In this example we will show how you can:
+
+1. Fetch the number of devices installed on a machine
+2. For every GPU launch a thread and set an active device per thread.
+3. Execute a MSM on each GPU
+
+
+```go
+func main() {
+	numDevices, _ := cuda_runtime.GetDeviceCount()
+	fmt.Println("There are ", numDevices, " devices available")
+	wg := sync.WaitGroup{}
+
+	for i := 0; i < numDevices; i++ {
+		wg.Add(1)
+        // RunOnDevice makes sure each MSM runs on a single thread
+		cuda_runtime.RunOnDevice(i, func(args ...any) {
+			defer wg.Done()
+			cfg := GetDefaultMSMConfig()
+			cfg.IsAsync = true
+			for _, power := range []int{10, 18} {
+				size := 1 << power // 2^pwr
+
+                // generate random scalars
+				scalars := GenerateScalars(size)
+				points := GenerateAffinePoints(size)
+
+                // create a stream and allocate result pointer
+				stream, _ := cuda_runtime.CreateStream()
+				var p Projective
+				var out core.DeviceSlice
+				_, e := out.MallocAsync(p.Size(), p.Size(), stream)
+                // assign stream to device context
+				cfg.Ctx.Stream = &stream
+
+                // execute MSM
+				e = Msm(scalars, points, &cfg, out)
+                // read result from device
+				outHost := make(core.HostSlice[Projective], 1)
+				outHost.CopyFromDeviceAsync(&out, stream)
+				out.FreeAsync(stream)
+
+                // sync the stream
+				cuda_runtime.SynchronizeStream(&stream)
+			}
+		})
+	}
+	wg.Wait()
+}
+```
+
+This example demonstrates a basic pattern for distributing tasks across multiple GPUs. The `RunOnDevice` function ensures that each goroutine is executed on its designated GPU and a corresponding thread.
+
+## Device Management API
+
+To streamline device management, the `cuda_runtime` package provides methods for dealing with devices.
+
+### `RunOnDevice`
+
+Runs a given function on a specific GPU device, ensuring that all CUDA calls within the function are executed on the selected device.
+
+In Go, most concurrency can be done via Goroutines. However, there is no guarantee that a goroutine stays on a specific host thread. 
+
+`RunOnDevice` was designed to solve this caveat and ensure that the goroutine will stay on a specific host thread.
+
+`RunOnDevice` locks a goroutine to a specific host thread, sets the current GPU device, runs the provided function, and unlocks the goroutine from the host thread after the provided function finishes.
+
+While the goroutine is locked to the host thread, the Go runtime will not assign other goroutines to that host thread.
+
+**Parameters:**
+
+- `deviceId int`: The ID of the device on which to run the provided function. Device IDs start from 0.
+- `funcToRun func(args ...any)`: The function to be executed on the specified device.
+- `args ...any`: Arguments to be passed to `funcToRun`.
+
+**Behavior:**
+
+- The function `funcToRun` is executed in a new goroutine that is locked to a specific OS thread to ensure that all CUDA calls within the function target the specified device.
+- It's important to note that any goroutines launched within `funcToRun` are not automatically bound to the same GPU device. If necessary, `RunOnDevice` should be called again within such goroutines with the same `deviceId`.
+
+**Example:**
+
+```go
+RunOnDevice(0, func(args ...any) {
+	fmt.Println("This runs on GPU 0")
+	// CUDA-related operations here will target GPU 0
+}, nil)
+```
+
+### `SetDevice`
+
+Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
+
+**Parameters:**
+
+- `device int`: The ID of the device to set as the current device.
+
+**Returns:**
+
+- `CudaError`: Error code indicating the success or failure of the operation.
+
+### `GetDeviceCount`
+
+Retrieves the number of CUDA-capable devices available on the host.
+
+**Returns:**
+
+- `(int, CudaError)`: The number of devices and an error code indicating the success or failure of the operation.
+
+### `GetDevice`
+
+Gets the ID of the currently active device for the calling host thread.
+
+**Returns:**
+
+- `(int, CudaError)`: The ID of the current device and an error code indicating the success or failure of the operation.
+
+### `GetDeviceFromPointer`
+
+Retrieves the device associated with a given pointer.
+
+**Parameters:**
+
+- `ptr unsafe.Pointer`: Pointer to query.
+
+**Returns:**
+
+- `int`: The device ID associated with the memory pointed to by `ptr`.
+
+This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.
--- a/docs/docs/icicle/golang-bindings/ntt.md
+++ b/docs/docs/icicle/golang-bindings/ntt.md
@@ -0,0 +1,104 @@
+# NTT
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
+
+## NTT Example
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+
+func main() {
+    // Obtain the default NTT configuration with a predefined coset generator.
+    cfg := GetDefaultNttConfig()
+    
+    // Define the size of the input scalars.
+    size := 1 << 18
+
+    // Generate scalars for the NTT operation.
+    scalars := GenerateScalars(size)
+
+    // Set the direction of the NTT (forward or inverse).
+    dir := core.KForward
+
+    // Allocate memory for the results of the NTT operation.
+    results := make(core.HostSlice[ScalarField], size)
+
+    // Perform the NTT operation.
+    err := Ntt(scalars, dir, &cfg, results)
+    if err != cr.CudaSuccess {
+        panic("NTT operation failed")
+    }
+}
+```
+
+## NTT Method
+
+```go
+func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
+```
+
+### Parameters
+
+- **scalars**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
+- **dir**: The direction of the NTT operation (`KForward` or `KInverse`).
+- **cfg**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
+- **results**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.
+
+### Return Value
+
+- **IcicleError**: Returns an error value indicating the success or failure of the NTT operation.
+
+## NTT Configuration (NTTConfig)
+
+The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
+
+```go
+type NTTConfig[T any] struct {
+    Ctx cr.DeviceContext
+    CosetGen T
+    BatchSize int32
+    ColumnsBatch bool
+    Ordering Ordering
+    areInputsOnDevice  bool
+    areOutputsOnDevice bool
+    IsAsync bool
+    NttAlgorithm NttAlgorithm
+}
+```
+
+### Fields
+
+- **Ctx**: Device context containing details like device ID and stream ID.
+- **CosetGen**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
+- **BatchSize**: The number of NTTs to compute in one operation, defaulting to 1.
+- **ColumnsBatch**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+- **Ordering**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
+- **areInputsOnDevice**: Indicates if input scalars are located on the device.
+- **areOutputsOnDevice**: Indicates if results are stored on the device.
+- **IsAsync**: Controls whether the NTT operation runs asynchronously.
+- **NttAlgorithm**: Explicitly select the NTT algorithm. Default value: Auto (the implementation selects radix-2 or mixed-radix algorithm based on heuristics).
+
+### Default Configuration
+
+Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
+
+```go
+func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
+```
+
+### Initializing the NTT Domain
+
+Before performing NTT operations, it's necessary to initialize the NTT domain; it only needs to be called once per GPU since the twiddles are cached.
+
+```go
+func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bool) core.IcicleError
+```
+
+This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.
--- a/docs/docs/icicle/golang-bindings/vec-ops.md
+++ b/docs/docs/icicle/golang-bindings/vec-ops.md
@@ -0,0 +1,132 @@
+# Vector Operations
+
+## Overview
+
+The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
+
+## Example
+
+### Vector addition
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+
+func main() {
+    testSize := 1 << 12
+    a := GenerateScalars(testSize)
+    b := GenerateScalars(testSize)
+    out := make(core.HostSlice[ScalarField], testSize)
+    cfg := core.DefaultVecOpsConfig()
+
+    // Perform vector addition
+    err := VecOp(a, b, out, cfg, core.Add)
+    if err != cr.CudaSuccess {
+        panic("Vector addition failed")
+    }
+}
+```
+
+### Vector Subtraction
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+
+func main() {
+    testSize := 1 << 12
+    a := GenerateScalars(testSize)
+    b := GenerateScalars(testSize)
+    out := make(core.HostSlice[ScalarField], testSize)
+    cfg := core.DefaultVecOpsConfig()
+
+    // Perform vector subtraction
+    err := VecOp(a, b, out, cfg, core.Sub)
+    if err != cr.CudaSuccess {
+        panic("Vector subtraction failed")
+    }
+}
+```
+
+### Vector Multiplication
+
+```go
+package main
+
+import (
+    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
+    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+)
+
+func main() {
+    testSize := 1 << 12
+    a := GenerateScalars(testSize)
+    b := GenerateScalars(testSize)
+    out := make(core.HostSlice[ScalarField], testSize)
+    cfg := core.DefaultVecOpsConfig()
+
+    // Perform vector multiplication
+    err := VecOp(a, b, out, cfg, core.Mul)
+    if err != cr.CudaSuccess {
+        panic("Vector multiplication failed")
+    }
+}
+```
+
+## VecOps Method
+
+```go
+func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
+```
+
+### Parameters
+
+- **a**: The first input vector.
+- **b**: The second input vector.
+- **out**: The output vector where the result of the operation will be stored.
+- **config**: A `VecOpsConfig` object containing various configuration options for the vector operations.
+- **op**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.
+
+### Return Value
+
+- **CudaError**: Returns a CUDA error code indicating the success or failure of the vector operation.
+
+## VecOpsConfig
+
+The `VecOpsConfig` structure holds configuration parameters for the vector operations, allowing customization of its behavior.
+
+```go
+type VecOpsConfig struct {
+    Ctx cr.DeviceContext
+    isAOnDevice bool
+    isBOnDevice bool
+    isResultOnDevice bool
+    IsResultMontgomeryForm bool
+    IsAsync bool
+}
+```
+
+### Fields
+
+- **Ctx**: Device context containing details like device ID and stream ID.
+- **isAOnDevice**: Indicates if vector `a` is located on the device.
+- **isBOnDevice**: Indicates if vector `b` is located on the device.
+- **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
+- **IsResultMontgomeryForm**: Determines if the result vector should be in Montgomery form.
+- **IsAsync**: Controls whether the vector operation runs asynchronously.
+
+### Default Configuration
+
+Use `DefaultVecOpsConfig` to obtain a default configuration, customizable as needed.
+
+```go
+func DefaultVecOpsConfig() VecOpsConfig
+```
--- a/docs/docs/icicle/overview.md
+++ b/docs/docs/icicle/overview.md
@@ -1,8 +1,8 @@
 # What is ICICLE?

-[![Static Badge](https://img.shields.io/badge/Latest-v1.4.0-8a2be2)](https://github.com/ingonyama-zk/icicle/releases)
+[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
+

-![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue)



--- a/docs/docs/icicle/polynomials/ffi.uml
+++ b/docs/docs/icicle/polynomials/ffi.uml
@@ -0,0 +1,27 @@
+@startuml
+skinparam componentStyle uml2
+
+' Define Components
+component "C++ Template\nComponent" as CppTemplate {
+  [Parameterizable Interface]
+}
+component "C API Wrapper\nComponent" as CApiWrapper {
+  [C API Interface]
+}
+component "Rust Code\nComponent" as RustCode {
+  [Macro Interface\n(Template Instantiation)]
+}
+
+' Define Artifact
+artifact "Static Library\n«artifact»" as StaticLib
+
+' Connections
+CppTemplate -down-> CApiWrapper : Instantiates
+CApiWrapper .down.> StaticLib : Compiles into
+RustCode -left-> StaticLib : Links against\nand calls via FFI
+
+' Notes
+note right of CppTemplate : Generic C++\ntemplate implementation
+note right of CApiWrapper : Exposes C API for FFI\nto Rust/Go
+note right of RustCode : Uses macros to\ninstantiate templates
+@enduml
--- a/docs/docs/icicle/polynomials/hw_backends.uml
+++ b/docs/docs/icicle/polynomials/hw_backends.uml
@@ -0,0 +1,86 @@
+@startuml
+
+' Define Interface for Polynomial Backend Operations
+interface IPolynomialBackend {
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Define Interface for Polynomial Context (State Management)
+interface IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' PolynomialAPI now uses two strategies: Backend and Context
+class PolynomialAPI {
+    -backendStrategy: IPolynomialBackend
+    -contextStrategy: IPolynomialContext
+    -setBackendStrategy(IPolynomialBackend)
+    -setContextStrategy(IPolynomialContext)
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Backend Implementations
+class GPUPolynomialBackend implements IPolynomialBackend {
+    #gpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class ZPUPolynomialBackend implements IPolynomialBackend {
+    #zpuResources: Resource
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+class TracerPolynomialBackend implements IPolynomialBackend {
+    #traceData: Data
+    +add()
+    +subtract()
+    +multiply()
+    +divide()
+    +evaluate()
+}
+
+' Context Implementations (Placeholder for actual implementation)
+class GPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class ZPUContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+class TracerContext implements IPolynomialContext {
+    +initFromCoeffs()
+    +initFromEvals()
+    +getCoeffs()
+    +getEvals()
+}
+
+' Relationships
+PolynomialAPI o-- IPolynomialBackend : uses
+PolynomialAPI o-- IPolynomialContext : uses
+@enduml
--- a/docs/docs/icicle/primitives/msm.md
+++ b/docs/docs/icicle/primitives/msm.md
@@ -49,13 +49,17 @@ Accelerating MSM is crucial to a ZK protocol's performance due to the [large per

 You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).

-# Using MSM
-
 ## Supported curves

 MSM supports the following curves:

-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
+
+
+## Supported Bindings
+
+- [Golang](../golang-bindings/msm.md)
+- [Rust](../rust-bindings/msm.md)

 ## Supported algorithms

@@ -79,25 +83,6 @@ Large Triangle Accumulation is a method for optimizing MSM which focuses on redu

 The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a <b>large batch of smaller MSM computations</b>.

-
-### How do I toggle between the supported algorithms?
-
-When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
-
-```rust
-...
-
-let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
-
-// is_big_triangle will determine which algorithm to use 
-cfg_bls12377.is_big_triangle = true;
-
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
-...
-```
-
-You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
-
 ## MSM Modes

 ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
@@ -109,54 +94,3 @@ Batch MSM allows you to run many MSMs with a single API call, Single MSM will la
 This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.

 Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
-
-### How do I toggle between MSM modes?
-
-Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
-
-```rust
-...
-
-let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
-
-...
-```
-
-In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
-
-
-In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
-
-```rust
-...
-
-let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
-
-...
-```
-
-Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
-
-
-## Support for G2 group
-
-MSM also supports G2 group. 
-
-Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
-
-```rust
-... 
-
-let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
-let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
-let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
-
-msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
-
-...
-```
-
-Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.
--- a/docs/docs/icicle/primitives/ntt.md
+++ b/docs/docs/icicle/primitives/ntt.md
@@ -28,6 +28,10 @@ NTT supports the following curves:

 `bls12-377`, `bls12-381`, `bn-254`, `bw6-761`

+## Supported Bindings
+
+- [Golang](../golang-bindings/ntt.md)
+- [Rust](../rust-bindings/ntt.md)

 ### Examples

@@ -35,87 +39,6 @@ NTT supports the following curves:

 - [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)

-## NTT API overview
-
-```rust
-pub fn ntt<F>(
-    input: &HostOrDeviceSlice<F>,
-    dir: NTTDir,
-    cfg: &NTTConfig<F>,
-    output: &mut HostOrDeviceSlice<F>,
-) -> IcicleResult<()>
-```
-
-`ntt:ntt` expects:
-
-`input` - buffer to read the inputs of the NTT from. <br/>
-`dir` - whether to compute forward or inverse NTT. <br/>
-`cfg` - config used to specify extra arguments of the NTT. <br/>
-`output` - buffer to write the NTT outputs into. Must be of the same  size as input.
-
-The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
-
-### NTT Config
-
-```rust
-pub struct NTTConfig<'a, S> {
-    pub ctx: DeviceContext<'a>,
-    pub coset_gen: S,
-    pub batch_size: i32,
-    pub ordering: Ordering,
-    are_inputs_on_device: bool,    
-    are_outputs_on_device: bool,
-    pub is_async: bool,
-    pub ntt_algorithm: NttAlgorithm,
-}
-```
-
-The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
-
-#### Fields
-
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
-
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
-
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
-
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
-
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
-
- **`are_outputs_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. If the inputs and outputs are the same pointer NTT will be computed in place.
-
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
-
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
-`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
-`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to 
-
-
-#### Usage
-
-Example initialization with default settings:
-
-```rust
-let default_config = NTTConfig::default();
-```
-
-Customizing the configuration:
-
-```rust
-let custom_config = NTTConfig {
-    ctx: custom_device_context,
-    coset_gen: my_coset_generator,
-    batch_size: 10,
-    ordering: Ordering::kRN,
-    are_inputs_on_device: true,
-    are_outputs_on_device: true,
-    is_async: false,
-    ntt_algorithm: NttAlgorithm::MixedRadix,
-};
-```
-
 ### Ordering

 The `Ordering` enum defines how inputs and outputs are arranged for the NTT operation, offering flexibility in handling data according to different algorithmic needs or compatibility requirements. It primarily affects the sequencing of data points for the transform, which can influence both performance and the compatibility with certain algorithmic approaches. The available ordering options are:
@@ -140,15 +63,6 @@ NTT also supports two different modes `Batch NTT` and `Single NTT`

 Batch NTT allows you to run many NTTs with a single API call, Single MSM will launch a single MSM computation.

-You may toggle between single and batch NTT by simply configure `batch_size` to be larger then 1 in your `NTTConfig`.
-
-```rust
-let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
-cfg.batch_size = 10 // your ntt using this config will run in batch mode.
-```
-
-`batch_size=1` would keep our NTT in single NTT mode.
-
 Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.

 **Single NTT Mode**
@@ -232,9 +146,11 @@ Mixed Radix can reduce the number of stages required to compute for large inputs

 ### Which algorithm should I choose ?

-Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Its also more suited for inputs which are power of 2 (e.g., 256, 512, 1024). Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
+Both work only on inputs of power of 2 (e.g., 256, 512, 1024).

-Mixed radix on the other hand better for larger NTTs with larger input sizes which are not necessarily power of 2.
+Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
+
+Mixed radix on the other hand works better for larger NTTs with larger input sizes.

 Performance really depends on logn size, batch size, ordering, inverse, coset, coeff-field and which GPU you are using.

--- a/docs/docs/icicle/primitives/overview.md
+++ b/docs/docs/icicle/primitives/overview.md
@@ -6,5 +6,6 @@ This section of the documentation is dedicated to the ICICLE primitives, we will
 ## Supported primitives


- [MSM](./msm)
+- [MSM](./msm.md)
+- [NTT](./ntt.md)
 - [Poseidon Hash](./poseidon.md)
--- a/docs/docs/icicle/rust-bindings/msm-pre-computation.md
+++ b/docs/docs/icicle/rust-bindings/msm-pre-computation.md
@@ -0,0 +1,63 @@
+# MSM Pre computation
+
+To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
+
+### `precompute_bases`
+
+Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
+
+```rust
+pub fn precompute_bases<C: Curve + MSM<C>>(
+    points: &HostOrDeviceSlice<Affine<C>>,
+    precompute_factor: i32,
+    _c: i32,
+    ctx: &DeviceContext,
+    output_bases: &mut HostOrDeviceSlice<Affine<C>>,
+) -> IcicleResult<()>
+```
+
+
+#### Parameters
+
+- **`points`**: The original set of affine points ($P_1, P_2, ..., P_n$) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
+- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
+- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
+- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
+- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
+
+#### Returns
+
+`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
+
+#### Description
+
+This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(\text{precompute\_factor} - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
+
+The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
+
+#### Example Usage
+
+```rust
+let device_context = DeviceContext::default_for_device(0); // Use the default device
+let precompute_factor = 4; // Number of points to precompute
+let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
+
+// Precompute the bases using the specified factor
+precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
+    .expect("Failed to precompute bases");
+```
+
+### Benchmarks
+
+Benchmarks were performed on an Nvidia RTX 3090 Ti.
+
+| Pre-computation factor | bn254 size `2^20` MSM, ms.  | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
+| ------------- | ------------- | ------------- | ------------- | ------------- |
+| 1  | 14.1  | 82.8  | 25.5  | 136.7  |
+| 2  | 11.8  | 76.6  | 20.3  | 123.8  |
+| 4  | 10.9  | 73.8  | 18.1  | 117.8  |
+| 8  | 10.6  | 73.7  | 17.2  | 116.0  |
--- a/docs/docs/icicle/rust-bindings/msm.md
+++ b/docs/docs/icicle/rust-bindings/msm.md
@@ -0,0 +1,172 @@
+# MSM
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
+
+## Example
+
+```rust
+use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarCfg};
+use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
+use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
+
+fn main() {
+    let size: usize = 1 << 10; // Define the number of points and scalars
+
+    // Generate random points and scalars
+    println!("Generating random G1 points and scalars for BN254...");
+    let points = CurveCfg::generate_random_affine_points(size);
+    let scalars = ScalarCfg::generate_random(size);
+
+    // Wrap points and scalars in HostOrDeviceSlice for MSM
+    let points_host = HostOrDeviceSlice::Host(points);
+    let scalars_host = HostOrDeviceSlice::Host(scalars);
+
+    // Allocate memory on the CUDA device for MSM results
+    let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).expect("Failed to allocate CUDA memory for MSM results");
+
+    // Create a CUDA stream for asynchronous execution
+    let stream = CudaStream::create().expect("Failed to create CUDA stream");
+    let mut cfg = msm::MSMConfig::default();
+    cfg.ctx.stream = &stream;
+    cfg.is_async = true; // Enable asynchronous execution
+
+    // Execute MSM on the device
+    println!("Executing MSM on device...");
+    msm::msm(&scalars_host, &points_host, &cfg, &mut msm_results).expect("Failed to execute MSM");
+
+    // Synchronize CUDA stream to ensure MSM execution is complete
+    stream.synchronize().expect("Failed to synchronize CUDA stream");
+
+    // Optionally, move results to host for further processing or printing
+    println!("MSM execution complete.");
+}
+```
+
+## MSM API Overview
+
+```rust
+pub fn msm<C: Curve>(
+    scalars: &HostOrDeviceSlice<C::ScalarField>,
+    points: &HostOrDeviceSlice<Affine<C>>,
+    cfg: &MSMConfig,
+    results: &mut HostOrDeviceSlice<Projective<C>>,
+) -> IcicleResult<()>
+```
+
+### Parameters
+
+- **`scalars`**: A buffer containing the scalar values to be multiplied with corresponding points.
+- **`points`**: A buffer containing the points to be multiplied by the scalars.
+- **`cfg`**: MSM configuration specifying additional parameters for the operation.
+- **`results`**: A buffer where the results of the MSM operations will be stored.
+
+### MSM Config
+
+```rust
+pub struct MSMConfig<'a> {
+    pub ctx: DeviceContext<'a>,
+    points_size: i32,
+    pub precompute_factor: i32,
+    pub c: i32,
+    pub bitsize: i32,
+    pub large_bucket_factor: i32,
+    batch_size: i32,
+    are_scalars_on_device: bool,
+    pub are_scalars_montgomery_form: bool,
+    are_points_on_device: bool,
+    pub are_points_montgomery_form: bool,
+    are_results_on_device: bool,
+    pub is_big_triangle: bool,
+    pub is_async: bool,
+}
+```
+
+- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
+- **`points_size: i32`**: The number of points in the MSM. This field is not `pub`, so it is not set directly by the user.
+- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
+- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
+- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
+- **`large_bucket_factor: i32`**: Adjusts the algorithm's sensitivity to frequently occurring buckets, useful for non-uniform scalar distributions.
+- **`batch_size: i32`**: The number of MSMs to compute in a single batch, for leveraging parallelism.
+- **`are_scalars_montgomery_form`**: Set to `true` if scalars are in montgomery form.
+- **`are_points_montgomery_form`**: Set to `true` if points are in montgomery form.
+- **`are_scalars_on_device: bool`**, **`are_points_on_device: bool`**, **`are_results_on_device: bool`**: Indicate whether the corresponding buffers are on the device memory.
+- **`is_big_triangle`**: If `true`, MSM will run with Large triangle accumulation; if `false`, Bucket accumulation will be used. Default value: `false`.
+- **`is_async: bool`**: Whether to perform the MSM operation asynchronously.
+
+### Usage
+
+The `msm` function is designed to compute the sum of multiple scalar-point multiplications efficiently. It supports both single MSM operations and batched operations for increased performance. The configuration allows for detailed control over the execution environment and performance characteristics of the MSM operation.
+
+When performing MSM operations, it's crucial to match the size of the `scalars` and `points` arrays correctly and ensure that the `results` buffer is appropriately sized to hold the output. The `MSMConfig` should be set up to reflect the specifics of the operation, including whether the operation should be asynchronous and any device-specific settings.
+
+## How do I toggle between the supported algorithms?
+
+When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
+
+```rust
+...
+
+let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
+
+// is_big_triangle will determine which algorithm to use 
+cfg_bls12377.is_big_triangle = true;
+
+msm::msm(&scalars, &points, &cfg_bls12377, &mut msm_results).unwrap();
+...
+```
+
+You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
+
+
+## How do I toggle between MSM modes?
+
+Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
+
+```rust
+...
+
+let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
+
+...
+```
+
+In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
+
+
+In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
+
+```rust
+...
+
+let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
+msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+
+...
+```
+
+Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
+
+## Support for G2 group
+
+MSM also supports G2 group. 
+
+Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
+
+```rust
+... 
+
+let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
+let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
+let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
+
+msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
+
+...
+```
+
+Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.
--- a/docs/docs/icicle/rust-bindings/multi-gpu.md
+++ b/docs/docs/icicle/rust-bindings/multi-gpu.md
@@ -4,6 +4,54 @@ To learn more about the theory of Multi GPU programming refer to [this part](../

 Here we will cover the core multi GPU apis and a [example](#a-multi-gpu-example)

+
+## A Multi GPU example
+
+In this example we will display how you can
+
+1. Fetch the number of devices installed on a machine
+2. For every GPU launch a thread and set an active device per thread.
+3. Execute an MSM on each GPU
+
+
+
+```rust
+
+...
+
+let device_count = get_device_count().unwrap();
+
+(0..device_count)
+        .into_par_iter()
+        .for_each(move |device_id| {
+          set_device(device_id).unwrap();
+
+          // you can allocate points and scalars_d here
+
+          let mut cfg = MSMConfig::default_for_device(device_id);
+          cfg.ctx.stream = &stream;
+          cfg.is_async = true;
+          cfg.are_scalars_montgomery_form = true;
+          msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
+
+          // collect and process results
+        })
+
+...
+```
+
+
+We use `get_device_count` to fetch the number of connected devices, device IDs will be `0, 1, 2, ..., device_count - 1`
+
+[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
+
+We then call `set_device(device_id).unwrap();` it should set the context of that thread to the selected `device_id`.
+
+Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`, behind the scene this will create for us a `DeviceContext` configured for that specific GPU. 
+
+We finally call our `msm` method.
+
+
 ## Device management API

 To streamline device management we offer as part of `icicle-cuda-runtime` package methods for dealing with devices.
@@ -152,50 +200,3 @@ let device_id: i32 = 0; // Example device ID
 check_device(device_id);
 // Ensures that the current context is correctly set for the specified device ID.
 ```
-
-
-## A Multi GPU example
-
-In this example we will display how you can
-
-1. Fetch the number of devices installed on a machine
-2. For every GPU launch a thread and set a active device per thread.
-3. Execute a MSM on each GPU
-
-
-
-```rust
-
-...
-
-let device_count = get_device_count().unwrap();
-
-(0..device_count)
-        .into_par_iter()
-        .for_each(move |device_id| {
-          set_device(device_id).unwrap();
-
-          // you can allocate points and scalars_d here
-
-          let mut cfg = MSMConfig::default_for_device(device_id);
-          cfg.ctx.stream = &stream;
-          cfg.is_async = true;
-          cfg.are_scalars_montgomery_form = true;
-          msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
-
-          // collect and process results
-        })
-
-...
-```
-
-
-We use `get_device_count` to fetch the number of connected devices, device IDs will be `0...device_count-1`
-
-[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
-
-We then call `set_device(device_id).unwrap();` it should set the context of that thread to the selected `device_id`.
-
-Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`, behind the scene this will create for us a `DeviceContext` configured for that specific GPU. 
-
-We finally call our `msm` method.
--- a/docs/docs/icicle/rust-bindings/ntt.md
+++ b/docs/docs/icicle/rust-bindings/ntt.md
@@ -0,0 +1,199 @@
+# NTT
+
+### Supported curves
+
+`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
+
+## Example 
+
+```rust
+use icicle_bn254::curve::{ScalarCfg, ScalarField};
+use icicle_core::{ntt::{self, NTT}, traits::GenerateRandom};
+use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
+
+fn main() {
+    let size = 1 << 12; // Define the size of your input, e.g., 2^12
+
+    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
+        size.try_into()
+            .unwrap(),
+    ).unwrap();
+
+    // Generate random inputs
+    println!("Generating random inputs...");
+    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
+
+    // Allocate memory on CUDA device for NTT results
+    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate CUDA memory");
+
+    // Create a CUDA stream
+    let stream = CudaStream::create().expect("Failed to create CUDA stream");
+    let ctx = DeviceContext::default(); // Assuming default device context
+    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+
+    // Configure NTT
+    let mut cfg = ntt::NTTConfig::default();
+    cfg.ctx.stream = &stream;
+    cfg.is_async = true; // Set to true for asynchronous execution
+
+    // Execute NTT on device
+    println!("Executing NTT on device...");
+    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).expect("Failed to execute NTT");
+
+    // Synchronize CUDA stream to ensure completion
+    stream.synchronize().expect("Failed to synchronize CUDA stream");
+
+    // Optionally, move results to host for further processing or verification
+    println!("NTT execution complete.");
+}
+```
+
+## NTT API overview
+
+```rust
+pub fn ntt<F>(
+    input: &HostOrDeviceSlice<F>,
+    dir: NTTDir,
+    cfg: &NTTConfig<F>,
+    output: &mut HostOrDeviceSlice<F>,
+) -> IcicleResult<()>
+```
+
+`ntt::ntt` expects:
+
+`input` - buffer to read the inputs of the NTT from. <br/>
+`dir` - whether to compute forward or inverse NTT. <br/>
+`cfg` - config used to specify extra arguments of the NTT. <br/>
+`output` - buffer to write the NTT outputs into. Must be of the same  size as input.
+
+The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
+
+
+### NTT Config
+
+```rust
+pub struct NTTConfig<'a, S> {
+    pub ctx: DeviceContext<'a>,
+    pub coset_gen: S,
+    pub batch_size: i32,
+    pub columns_batch: bool,
+    pub ordering: Ordering,
+    are_inputs_on_device: bool,    
+    are_outputs_on_device: bool,
+    pub is_async: bool,
+    pub ntt_algorithm: NttAlgorithm,
+}
+```
+
+The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
+
+#### Fields
+
+- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
+
+- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
+
+- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
+
+- **`columns_batch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
+
+- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
+
+- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
+
+- **`are_outputs_on_device: bool`**: Indicates whether the output buffer resides in device memory. If `false`, outputs will be copied from device to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
+
+- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
+
+- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
+`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
+`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to use that specific algorithm.
+
+
+#### Usage
+
+Example initialization with default settings:
+
+```rust
+let default_config = NTTConfig::default();
+```
+
+Customizing the configuration:
+
+```rust
+let custom_config = NTTConfig {
+    ctx: custom_device_context,
+    coset_gen: my_coset_generator,
+    batch_size: 10,
+    columns_batch: false,
+    ordering: Ordering::kRN,
+    are_inputs_on_device: true,
+    are_outputs_on_device: true,
+    is_async: false,
+    ntt_algorithm: NttAlgorithm::MixedRadix,
+};
+```
+
+
+### Modes
+
+NTT supports two different modes `Batch NTT` and `Single NTT`
+
+You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
+
+```rust
+let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
+cfg.batch_size = 10; // your ntt using this config will run in batch mode.
+```
+
+`batch_size=1` would keep our NTT in single NTT mode.
+
+Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
+
+### Initializing the NTT Domain
+
+Before performing NTT operations, it's necessary to initialize the NTT domain. Initialization only needs to be done once per GPU since the twiddles are cached.
+
+```rust
+ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
+```
+
+### `initialize_domain`
+
+```rust
+pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: NTT<F>;
+```
+
+#### Parameters
+
+- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
+
+- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
+
+#### Returns
+
+- **`IcicleResult<()>`**: Will return an error if the operation fails.
+
+### `initialize_domain_fast_twiddles_mode`
+
+Similar to `initialize_domain`, `initialize_domain_fast_twiddles_mode` is a faster implementation and can be used for larger NTTs.
+
+```rust
+pub fn initialize_domain_fast_twiddles_mode<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: NTT<F>;
+```
+
+#### Parameters
+
+- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
+
+- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
+
+#### Returns
+
+- **`IcicleResult<()>`**: Will return an error if the operation fails.
--- a/docs/docs/icicle/rust-bindings/vec-ops.md
+++ b/docs/docs/icicle/rust-bindings/vec-ops.md
@@ -9,6 +9,57 @@ Vector operations are supported on the following curves:

 `bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`

+## Examples
+
+### Addition of Scalars
+
+```rust
+use icicle_bn254::curve::{ScalarCfg, ScalarField};
+use icicle_core::traits::{FieldImpl, GenerateRandom};
+use icicle_core::vec_ops::{add_scalars, VecOpsConfig};
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+// Number of elements in each vector; `a`, `b` and `result` all share this length.
+let test_size = 1 << 18;
+
+// Two random input vectors and a zero-filled output vector, all created from host data.
+let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
+let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
+let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
+
+// Element-wise a + b, written into `result`.
+let cfg = VecOpsConfig::default();
+add_scalars(&a, &b, &mut result, &cfg).unwrap();
+```
+
+### Subtraction of Scalars
+
+```rust
+use icicle_bn254::curve::{ScalarCfg, ScalarField};
+use icicle_core::traits::{FieldImpl, GenerateRandom};
+use icicle_core::vec_ops::{sub_scalars, VecOpsConfig};
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+// Number of elements in each vector; `a`, `b` and `result` all share this length.
+let test_size = 1 << 18;
+
+// Two random input vectors and a zero-filled output vector, all created from host data.
+let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
+let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
+let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
+
+// Element-wise a - b, written into `result`.
+let cfg = VecOpsConfig::default();
+sub_scalars(&a, &b, &mut result, &cfg).unwrap();
+```
+
+### Multiplication of Scalars
+
+```rust
+use icicle_bn254::curve::{ScalarCfg, ScalarField};
+use icicle_core::traits::{FieldImpl, GenerateRandom};
+use icicle_core::vec_ops::{mul_scalars, VecOpsConfig};
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+// Number of elements in each vector; `a`, `ones` and `result` all share this length.
+let test_size = 1 << 18;
+
+// A random input vector, a vector of ones, and a zero-filled output vector, all created from host data.
+let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
+let ones: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::one(); test_size]);
+let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
+
+// Element-wise a * ones, written into `result`.
+let cfg = VecOpsConfig::default();
+mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
+```
+
+
 ## Vector Operations Configuration

 The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
@@ -101,43 +152,8 @@ pub trait VecOps<F> {

 #### Methods

+All operations are element-wise, and the results are placed into the `result` param. These operations are not in place.
+
 - **`add`**: Computes the element-wise sum of two vectors.
 - **`sub`**: Computes the element-wise difference between two vectors.
 - **`mul`**: Performs element-wise multiplication of two vectors.
-
-### Argument Validation
-
-Before invoking any of the above vector operations, we always call `check_vec_ops_args`, to make sure that inputs `a` and `b` can be operated on with and that the results pointer can contain the result:
-
-```rust
-fn check_vec_ops_args<F>(a: &HostOrDeviceSlice<F>, b: &HostOrDeviceSlice<F>, result: &mut HostOrDeviceSlice<F>) {
-    if a.len() != b.len() || a.len() != result.len() {
-        panic!(
-            "left, right and output lengths {}; {}; {} do not match",
-            a.len(),
-            b.len(),
-            result.len()
-        );
-    }
-}
-```
-
-### Examples
-
-#### Addition of Scalars
-
-```rust
-
-```
-
-#### Subtraction of Scalars
-
-```rust
-
-```
-
-#### Multiplication of Scalars
-
-```rust
-
-```
--- a/docs/docs/icicle/supporting-additional-curves.md
+++ b/docs/docs/icicle/supporting-additional-curves.md
@@ -6,7 +6,56 @@ We understand the need for ZK developers to use different curves, some common so

 ICICLE core is very generic by design so all algorithms and primitives are designed to work based of configuration files [selected during compile](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh) time. This is why we compile ICICLE Core per curve.

-To add support a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
+To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
+
+### Adding curve_name_params.cuh
+
+Start by copying the contents of `bn254_params.cuh` into your params file. Params should include:
+ - **fq_config** - parameters of the Base field.
+    - **limbs_count** - `ceil(field_byte_size / 4)`.
+    - **modulus_bit_count** - bit-size of the modulus.
+    - **num_of_reductions** - the number of times to reduce in reduce function. Use 2 if not sure.
+    - **modulus** - modulus of the field.
+    - **modulus_2** - modulus * 2.
+    - **modulus_4** - modulus * 4. 
+    - **neg_modulus** - negated modulus. 
+    - **modulus_wide** - modulus represented as a double-sized integer.
+    - **modulus_squared** - modulus**2 represented as a double-sized integer.
+    - **modulus_squared_2** - 2 * modulus**2 represented as a double-sized integer.
+    - **modulus_squared_4** - 4 * modulus**2 represented as a double-sized integer.
+    - **m** - value used in multiplication. Can be computed as `2**(2*modulus_bit_count) // modulus`. 
+    - **one** - multiplicative identity. 
+    - **zero** - additive identity. 
+    - **montgomery_r** - `2 ** M % modulus` where M is the modulus bit size rounded up to a multiple of 32, e.g. 384 or 768 for the bls and bw curves respectively.
+    - **montgomery_r_inv** - `2 ** (-M) % modulus`
+ - **fp_config** - parameters of the Scalar field.
+    Same as fq_config, but with additional arguments:
+    - **omegas_count** - [two-adicity](https://cryptologie.net/article/559/whats-two-adicity/) of the field. And thus the maximum size of NTT.
+    - **omegas** - an array of omegas for NTTs. An array of size `omegas_count`. The ith element is equal to `1.nth_root(2**(2**(omegas_count-i)))`.
+    - **inv** - an array of inverses of powers of two in a field. Ith element is equal to `(2 ** (i+1)) ** -1`.
+ - **G1 generators points** - affine coordinates of the generator point.
+ - **G2 generators points** - affine coordinates of the extension generator. Remove these if `G2` is not supported.
+ - **Weierstrass b value** - base field element equal to value of `b` in the curve equation.
+ - **Weierstrass b value G2** - base field element equal to value of `b` for the extension. Remove this if `G2` is not supported.
+ 
+ :::note
+
+ None of the params are in Montgomery form.
+ 
+ :::
+ 
+ :::note
+
+ To convert number values into `storage` type you can use the following python function
+
+```python
+import struct
+
+def unpack(x, field_size):
+    return ', '.join(["0x" + format(x, '08x') for x in struct.unpack('I' * (field_size) // 4, int(x).to_bytes(field_size, 'little'))])
+```
+
+:::

 We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29), we need to add a new curve id.

@@ -28,58 +77,40 @@ Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicl
 Finally we must modify the [`make` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.

 ```
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;<curve_name>)
+set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin;<curve_name>)
 ```

+### Adding Poseidon support
+
+If you want your curve to implement a Poseidon hash function or a tree builder, you will need to pre-calculate its optimized parameters.  
+Copy [constants_template.h](https://github.com/ingonyama-zk/icicle/blob/main/icicle/appUtils/poseidon/constants/constants_template.h) into `icicle/appUtils/poseidon/constants/<CURVE>_poseidon.h`. Run the [constants generation script](https://dev.ingonyama.com/icicle/primitives/poseidon#constants). The script will print the number of partial rounds and generate a `constants.bin` file. Use `xxd -i constants.bin` to parse the file into C declarations. Copy the `unsigned char constants_bin[]` contents inside your new file. Repeat this process for arities 2, 4, 8 and 11.
+
+After you've generated the constants, add your curve to the [SUPPORTED_CURVES_WITH_POSEIDON](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L72) list in `CMakeLists.txt`.
+
 ## Bindings

-In order to support a new curves in the binding libraries you first must support it in ICICLE core.
+In order to support a new curve in the binding libraries you first must support it in ICICLE core.

 ### Rust

-Create a new folder named `icicle-<curve_name>` under the [rust wrappers folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves). Your new directory should look like this.
+Go to [rust curves folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves) and copy `icicle-curve-template` to a new folder named `icicle-<curve_name>`.

-```
-└── rust
-    ├── icicle-curves
-        ├── icicle-<curve_name>
-    │   │   ├── Cargo.toml
-    │   │   ├── build.rs
-    │   │   └── src/
-    │   │       ├── curve.rs
-    │   │       ├── lib.rs
-    │   │       ├── msm/
-    │   │       │   └── mod.rs
-    │   │       └── ntt/
-    │   │           └── mod.rs
-```
+Find all the occurrences of `<CURVE>` placeholder inside the crate. (You can use `Ctrl+Shift+F` in VS Code or `grep -nr "<CURVE>"` in bash). You will then need to replace each occurrence with your new curve name.

-Lets look at [`ntt/mod.rs`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs) for example.
+#### Limbs

-```
-...
+Go to your curve's `curve.rs` file and set `SCALAR_LIMBS`, `BASE_LIMBS` and `G2_BASE_LIMBS` (if G2 is needed) to a minimum number of `u64` required to store a single scalar field / base field element respectively.  
+e.g. for bn254, scalar field is 254 bit so `SCALAR_LIMBS` is set to 4.

-extern "C" {
-    #[link_name = "bn254NTTCuda"]
-    fn ntt_cuda<'a>(
-        input: *const ScalarField,
-        size: usize,
-        is_inverse: bool,
-        config: &NTTConfig<'a, ScalarField>,
-        output: *mut ScalarField,
-    ) -> CudaError;
+#### Primitives

-    #[link_name = "bn254DefaultNTTConfig"]
-    fn default_ntt_config() -> NTTConfig<'static, ScalarField>;
+If your curve doesn't support some of the primitives (ntt/msm/poseidon/merkle tree), or you simply don't want to include them, just remove the corresponding module from `src` and then its declaration from `lib.rs`.

-    #[link_name = "bn254InitializeDomain"]
-    fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
-}
+#### G2

-...
-```
+If your curve doesn't support G2 - remove all the code under `#[cfg(feature = "g2")]` and remove the feature from [Cargo.toml](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml#L29) and [build.rs](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs#L15).

-Here you would need to replace `bn254NTTCuda` with `<curve_name>NTTCuda`. Most of these changes are pretty straight forward. One thing you should pay attention to is limb sizes as these change for different curves. For example `BN254` [has limb size of 8](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs#L15) but for your curve this may be different.
+After this is done, add your new crate in the [global Cargo.toml](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/Cargo.toml).

 ### Golang

--- a/docs/docusaurus.config.js
+++ b/docs/docusaurus.config.js
@@ -29,13 +29,13 @@ const config = {
          remarkPlugins: [math, require('mdx-mermaid')],
          rehypePlugins: [katex],
          sidebarPath: require.resolve('./sidebars.js'),
-          editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
+          editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
        },
        blog: {
          remarkPlugins: [math, require('mdx-mermaid')],
          rehypePlugins: [katex],
          showReadingTime: true,
-          editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
+          editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
        },
        pages: {},
        theme: {
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -25,9 +25,46 @@ module.exports = {
          id: "icicle/integrations"
        },
        {
-          type: "doc",
+          type: "category",
          label: "Golang bindings",
-          id: "icicle/golang-bindings",
+          link: {
+            type: `doc`,
+            id: "icicle/golang-bindings",
+          },
+          collapsed: true,
+          items: [
+            {
+              type: "category",
+              label: "MSM",
+              link: {
+                type: `doc`,
+                id: "icicle/golang-bindings/msm",
+              },
+              collapsed: true,
+              items: [
+                {
+                  type: "doc",
+                  label: "MSM pre computation",
+                  id: "icicle/golang-bindings/msm-pre-computation",
+                }
+              ]
+            },
+            {
+              type: "doc",
+              label: "NTT",
+              id: "icicle/golang-bindings/ntt",
+            },
+            {
+              type: "doc",
+              label: "Vector operations",
+              id: "icicle/golang-bindings/vec-ops",
+            },
+            {
+            type: "doc",
+            label: "Multi GPU Support",
+            id: "icicle/golang-bindings/multi-gpu",
+            },
+          ]
        },
        {
          type: "category",
@@ -38,17 +75,38 @@ module.exports = {
          },
          collapsed: true,
          items: [
+            {
+              type: "category",
+              label: "MSM",
+              link: {
+                type: `doc`,
+                id: "icicle/rust-bindings/msm",
+              },
+              collapsed: true,
+              items: [
+                {
+                  type: "doc",
+                  label: "MSM pre computation",
+                  id: "icicle/rust-bindings/msm-pre-computation",
+                }
+              ]
+            },
            {
              type: "doc",
-              label: "Multi GPU Support",
-              id: "icicle/rust-bindings/multi-gpu",
+              label: "NTT",
+              id: "icicle/rust-bindings/ntt",
            },
            {
              type: "doc",
              label: "Vector operations",
              id: "icicle/rust-bindings/vec-ops",
            },
-          ]
+            {
+              type: "doc",
+              label: "Multi GPU Support",
+              id: "icicle/rust-bindings/multi-gpu",
+            },
+          ],
        },
        {
          type: "category",
@@ -66,14 +124,14 @@ module.exports = {
            },
            {
              type: "doc",
-              label: "Poseidon Hash",
-              id: "icicle/primitives/poseidon",
+              label: "NTT",
+              id: "icicle/primitives/ntt",
            },
            {
              type: "doc",
-              label: "NTT",
-              id: "icicle/primitives/ntt",
-            }
+              label: "Poseidon Hash",
+              id: "icicle/primitives/poseidon",
+            },
          ],
        },
        {
--- a/examples/c++/pedersen-commitment/CMakeLists.txt
+++ b/examples/c++/pedersen-commitment/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Build script for the Pedersen commitment example: compiles example.cu into
+# an executable named `example`.
+cmake_minimum_required(VERSION 3.18)
+# Require C++17 for both the host (CXX) and CUDA compilers.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+# Pick the GPU architecture before project() enables the CUDA language.
+# CMake older than 3.24 takes it from the user-supplied CUDA_ARCH variable.
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # `native` is only used on CMake 3.24+
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+# Extra nvcc flags for all builds; release flags are cleared, debug adds -g -G -O0.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+# example.cu includes nvml.h, so locate nvidia-ml (searching the CUDA stubs directory) and link it.
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
--- a/examples/c++/pedersen-commitment/README.md
+++ b/examples/c++/pedersen-commitment/README.md
@@ -0,0 +1,33 @@
+# ICICLE example: Pedersen Commitment
+
+## Best-Practices
+
+We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
+
+## Key-Takeaway
+
+A Pedersen Commitment is a cryptographic primitive to commit to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment does not reveal any information about the value) and binding properties (once a value is committed, it cannot be changed without detection).
+
+Pedersen commitment is based on Multi-Scalar Multiplication [MSM](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
+`ICICLE` provides CUDA C++ support for [MSM](https://dev.ingonyama.com/icicle/primitives/msm). 
+An example of MSM is [here](../msm/README.md).
+
+## Running the example
+
+- `cd` to your example directory
+- compile with  `./compile.sh`
+- run with `./run.sh`
+
+## Concise Explanation
+
+We recommend this simple [explanation](https://www.rareskills.io/post/pedersen-commitment).
+
+The original paper: T. P. Pedersen, "Non-Interactive and Information-Theoretic Secure Verifiable Secret Sharing," in Advances in Cryptology — CRYPTO ’91, Lecture Notes in Computer Science, vol 576. Springer, Berlin, Heidelberg.
+
+## What's in the example
+
+1. Define the curve and the size of commitment vector
+2. Use public random seed to transparently generate points on the elliptic curve without known discrete logarithm
+3. Generate (random) commitment vector and salt (a.k.a blinding factor)
+4. Configure and execute MSM using on-host data
+5. Output commitment as elliptic point
--- a/examples/c++/pedersen-commitment/compile.sh
+++ b/examples/c++/pedersen-commitment/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
--- a/examples/c++/pedersen-commitment/example.cu
+++ b/examples/c++/pedersen-commitment/example.cu
@@ -0,0 +1,159 @@
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+#include <cassert>
+#include <nvml.h>
+
+#define CURVE_ID BN254
+#include "appUtils/msm/msm.cu"
+using namespace curve_config;
+
+typedef point_field_t T;
+
+// Modular exponentiation by repeated squaring: returns base raised to exp,
+// with every product computed in the field T.
+T modPow(T base, T exp) {
+  T acc = T::one();
+  // Each round halves the remaining exponent and squares the running power.
+  for (T sq = base, rest = exp; rest != T::zero(); rest = T::div2(rest), sq = sq * sq) {
+    // An odd remaining exponent means the current power belongs in the result.
+    if (T::is_odd(rest)) acc = acc * sq;
+  }
+  return acc;
+}
+
+// Euler's criterion: y2 is reported as a quadratic residue exactly when
+// y2^((p-1)/2) equals one, where p - 1 is obtained as zero minus one in T.
+bool quadratic_residue(T y2) {
+  const T p_minus_one = T::zero() - T::one();
+  const T euler = modPow(y2, T::div2(p_minus_one));
+  return euler == T::one();
+}
+
+// modular square root adapted from:
+// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
+//
+// Looks for x with x*x == a in the field T.
+// Returns true and stores the root in *result on success; returns false and
+// leaves *result untouched when a fails the Euler-criterion test below.
+bool mySQRT(T a, T *result) {
+  // Zero is its own square root.
+  if (a == T::zero()) {
+    *result = T::zero();
+    return true;
+  }
+  // Euler's criterion: a^((p-1)/2) must equal one for a root to exist.
+  if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
+    return false; // solution does not exist
+  }
+  // TODO: consider special cases
+  // if (p % 4 == 3) return power(a, (p + 1) / 4, p); 
+  T s = T::zero() - T::one(); // p - 1, 
+  T n = T::one() + T::one(); //2;
+  T r = T::zero(); 
+  T m;
+  // Halve s while it is even, counting the halvings in r (p - 1 == s * 2^r afterwards).
+  while (T::is_even(s)) {
+    r = r + T::one();
+    s = T::div2(s); //s /= 2;
+  }
+  // find a non-square mod p
+  // (n counts up from 2 until n^((p-1)/2) equals p - 1)
+  while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
+    n = n + T::one();
+  }
+  // Starting values for the refinement loop: x = a^((s+1)/2), b = a^s, g = n^s.
+  T x = modPow(a, T::div2(s + T::one()));
+  T b = modPow(a, s);
+  T g = modPow(n, s);
+  // The loop's update expression replaces r with m after every pass.
+  for (;; r = m) {
+    T t = b;
+    // Square t until it becomes one; m counts the squarings and is capped at r.
+    for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t =  t * t;
+    // m == 0 means b is already one, so x is the square root.
+    if (m == T::zero() ) {
+      *result = x;
+      return true;
+    }
+    // gs = g^(2^(r-m-1)); note the exponent 2^(r-m-1) is itself computed with modPow, i.e. as a field element.
+    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
+    g = gs * gs ;
+    x = x * gs ;
+    b =  b * g ;
+  }
+}
+
+// Writes to *point a curve point whose x-coordinate is the first value at or
+// after `x` (stepping by one) for which x^3 + b is a quadratic residue, where
+// b is weierstrass_b; y is a square root of that value.
+void point_near_x(T x, affine_t *point) {
+  const T wb = T { weierstrass_b };
+  T y2 = x*x*x + wb;
+  // Advance x until the right-hand side of the curve equation has a square root.
+  while (!quadratic_residue(y2)) {
+    x = x + T::one();
+    y2 = x*x*x + wb;
+  }
+  T y;
+  // The search above only stops on a residue, so the solver is expected to
+  // succeed; verify both its return value and the root it produced.
+  const bool found = mySQRT(y2, &y);
+  assert(found && y*y == y2);
+  (void)found; // avoids an unused-variable warning when asserts are compiled out
+  point->x = x;
+  point->y = y;
+}
+
+static int seed = 0;
+// Returns a reproducible pseudo-random field value: every call seeds a fresh
+// generator with the current global `seed` and then increments `seed`.
+static HOST_INLINE T rand_host_seed()
+{
+  std::mt19937_64 generator(seed++);
+  std::uniform_int_distribution<unsigned> distribution;
+
+  // Start from zero so the top limb, which the loop below deliberately leaves
+  // alone, holds a defined value instead of whatever was in memory.
+  T value = T::zero();
+  // TODO: use the full range of limbs (i < T::TLC) and reduce the result:
+  //   while (lt(Field{get_modulus()}, value))
+  //     value = value - Field{get_modulus()};
+  for (unsigned i = 0; i < T::TLC - 1; i++)
+    value.limbs_storage.limbs[i] = distribution(generator);
+  return value;
+}
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// Demo driver: derives N+1 curve points from seeded field values, draws N
+// random scalars plus one salt, and prints the commitment computed by one MSM.
+int main(int argc, char** argv)
+{
+  const unsigned N = 1u << 10;
+  std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
+  // One starting x-coordinate per point: N for the committed values, one for the salt.
+  T* xs = new T[N+1];
+
+  std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
+  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
+  seed = 1234;
+  std::cout << "Using seed: " << seed << std::endl;
+  std::cout << "Generating random field values" << std::endl;
+  START_TIMER(gen);
+
+  // Fill all N+1 entries: the point loop below reads xs[N] for the salt's point.
+  for (unsigned i = 0; i < N+1; i++) {
+    xs[i] = rand_host_seed();
+  }
+  END_TIMER(gen, "Time to generate field values");
+  std::cout << "xs[0]: " << xs[0]  << std::endl;
+  std::cout << "xs[1]: " << xs[1]  << std::endl;
+
+  // affine_t points[N];
+  affine_t* points = new affine_t[N+1];
+  std::cout << "Generating point about random field values" << std::endl;
+  START_TIMER(points);
+  for (unsigned i = 0; i < N+1; i++) {
+    point_near_x(xs[i], &points[i]);
+  }
+  END_TIMER(points, "Time to generate points");
+
+  std::cout << "Generating commitment vector" << std::endl;
+  projective_t result;
+  scalar_t* scalars = new scalar_t[N+1];
+  scalar_t::RandHostMany(scalars, N);
+
+  // The last scalar is the salt; it pairs with the last point.
+  std::cout << "Generating salt" << std::endl;
+  scalars[N] = scalar_t::rand_host();
+
+  std::cout << "Executing MSM" << std::endl;
+  auto config = msm::DefaultMSMConfig<scalar_t>();
+  START_TIMER(msm);
+  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, N+1, config, &result);
+  END_TIMER(msm, "Time to execute MSM");
+
+  std::cout << "Computed commitment: " << result << std::endl;
+
+  std::cout << "Cleaning up..." << std::endl;
+  delete[] xs;
+  delete[] scalars;
+  delete[] points;
+  return 0;
+}
--- a/examples/c++/pedersen-commitment/run.sh
+++ b/examples/c++/pedersen-commitment/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example
--- a/examples/c++/polynomial_multiplication/example.cu
+++ b/examples/c++/polynomial_multiplication/example.cu
@@ -84,16 +84,14 @@ int main(int argc, char** argv)

      // (4) multiply A,B
      CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
-      vec_ops::VecOpsConfig<test_data> config {
+      vec_ops::VecOpsConfig<test_data> config{
        ntt_config.ctx,
        true,  // is_a_on_device
        true,  // is_b_on_device
        true,  // is_result_on_device
-        false, // is_montgomery
        false  // is_async
      };
-      CHK_IF_RETURN(
-        vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
+      CHK_IF_RETURN(vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));

      // (5) INTT (in place)
      ntt_config.are_inputs_on_device = true;
@@ -118,6 +116,7 @@ int main(int argc, char** argv)
  benchmark(false); // warmup
  benchmark(true, 20);

+  ntt::ReleaseDomain<test_scalar>(ntt_config.ctx);
  CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));

  return 0;
--- a/examples/rust/msm/Cargo.toml
+++ b/examples/rust/msm/Cargo.toml
@@ -8,12 +8,11 @@ icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
 icicle-core = { path = "../../../wrappers/rust/icicle-core" }
 icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
 icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
-ark-bn254 = { version = "0.4.0", optional = true}
-ark-bls12-377 = { version = "0.4.0", optional = true}
-ark-ec = { version = "0.4.0", optional = true}
+ark-bn254 = { version = "0.4.0", optional = true }
+ark-bls12-377 = { version = "0.4.0", optional = true }
+ark-ec = { version = "0.4.0", optional = true }
 clap = { version = "<=4.4.12", features = ["derive"] }

 [features]
 arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
 profile = []
-g2 = []
--- a/examples/rust/msm/src/main.rs
+++ b/examples/rust/msm/src/main.rs
@@ -4,7 +4,10 @@ use icicle_bls12_377::curve::{
    CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
 };

-use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
+use icicle_cuda_runtime::{
+    memory::{DeviceVec, HostSlice},
+    stream::CudaStream,
+};

 use icicle_core::{curve::Curve, msm, traits::GenerateRandom};

@@ -57,18 +60,18 @@ fn main() {
            log_size, size
        );
        // Setting Bn254 points and scalars
-        let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
-        let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
-        let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
+        let points = HostSlice::from_slice(&upper_points[..size]);
+        let g2_points = HostSlice::from_slice(&g2_upper_points[..size]);
+        let scalars = HostSlice::from_slice(&upper_scalars[..size]);

        // Setting bls12377 points and scalars
        // let points_bls12377 = &upper_points_bls12377[..size];
-        let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); //  &upper_points_bls12377[..size];
-        let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
+        let points_bls12377 = HostSlice::from_slice(&upper_points_bls12377[..size]); //  &upper_points_bls12377[..size];
+        let scalars_bls12377 = HostSlice::from_slice(&upper_scalars_bls12377[..size]);

        println!("Configuring bn254 MSM...");
-        let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-        let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
+        let mut g2_msm_results = DeviceVec::<G2Projective>::cuda_malloc(1).unwrap();
        let stream = CudaStream::create().unwrap();
        let g2_stream = CudaStream::create().unwrap();
        let mut cfg = msm::MSMConfig::default();
@@ -82,8 +85,7 @@ fn main() {
        g2_cfg.is_async = true;

        println!("Configuring bls12377 MSM...");
-        let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> =
-            HostOrDeviceSlice::cuda_malloc(1).unwrap();
+        let mut msm_results_bls12377 = DeviceVec::<BLS12377G1Projective>::cuda_malloc(1).unwrap();
        let stream_bls12377 = CudaStream::create().unwrap();
        let mut cfg_bls12377 = msm::MSMConfig::default();
        cfg_bls12377
@@ -94,7 +96,7 @@ fn main() {
        println!("Executing bn254 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
-        msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+        msm::msm(scalars, points, &cfg, &mut msm_results[..]).unwrap();
        #[cfg(feature = "profile")]
        println!(
            "ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
@@ -102,16 +104,16 @@ fn main() {
                .elapsed()
                .as_millis()
        );
-        msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
+        msm::msm(scalars, g2_points, &g2_cfg, &mut g2_msm_results[..]).unwrap();

        println!("Executing bls12377 MSM on device...");
        #[cfg(feature = "profile")]
        let start = Instant::now();
        msm::msm(
-            &scalars_bls12377,
-            &points_bls12377,
+            scalars_bls12377,
+            points_bls12377,
            &cfg_bls12377,
-            &mut msm_results_bls12377,
+            &mut msm_results_bls12377[..],
        )
        .unwrap();
        #[cfg(feature = "profile")]
@@ -134,10 +136,10 @@ fn main() {
            .synchronize()
            .unwrap();
        msm_results
-            .copy_to_host(&mut msm_host_result[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
            .unwrap();
        g2_msm_results
-            .copy_to_host(&mut g2_msm_host_result[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut g2_msm_host_result[..]))
            .unwrap();
        println!("bn254 result: {:#?}", msm_host_result);
        println!("G2 bn254 result: {:#?}", g2_msm_host_result);
@@ -146,7 +148,7 @@ fn main() {
            .synchronize()
            .unwrap();
        msm_results_bls12377
-            .copy_to_host(&mut msm_host_result_bls12377[..])
+            .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result_bls12377[..]))
            .unwrap();
        println!("bls12377 result: {:#?}", msm_host_result_bls12377);

@@ -154,23 +156,19 @@ fn main() {
        {
            println!("Checking against arkworks...");
            let ark_points: Vec<Bn254G1Affine> = points
-                .as_slice()
                .iter()
                .map(|&point| point.to_ark())
                .collect();
            let ark_scalars: Vec<Bn254Fr> = scalars
-                .as_slice()
                .iter()
                .map(|scalar| scalar.to_ark())
                .collect();

            let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
-                .as_slice()
                .iter()
                .map(|point| point.to_ark())
                .collect();
            let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
-                .as_slice()
                .iter()
                .map(|scalar| scalar.to_ark())
                .collect();
--- a/examples/rust/ntt/src/main.rs
+++ b/examples/rust/ntt/src/main.rs
@@ -2,7 +2,11 @@ use icicle_bn254::curve::{ScalarCfg, ScalarField};

 use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};

-use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
+use icicle_cuda_runtime::{
+    device_context::DeviceContext,
+    memory::{DeviceVec, HostSlice},
+    stream::CudaStream,
+};

 use icicle_core::{
    ntt::{self, NTT},
@@ -41,14 +45,13 @@ fn main() {
    );
    // Setting Bn254 points and scalars
    println!("Generating random inputs on host for bn254...");
-    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
-    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
+    let scalars = ScalarCfg::generate_random(size);
+    let mut ntt_results = DeviceVec::<ScalarField>::cuda_malloc(size).unwrap();

    // Setting bls12377 points and scalars
    println!("Generating random inputs on host for bls12377...");
-    let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
-    let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> =
-        HostOrDeviceSlice::cuda_malloc(size).unwrap();
+    let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
+    let mut ntt_results_bls12377 = DeviceVec::<BLS12377ScalarField>::cuda_malloc(size).unwrap();

    println!("Setting up bn254 Domain...");
    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
@@ -86,7 +89,13 @@ fn main() {
    println!("Executing bn254 NTT on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
-    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
+    ntt::ntt(
+        HostSlice::from_slice(&scalars),
+        ntt::NTTDir::kForward,
+        &cfg,
+        &mut ntt_results[..],
+    )
+    .unwrap();
    #[cfg(feature = "profile")]
    println!(
        "ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
@@ -99,10 +108,10 @@ fn main() {
    #[cfg(feature = "profile")]
    let start = Instant::now();
    ntt::ntt(
-        &scalars_bls12377,
+        HostSlice::from_slice(&scalars_bls12377),
        ntt::NTTDir::kForward,
        &cfg_bls12377,
-        &mut ntt_results_bls12377,
+        &mut ntt_results_bls12377[..],
    )
    .unwrap();
    #[cfg(feature = "profile")]
@@ -119,7 +128,7 @@ fn main() {
        .unwrap();
    let mut host_bn254_results = vec![ScalarField::zero(); size];
    ntt_results
-        .copy_to_host(&mut host_bn254_results[..])
+        .copy_to_host(HostSlice::from_mut_slice(&mut host_bn254_results[..]))
        .unwrap();

    stream_bls12377
@@ -127,19 +136,17 @@ fn main() {
        .unwrap();
    let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
    ntt_results_bls12377
-        .copy_to_host(&mut host_bls12377_results[..])
+        .copy_to_host(HostSlice::from_mut_slice(&mut host_bls12377_results[..]))
        .unwrap();

    println!("Checking against arkworks...");
    let mut ark_scalars: Vec<Bn254Fr> = scalars
-        .as_slice()
        .iter()
        .map(|scalar| scalar.to_ark())
        .collect();
    let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();

    let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
-        .as_slice()
        .iter()
        .map(|scalar| scalar.to_ark())
        .collect();
--- a/examples/rust/poseidon/src/main.rs
+++ b/examples/rust/poseidon/src/main.rs
@@ -4,7 +4,7 @@ use icicle_cuda_runtime::device_context::DeviceContext;

 use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
 use icicle_core::traits::FieldImpl;
-use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+use icicle_cuda_runtime::memory::HostSlice;

 #[cfg(feature = "profile")]
 use std::time::Instant;
@@ -25,23 +25,29 @@ fn main() {

    println!("Running Icicle Examples: Rust Poseidon Hash");
    let arity = 2u32;
-    println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
+    println!(
+        "---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
+        arity
+    );
    let ctx = DeviceContext::default();
    let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
    let config = PoseidonConfig::default();

-    println!("---------------------- Input size 2^{}={} ------------------------", size, test_size);
-    let inputs = vec![F::one(); test_size * arity as usize];
-    let outputs = vec![F::zero(); test_size];
-    let mut input_slice = HostOrDeviceSlice::on_host(inputs);
-    let mut output_slice = HostOrDeviceSlice::on_host(outputs);
+    println!(
+        "---------------------- Input size 2^{}={} ------------------------",
+        size, test_size
+    );
+    let mut inputs = vec![F::one(); test_size * arity as usize];
+    let mut outputs = vec![F::zero(); test_size];
+    let input_slice = HostSlice::from_mut_slice(&mut inputs);
+    let output_slice = HostSlice::from_mut_slice(&mut outputs);

    println!("Executing BLS12-381 Poseidon Hash on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
    poseidon_hash_many::<F>(
-        &mut input_slice,
-        &mut output_slice,
+        input_slice,
+        output_slice,
        test_size as u32,
        arity as u32,
        &constants,
@@ -49,5 +55,10 @@ fn main() {
    )
    .unwrap();
    #[cfg(feature = "profile")]
-    println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
-}
+    println!(
+        "ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs",
+        start
+            .elapsed()
+            .as_micros()
+    );
+}
--- a/icicle/CMakeLists.txt
+++ b/icicle/CMakeLists.txt
@@ -1,152 +1,59 @@
 cmake_minimum_required(VERSION 3.18)

-# GoogleTest requires at least C++14
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-
-if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
-  message(WARNING "Note that PIC (position-independent code) is disabled.")
-else()
-  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-endif()
-
-# add the target cuda architectures
-# each additional architecture increases the compilation time and output file size
-if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
-  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
-  find_program(_nvidia_smi "nvidia-smi")
-
-  if(_nvidia_smi)
-    set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
-
-    # execute nvidia-smi -L to get a short list of GPUs available
-    exec_program(${_nvidia_smi_path} ARGS -L
-      OUTPUT_VARIABLE _nvidia_smi_out
-      RETURN_VALUE _nvidia_smi_ret)
-
-    # process the stdout of nvidia-smi
-    if(_nvidia_smi_ret EQUAL 0)
-      # convert string with newlines to list of strings
-      string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
-
-      foreach(_line ${_nvidia_smi_out})
-        if(_line MATCHES "^GPU [0-9]+:")
-          math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
-
-          # the UUID is not very useful for the user, remove it
-          string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
-
-          if(NOT _gpu_info STREQUAL "")
-            list(APPEND DETECT_GPU_INFO "${_gpu_info}")
-          endif()
-        endif()
-      endforeach()
-
-      check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
-      set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
-    endif()
-  endif()
-
-  # ##
-  if(DETECT_GPU_COUNT GREATER 0)
-    set(CMAKE_CUDA_ARCHITECTURES native) # do native
-  else()
-    # no GPUs found, like on Github CI runners
-    set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
-  endif()
-endif()
-
 project(icicle LANGUAGES CUDA CXX)

-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-include_directories("${CMAKE_SOURCE_DIR}")
+include(cmake/Common.cmake)
+include(cmake/FieldsCommon.cmake)
+include(cmake/CurvesCommon.cmake)

+set_env()
+set_gpu_env()

-# when adding a new curve/field, append its name to the end of this list
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
-SET(SUPPORTED_CURVES_WITHOUT_NTT grumpkin)
+option(DEVMODE "Enable development mode" OFF)
+option(EXT_FIELD "Build extension field" OFF)
+option(G2 "Build G2" OFF)
+option(ECNTT "Build ECNTT" OFF)
+option(BUILD_HASH "Build hash functions" OFF)
+option(BUILD_TESTS "Build unit tests" OFF)
+option(BUILD_BENCHMARKS "Build benchmarks" OFF)
+# add options here

-set(IS_CURVE_SUPPORTED FALSE)
-set(I 0)
-foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
-  math(EXPR I "${I} + 1")
-  if (CURVE STREQUAL SUPPORTED_CURVE)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I}")
-    set(IS_CURVE_SUPPORTED TRUE)
-  endif ()
-endforeach()
-
-if (NOT IS_CURVE_SUPPORTED)
-  message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+if((DEFINED CURVE) AND (DEFINED FIELD))
+  message( FATAL_ERROR "CURVE and FIELD cannot be defined at the same time" )
 endif ()

-if (G2_DEFINED STREQUAL "ON")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
+if (DEVMODE)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O0 --ptxas-options=-O0 --ptxas-options=-allow-expensive-optimizations=false -DDEVMODE=ON")
 endif ()

-option(BUILD_TESTS "Build tests" OFF)
-
-if (NOT BUILD_TESTS)
-
-  message(STATUS "Building without tests.")
-
-  if (CURVE IN_LIST SUPPORTED_CURVES_WITH_POSEIDON)
-    list(APPEND ICICLE_SOURCES appUtils/poseidon/poseidon.cu)
-    list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
-  endif()
-
-  if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
-      list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
-  endif()
-
-  add_library(
-    icicle
-    utils/vec_ops.cu
-    utils/mont.cu
-    primitives/field.cu
-    primitives/projective.cu
-    appUtils/msm/msm.cu
-    ${ICICLE_SOURCES}
-  )
-  set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
-  target_compile_definitions(icicle PRIVATE CURVE=${CURVE})  
-
-else()
-
-  message(STATUS "Building tests.")
-
-  include(FetchContent)
-  FetchContent_Declare(
-    googletest
-    URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
-  )
-  # For Windows: Prevent overriding the parent project's compiler/linker settings
-
-  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-  FetchContent_MakeAvailable(googletest)
-
-  enable_testing()
-
-  add_executable(
-    runner
-    tests/runner.cu
-  )
-
-  target_link_libraries(
-    runner
-    GTest::gtest_main
-  )
-
-  include(GoogleTest)
-  set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-  gtest_discover_tests(runner)
-
+if(DEFINED FIELD)
+  check_field()
+  add_subdirectory(src/fields)
 endif ()
+
+if(DEFINED CURVE)
+  check_curve()
+  set(FIELD ${CURVE})
+  add_subdirectory(src/fields)
+  add_subdirectory(src/curves)
+endif ()
+
+if (G2)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
+endif ()
+
+if (EXT_FIELD)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
+endif ()
+
+if(BUILD_HASH)
+  add_subdirectory(src/hash)
+endif ()
+
+if (BUILD_TESTS)
+  add_subdirectory(tests)
+endif()
+
+if (BUILD_BENCHMARKS)
+  add_subdirectory(benchmarks)
+endif()
--- a/icicle/appUtils/ntt/Makefile
+++ b/icicle/appUtils/ntt/Makefile
@@ -1,6 +0,0 @@
-build_verification:
-	mkdir -p work
-	nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17
-
-test_verification: build_verification
-	work/test_verification
--- a/icicle/benchmarks/CMakeLists.txt
+++ b/icicle/benchmarks/CMakeLists.txt
@@ -0,0 +1,5 @@
+
+find_package(benchmark REQUIRED) # locate Google Benchmark first so benchmark::benchmark exists before it is referenced
+add_executable(benches benches.cu) # single translation unit that #includes the individual benchmark sources
+target_link_libraries(benches benchmark::benchmark)
+target_include_directories(benches PUBLIC ${CMAKE_SOURCE_DIR}/include/)
--- a/icicle/benchmarks/README.md
+++ b/icicle/benchmarks/README.md
@@ -0,0 +1,25 @@
+# How to use benchmarks
+
+ICICLE uses [google benchmarks](https://github.com/google/benchmark) to measure the performance of primitives.
+
+To run benchmarks, make sure you have everything installed to run ICICLE (see top-level README for that). Next, you need to install google benchmarks library as described in their [documentation](https://github.com/google/benchmark?tab=readme-ov-file#installation). When running benchmarks, export the path to this installation:
+
+```
+export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:<path-to-google-benchmarks-build-folder>
+```
+
+Then to benchmark field arithmetic, say, on `baby_bear` field, run:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DFIELD=babybear -DEXT_FIELD=ON -S . -B build;
+cmake --build build;
+build/benches --benchmark_counters_tabular=true
+```
+
+`-U` parameters are needed to clear variables from previous runs and `EXT_FIELD` can be disabled if benchmarking the extension field is not needed. To benchmark a curve, say, `bn254`, change the first `cmake` call to:
+
+```
+cmake -UCURVE -UFIELD -UG2 -UEXT_FIELD -DCURVE=bn254 -S . -B build;
+```
+
+Benchmarks measure throughput of very cheap operations like field multiplication or EC addition by repeating them very many times in parallel, so throughput is the main metric to look at.
--- a/icicle/benchmarks/benches.cu
+++ b/icicle/benchmarks/benches.cu
@@ -0,0 +1,6 @@
+#include "field_benchmarks.cu" // field benchmarks are always part of this translation unit
+#ifdef CURVE_ID // curve benchmarks are compiled in only when CURVE_ID is defined
+#include "curve_benchmarks.cu"
+#endif
+
+BENCHMARK_MAIN(); // Google Benchmark macro that provides main()
--- a/icicle/benchmarks/curve_benchmarks.cu
+++ b/icicle/benchmarks/curve_benchmarks.cu
@@ -0,0 +1,122 @@
+#include <benchmark/benchmark.h>
+#include <cstdio>
+#include <cstdlib>
+#include "utils/test_functions.cuh"
+#include "curves/curve_config.cuh"
+
+using namespace curve_config;
+using namespace benchmark;
+
+// Runs a CUDA call unconditionally and aborts with a diagnostic unless it returns cudaSuccess.
+// Unlike assert(), the call is not compiled out when NDEBUG is defined. The macro is variadic so
+// that template argument lists containing commas can be passed, and it is guarded because
+// benches.cu includes more than one benchmark source into the same translation unit.
+#ifndef BENCH_CUDA_CHECK
+#define BENCH_CUDA_CHECK(...) \
+  do { \
+    if ((__VA_ARGS__) != cudaSuccess) { \
+      fprintf(stderr, "%s:%d: CUDA call failed: %s\n", __FILE__, __LINE__, #__VA_ARGS__); \
+      abort(); \
+    } \
+  } while (0)
+#endif
+
+/**
+ * Times vec_add<projective_t, affine_t, N> over n = range(0) / N random points; the projective
+ * operand is also the output buffer. Each iteration reports the GPU time measured by CUDA events.
+ */
+static void BM_MixedECAdd(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  affine_t* points2;
+  BENCH_CUDA_CHECK(cudaMalloc(&points1, n * sizeof(projective_t)));
+  BENCH_CUDA_CHECK(cudaMalloc(&points2, n * sizeof(affine_t)));
+
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  affine_t* h_points2 = (affine_t*)malloc(n * sizeof(affine_t));
+  projective_t::RandHostMany(h_points1, n);
+  projective_t::RandHostManyAffine(h_points2, n);
+  BENCH_CUDA_CHECK(cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice));
+  BENCH_CUDA_CHECK(cudaMemcpy(points2, h_points2, sizeof(affine_t) * n, cudaMemcpyHostToDevice));
+  // The blocking copies above are done with the host staging buffers, so release them.
+  free(h_points1);
+  free(h_points2);
+
+  // Create the timing events once; creating a pair per iteration without destroying them leaks.
+  cudaEvent_t start, stop;
+  BENCH_CUDA_CHECK(cudaEventCreate(&start));
+  BENCH_CUDA_CHECK(cudaEventCreate(&stop));
+
+  for (auto _ : state) {
+    BENCH_CUDA_CHECK(cudaEventRecord(start));
+    BENCH_CUDA_CHECK(vec_add<projective_t, affine_t, N>(points1, points2, points1, n));
+    BENCH_CUDA_CHECK(cudaStreamSynchronize(0));
+    BENCH_CUDA_CHECK(cudaEventRecord(stop));
+    // cudaEventElapsedTime needs both events completed, so wait for the stop event first.
+    BENCH_CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    BENCH_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+
+/**
+ * Times vec_add<projective_t, projective_t, N> over n = range(0) / N random points; the first
+ * operand is also the output buffer. Each iteration reports the GPU time measured by CUDA events.
+ */
+static void BM_FullECAdd(benchmark::State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  projective_t* points1;
+  projective_t* points2;
+  BENCH_CUDA_CHECK(cudaMalloc(&points1, n * sizeof(projective_t)));
+  BENCH_CUDA_CHECK(cudaMalloc(&points2, n * sizeof(projective_t)));
+
+  projective_t* h_points1 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t* h_points2 = (projective_t*)malloc(n * sizeof(projective_t));
+  projective_t::RandHostMany(h_points1, n);
+  projective_t::RandHostMany(h_points2, n);
+  BENCH_CUDA_CHECK(cudaMemcpy(points1, h_points1, sizeof(projective_t) * n, cudaMemcpyHostToDevice));
+  BENCH_CUDA_CHECK(cudaMemcpy(points2, h_points2, sizeof(projective_t) * n, cudaMemcpyHostToDevice));
+  // The blocking copies above are done with the host staging buffers, so release them.
+  free(h_points1);
+  free(h_points2);
+
+  // Create the timing events once; creating a pair per iteration without destroying them leaks.
+  cudaEvent_t start, stop;
+  BENCH_CUDA_CHECK(cudaEventCreate(&start));
+  BENCH_CUDA_CHECK(cudaEventCreate(&stop));
+
+  for (auto _ : state) {
+    BENCH_CUDA_CHECK(cudaEventRecord(start));
+    BENCH_CUDA_CHECK(vec_add<projective_t, projective_t, N>(points1, points2, points1, n));
+    BENCH_CUDA_CHECK(cudaStreamSynchronize(0));
+    BENCH_CUDA_CHECK(cudaEventRecord(stop));
+    // cudaEventElapsedTime needs both events completed, so wait for the stop event first.
+    BENCH_CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    BENCH_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(points1);
+  cudaFree(points2);
+}
+
+// UseManualTime() makes Google Benchmark report the durations passed to SetIterationTime().
+BENCHMARK(BM_FullECAdd)->Range(1 << 27, 1 << 27)->Unit(benchmark::kMillisecond)->UseManualTime();
+BENCHMARK(BM_MixedECAdd)->Range(1 << 27, 1 << 27)->Unit(benchmark::kMillisecond)->UseManualTime();
--- a/icicle/benchmarks/field_benchmarks.cu
+++ b/icicle/benchmarks/field_benchmarks.cu
@@ -0,0 +1,155 @@
+#include <benchmark/benchmark.h>
+#include <cstdio>
+#include <cstdlib>
+#include "utils/test_functions.cuh"
+#include "fields/field_config.cuh"
+
+using namespace field_config;
+using namespace benchmark;
+
+// Runs a CUDA call unconditionally and aborts with a diagnostic unless it returns cudaSuccess.
+// Unlike assert(), the call is not compiled out when NDEBUG is defined. The macro is variadic so
+// that template argument lists containing commas can be passed, and it is guarded because
+// benches.cu includes more than one benchmark source into the same translation unit.
+#ifndef BENCH_CUDA_CHECK
+#define BENCH_CUDA_CHECK(...) \
+  do { \
+    if ((__VA_ARGS__) != cudaSuccess) { \
+      fprintf(stderr, "%s:%d: CUDA call failed: %s\n", __FILE__, __LINE__, #__VA_ARGS__); \
+      abort(); \
+    } \
+  } while (0)
+#endif
+
+/**
+ * Times vec_add<T, T, N> over n = range(0) / N device elements filled by device_populate_random;
+ * the first operand is also the output buffer. Iteration time is GPU time measured by CUDA events.
+ */
+template <class T>
+static void BM_FieldAdd(State& state)
+{
+  constexpr int N = 256;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  BENCH_CUDA_CHECK(cudaMalloc(&scalars1, n * sizeof(T)));
+  BENCH_CUDA_CHECK(cudaMalloc(&scalars2, n * sizeof(T)));
+
+  BENCH_CUDA_CHECK(device_populate_random<T>(scalars1, n));
+  BENCH_CUDA_CHECK(device_populate_random<T>(scalars2, n));
+
+  // Create the timing events once; creating a pair per iteration without destroying them leaks.
+  cudaEvent_t start, stop;
+  BENCH_CUDA_CHECK(cudaEventCreate(&start));
+  BENCH_CUDA_CHECK(cudaEventCreate(&stop));
+
+  for (auto _ : state) {
+    BENCH_CUDA_CHECK(cudaEventRecord(start));
+    BENCH_CUDA_CHECK(vec_add<T, T, N>(scalars1, scalars2, scalars1, n));
+    BENCH_CUDA_CHECK(cudaStreamSynchronize(0));
+    BENCH_CUDA_CHECK(cudaEventRecord(stop));
+    // cudaEventElapsedTime needs both events completed, so wait for the stop event first.
+    BENCH_CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    BENCH_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+/**
+ * Times vec_mul<T, T, N> over n = range(0) / N device elements filled by device_populate_random;
+ * the first operand is also the output buffer. Iteration time is GPU time measured by CUDA events.
+ */
+template <class T>
+static void BM_FieldMul(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars1;
+  T* scalars2;
+  BENCH_CUDA_CHECK(cudaMalloc(&scalars1, n * sizeof(T)));
+  BENCH_CUDA_CHECK(cudaMalloc(&scalars2, n * sizeof(T)));
+
+  BENCH_CUDA_CHECK(device_populate_random<T>(scalars1, n));
+  BENCH_CUDA_CHECK(device_populate_random<T>(scalars2, n));
+
+  // Create the timing events once; creating a pair per iteration without destroying them leaks.
+  cudaEvent_t start, stop;
+  BENCH_CUDA_CHECK(cudaEventCreate(&start));
+  BENCH_CUDA_CHECK(cudaEventCreate(&stop));
+
+  for (auto _ : state) {
+    BENCH_CUDA_CHECK(cudaEventRecord(start));
+    BENCH_CUDA_CHECK(vec_mul<T, T, N>(scalars1, scalars2, scalars1, n));
+    BENCH_CUDA_CHECK(cudaStreamSynchronize(0));
+    BENCH_CUDA_CHECK(cudaEventRecord(stop));
+    // cudaEventElapsedTime needs both events completed, so wait for the stop event first.
+    BENCH_CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    BENCH_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(scalars1);
+  cudaFree(scalars2);
+}
+
+/**
+ * Times field_vec_sqr<T, N> over n = range(0) / N device elements filled by device_populate_random;
+ * input and output are the same buffer. Iteration time is GPU time measured by CUDA events.
+ */
+template <class T>
+static void BM_FieldSqr(State& state)
+{
+  constexpr int N = 128;
+  int n = state.range(0) / N;
+  T* scalars;
+  BENCH_CUDA_CHECK(cudaMalloc(&scalars, n * sizeof(T)));
+
+  BENCH_CUDA_CHECK(device_populate_random<T>(scalars, n));
+
+  // Create the timing events once; creating a pair per iteration without destroying them leaks.
+  cudaEvent_t start, stop;
+  BENCH_CUDA_CHECK(cudaEventCreate(&start));
+  BENCH_CUDA_CHECK(cudaEventCreate(&stop));
+
+  for (auto _ : state) {
+    BENCH_CUDA_CHECK(cudaEventRecord(start));
+    BENCH_CUDA_CHECK(field_vec_sqr<T, N>(scalars, scalars, n));
+    BENCH_CUDA_CHECK(cudaStreamSynchronize(0));
+    BENCH_CUDA_CHECK(cudaEventRecord(stop));
+    // cudaEventElapsedTime needs both events completed, so wait for the stop event first.
+    BENCH_CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    BENCH_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    state.SetIterationTime((double)(milliseconds / 1000));
+  }
+  state.counters["Throughput"] = Counter(state.range(0), Counter::kIsRate | Counter::kIsIterationInvariant);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  cudaFree(scalars);
+}
+
+// UseManualTime() makes Google Benchmark report the durations passed to SetIterationTime().
+BENCHMARK(BM_FieldAdd<scalar_t>)->Range(1 << 28, 1 << 28)->Unit(kMicrosecond)->UseManualTime();
+BENCHMARK(BM_FieldMul<scalar_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond)->UseManualTime();
+BENCHMARK(BM_FieldSqr<scalar_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond)->UseManualTime();
+
+#ifdef EXT_FIELD
+BENCHMARK(BM_FieldAdd<extension_t>)->Range(1 << 28, 1 << 28)->Unit(kMicrosecond)->UseManualTime();
+BENCHMARK(BM_FieldMul<extension_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond)->UseManualTime();
+BENCHMARK(BM_FieldSqr<extension_t>)->Range(1 << 27, 1 << 27)->Unit(kMicrosecond)->UseManualTime();
+#endif
--- a/icicle/cmake/Common.cmake
+++ b/icicle/cmake/Common.cmake
@@ -0,0 +1,72 @@
+function(set_env) # sets the C++/CUDA language standard and PIC variables in the CALLER's scope
+    set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+    set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)
+
+    if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF") # opt-out via environment or CMake variable
+        message(WARNING "Note that PIC (position-independent code) is disabled.")
+    else()
+        set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE) # a function-local set() would be discarded on return
+    endif()
+endfunction()
+
+function(set_gpu_env) # exports CMAKE_CUDA_ARCHITECTURES and the CUDA flag variables to the CALLER's scope
+    # add the target cuda architectures
+    # each additional architecture increases the compilation time and output file size
+    if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
+    else()
+    find_program(_nvidia_smi "nvidia-smi")
+
+    if(_nvidia_smi)
+        set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
+
+        # execute nvidia-smi -L to get a short list of GPUs available; NOTE(review): find_program stores the path in _nvidia_smi, while _nvidia_smi_path is not set in this function - confirm
+        exec_program(${_nvidia_smi_path} ARGS -L
+        OUTPUT_VARIABLE _nvidia_smi_out
+        RETURN_VALUE _nvidia_smi_ret)
+
+        # process the stdout of nvidia-smi
+        if(_nvidia_smi_ret EQUAL 0)
+        # convert string with newlines to list of strings
+        string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
+
+        foreach(_line ${_nvidia_smi_out})
+            if(_line MATCHES "^GPU [0-9]+:")
+            math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
+
+            # the UUID is not very useful for the user, remove it
+            string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
+
+            if(NOT _gpu_info STREQUAL "")
+                list(APPEND DETECT_GPU_INFO "${_gpu_info}")
+            endif()
+            endif()
+        endforeach()
+
+        check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO) # NOTE(review): this command is not defined in this file - confirm it exists
+        set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
+        endif()
+    endif()
+
+    # ##
+    if(DETECT_GPU_COUNT GREATER 0)
+        set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) # do native
+    else()
+        # no GPUs found, like on Github CI runners
+        set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
+    endif()
+    endif()
+
+    # Check CUDA version and, if possible, enable multi-threaded compilation
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
+        message(STATUS "Using multi-threaded CUDA compilation.")
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0") # local on purpose: PARENT_SCOPE leaves the local value untouched, so the export below would drop this flag
+    else()
+        message(STATUS "Can't use multi-threaded CUDA compilation.")
+    endif()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE) # single export of the accumulated local flags
+    set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
+    set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
+endfunction()
--- a/icicle/cmake/CurvesCommon.cmake
+++ b/icicle/cmake/CurvesCommon.cmake
@@ -0,0 +1,17 @@
+function(check_curve) # validates CURVE and appends -DCURVE_ID/-DFIELD_ID to the caller's CMAKE_CUDA_FLAGS
+  set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin) # order matters: the 1-based position becomes the ID
+
+  set(IS_CURVE_SUPPORTED FALSE)
+  set(I 0)
+  foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
+    math(EXPR I "${I} + 1") # incremented before the comparison, so IDs start at 1
+    if (CURVE STREQUAL SUPPORTED_CURVE)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I} -DFIELD_ID=${I}" PARENT_SCOPE) # a curve also selects the field with the same ID
+      set(IS_CURVE_SUPPORTED TRUE)
+    endif ()
+  endforeach()
+
+  if (NOT IS_CURVE_SUPPORTED) # aborts configuration for an unknown CURVE value
+    message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
+  endif ()
+endfunction()
--- a/icicle/cmake/FieldsCommon.cmake
+++ b/icicle/cmake/FieldsCommon.cmake
@@ -0,0 +1,17 @@
+function(check_field) # validates FIELD and appends -DFIELD_ID to the caller's CMAKE_CUDA_FLAGS
+  set(SUPPORTED_FIELDS babybear) # order matters: the position in this list determines the ID
+
+  set(IS_FIELD_SUPPORTED FALSE)
+  set(I 1000) # field IDs are offset: the first supported field gets 1001
+  foreach (SUPPORTED_FIELD ${SUPPORTED_FIELDS})
+    math(EXPR I "${I} + 1") # incremented before the comparison
+    if (FIELD STREQUAL SUPPORTED_FIELD)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DFIELD_ID=${I}" PARENT_SCOPE)
+      set(IS_FIELD_SUPPORTED TRUE)
+    endif ()
+  endforeach()
+
+  if (NOT IS_FIELD_SUPPORTED) # aborts configuration for an unknown FIELD value
+    message( FATAL_ERROR "The value of FIELD variable: ${FIELD} is not one of the supported fields: ${SUPPORTED_FIELDS}" )
+  endif ()
+endfunction()
--- a/icicle/include/curves/affine.cuh
+++ b/icicle/include/curves/affine.cuh
@@ -1,6 +1,8 @@
 #pragma once

-#include "field.cuh"
+#include "gpu-utils/sharedmem.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include <iostream>

 template <class FF>
 class Affine
@@ -34,3 +36,12 @@ public:
    return os;
  }
 };
+
+template <class FF>
+struct SharedMemory<Affine<FF>> { // specialization of SharedMemory for Affine<FF> elements
+  __device__ Affine<FF>* getPointer() // returns the start of the shared-memory array declared below
+  {
+    extern __shared__ Affine<FF> s_affine_[]; // unsized extern __shared__ array, i.e. its size is not fixed here
+    return s_affine_;
+  }
+};
--- a/icicle/include/curves/curve_config.cuh
+++ b/icicle/include/curves/curve_config.cuh
@@ -1,37 +1,38 @@
 #pragma once
-#ifndef INDEX_H
-#define INDEX_H
+#ifndef CURVE_CONFIG_H
+#define CURVE_CONFIG_H

-#define BN254     1
-#define BLS12_381 2
-#define BLS12_377 3
-#define BW6_761   4
-#define GRUMPKIN  5
-
-#include "../primitives/field.cuh"
-#include "../primitives/projective.cuh"
-#if defined(G2_DEFINED)
-#include "../primitives/extension_field.cuh"
-#endif
+#include "fields/id.h"
+#include "curves/projective.cuh"

 #if CURVE_ID == BN254
-#include "bn254_params.cuh"
+#include "curves/params/bn254.cuh"
 using namespace bn254;
+
 #elif CURVE_ID == BLS12_381
-#include "bls12_381_params.cuh"
+#include "curves/params/bls12_381.cuh"
 using namespace bls12_381;
+
 #elif CURVE_ID == BLS12_377
-#include "bls12_377_params.cuh"
+#include "curves/params/bls12_377.cuh"
 using namespace bls12_377;
+
 #elif CURVE_ID == BW6_761
-#include "bls12_377_params.cuh"
-#include "bw6_761_params.cuh"
+#include "curves/params/bw6_761.cuh"
 using namespace bw6_761;
+
 #elif CURVE_ID == GRUMPKIN
-#include "grumpkin_params.cuh"
+#include "curves/params/grumpkin.cuh"
 using namespace grumpkin;
 #endif

+#include "fields/field_config.cuh"
+using field_config::scalar_t;
+
+#ifdef G2
+#include "fields/quadratic_extension.cuh"
+#endif
+
 /**
 * @namespace curve_config
 * Namespace with type definitions for short Weierstrass pairing-friendly [elliptic
@@ -39,18 +40,11 @@ using namespace grumpkin;
 * with the `-DCURVE` env variable passed during build.
 */
 namespace curve_config {
-
-#if CURVE_ID == BW6_761
-  typedef bls12_377::fq_config fp_config;
-#endif
-  /**
-   * Scalar field of the curve. Is always a prime field.
-   */
-  typedef Field<fp_config> scalar_t;
  /**
   * Base field of G1 curve. Is always a prime field.
   */
  typedef Field<fq_config> point_field_t;
+
  static constexpr point_field_t generator_x = point_field_t{g1_gen_x};
  static constexpr point_field_t generator_y = point_field_t{g1_gen_y};
  static constexpr point_field_t b = point_field_t{weierstrass_b};
@@ -64,7 +58,7 @@ namespace curve_config {
   */
  typedef Affine<point_field_t> affine_t;

-#if defined(G2_DEFINED)
+#ifdef G2
 #if CURVE_ID == BW6_761
  typedef point_field_t g2_point_field_t;
  static constexpr g2_point_field_t g2_generator_x = g2_point_field_t{g2_gen_x};
@@ -79,6 +73,7 @@ namespace curve_config {
  static constexpr g2_point_field_t g2_b =
    g2_point_field_t{point_field_t{weierstrass_b_g2_re}, point_field_t{weierstrass_b_g2_im}};
 #endif
+
  /**
   * [Projective representation](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) of G2 curve.
   */
--- a/icicle/include/curves/params/bls12_377.cuh
+++ b/icicle/include/curves/params/bls12_377.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#ifndef BLS12_377_PARAMS_H
+#define BLS12_377_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bls12_377_base.cuh"
+
+namespace bls12_377 {
+  // G1 and G2 generators
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
+                                                               0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
+                                                               0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
+                                                               0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
+                                                               0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
+                                                                  0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
+                                                                  0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
+                                                                  0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
+                                                                  0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
+                                                                  0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
+                                                                  0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
+                                                                  0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
+                                                                  0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
+    0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
+} // namespace bls12_377
+
+#endif
--- a/icicle/include/curves/params/bls12_381.cuh
+++ b/icicle/include/curves/params/bls12_381.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#ifndef BLS12_381_PARAMS_H
+#define BLS12_381_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bls12_381_base.cuh"
+
+namespace bls12_381 {
+  // G1 and G2 generator coordinates (G2 split into re/im parts); limbs are listed least-significant first
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
+                                                               0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
+                                                               0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
+                                                               0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
+                                                               0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
+                                                                  0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
+                                                                  0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
+                                                                  0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
+                                                                  0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
+                                                                  0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
+                                                                  0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
+                                                                  0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
+                                                                  0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace bls12_381
+
+#endif
--- a/icicle/include/curves/params/bn254.cuh
+++ b/icicle/include/curves/params/bn254.cuh
@@ -0,0 +1,31 @@
+#pragma once
+#ifndef BN254_PARAMS_H
+#define BN254_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bn254_base.cuh"
+
+namespace bn254 {
+  // G1 and G2 generator coordinates (G2 split into re/im parts); limbs are listed least-significant first
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
+                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
+                                                                  0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
+                                                                  0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
+                                                                  0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
+                                                                  0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
+                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
+    0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
+  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
+    0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
+} // namespace bn254
+
+#endif
--- a/icicle/include/curves/params/bw6_761.cuh
+++ b/icicle/include/curves/params/bw6_761.cuh
@@ -0,0 +1,37 @@
+#pragma once
+#ifndef BW6_761_PARAMS_H
+#define BW6_761_PARAMS_H
+
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bw6_761_base.cuh"
+
+namespace bw6_761 {
+  // G1 and G2 generator coordinates, each a single fq element; limbs are listed least-significant first
+  static constexpr storage<fq_config::limbs_count> g1_gen_x = {
+    0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a,
+    0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa,
+    0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02};
+  static constexpr storage<fq_config::limbs_count> g1_gen_y = {
+    0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141,
+    0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1,
+    0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e};
+  static constexpr storage<fq_config::limbs_count> g2_gen_x = {
+    0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb,
+    0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d,
+    0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332};
+  static constexpr storage<fq_config::limbs_count> g2_gen_y = {
+    0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd,
+    0xffa82300, 0x8906c170, 0xd2c712b8, 0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8,
+    0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335};
+
+  static constexpr storage<fq_config::limbs_count> weierstrass_b = {
+    0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
+    0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
+    0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
+  static constexpr storage<fq_config::limbs_count> g2_weierstrass_b = {
+    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+} // namespace bw6_761
+
+#endif
--- a/icicle/include/curves/params/grumpkin.cuh
+++ b/icicle/include/curves/params/grumpkin.cuh
@@ -2,13 +2,11 @@
 #ifndef GRUMPKIN_PARAMS_H
 #define GRUMPKIN_PARAMS_H

-#include "../utils/storage.cuh"
-#include "bn254_params.cuh"
+#include "fields/storage.cuh"
+#include "fields/snark_fields/bn254_scalar.cuh"

 namespace grumpkin {
-  typedef bn254::fq_config fp_config;
  typedef bn254::fp_config fq_config;
-
  // G1 generator
  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
--- a/icicle/include/curves/projective.cuh
+++ b/icicle/include/curves/projective.cuh
@@ -1,6 +1,7 @@
 #pragma once

 #include "affine.cuh"
+#include "gpu-utils/sharedmem.cuh"

 template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
 class Projective
@@ -8,6 +9,12 @@ class Projective
  friend Affine<FF>;

 public:
+  typedef Affine<FF> Aff;
+  typedef SCALAR_FF Scalar;
+
+  static constexpr unsigned SCALAR_FF_NBITS = SCALAR_FF::NBITS;
+  static constexpr unsigned FF_NBITS = FF::NBITS;
+
  FF x;
  FF y;
  FF z;
@@ -20,7 +27,10 @@ public:
    return {point.x * denom, point.y * denom};
  }

-  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point) { return {point.x, point.y, FF::one()}; }
+  static HOST_DEVICE_INLINE Projective from_affine(const Affine<FF>& point)
+  { // Affine<FF>::zero() is mapped to zero(); every other point is lifted with z = FF::one()
+    return point == Affine<FF>::zero() ? zero() : Projective{point.x, point.y, FF::one()};
+  }

  static HOST_DEVICE_INLINE Projective ToMontgomery(const Projective& point)
  {
@@ -36,6 +46,34 @@ public:

  static HOST_DEVICE_INLINE Projective neg(const Projective& point) { return {point.x, FF::neg(point.y), point.z}; }

+  static HOST_DEVICE_INLINE Projective dbl(const Projective& point)
+  {
+    const FF X = point.x;
+    const FF Y = point.y;
+    const FF Z = point.z;
+    // Steps 1-18 appear to follow Renes-Costello-Batina Alg. 9 (doubling, a = 0); b3 stands for 3 * B_VALUE.
+    // TODO: Change to efficient dbl once implemented for field.cuh
+    FF t0 = FF::sqr(Y);                                                     // 1. t0 ← Y · Y
+    FF Z3 = t0 + t0;                                                        // 2. Z3 ← t0 + t0
+    Z3 = Z3 + Z3;                                                           // 3. Z3 ← Z3 + Z3
+    Z3 = Z3 + Z3;                                                           // 4. Z3 ← Z3 + Z3
+    FF t1 = Y * Z;                                                          // 5. t1 ← Y · Z
+    FF t2 = FF::sqr(Z);                                                     // 6. t2 ← Z · Z
+    t2 = FF::template mul_unsigned<3>(FF::template mul_const<B_VALUE>(t2)); // 7. t2 ← b3 · t2
+    FF X3 = t2 * Z3;                                                        // 8. X3 ← t2 · Z3
+    FF Y3 = t0 + t2;                                                        // 9. Y3 ← t0 + t2
+    Z3 = t1 * Z3;                                                           // 10. Z3 ← t1 · Z3
+    t1 = t2 + t2;                                                           // 11. t1 ← t2 + t2
+    t2 = t1 + t2;                                                           // 12. t2 ← t1 + t2
+    t0 = t0 - t2;                                                           // 13. t0 ← t0 − t2
+    Y3 = t0 * Y3;                                                           // 14. Y3 ← t0 · Y3
+    Y3 = X3 + Y3;                                                           // 15. Y3 ← X3 + Y3
+    t1 = X * Y;                                                             // 16. t1 ← X · Y
+    X3 = t0 * t1;                                                           // 17. X3 ← t0 · t1
+    X3 = X3 + X3;                                                           // 18. X3 ← X3 + X3
+    return {X3, Y3, Z3};
+  }
+
  friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2)
  {
    const FF X1 = p1.x;                                                                //                   < 2
@@ -134,7 +172,7 @@ public:
  {
    Projective res = zero();
 #ifdef __CUDA_ARCH__
-#pragma unroll
+    UNROLL
 #endif
    for (int i = 0; i < SCALAR_FF::NBITS; i++) {
      if (i > 0) { res = res + res; }
@@ -190,3 +228,12 @@ public:
      out[i] = (i % size < 100) ? to_affine(rand_host()) : out[i - 100];
  }
 };
+
+template <typename FF, class SCALAR_FF, const FF& B_VALUE, const FF& GENERATOR_X, const FF& GENERATOR_Y>
+struct SharedMemory<Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>> {
+  __device__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y>* getPointer()
+  { // exposes the unsized extern __shared__ array declared below as a typed Projective pointer
+    extern __shared__ Projective<FF, SCALAR_FF, B_VALUE, GENERATOR_X, GENERATOR_Y> s_projective_[];
+    return s_projective_;
+  }
+};
--- a/icicle/include/fields/field.cuh
+++ b/icicle/include/fields/field.cuh
@@ -18,20 +18,19 @@

 #pragma once

-#include "../utils/error_handler.cuh"
-#include "../utils/host_math.cuh"
-#include "../utils/ptx.cuh"
-#include "../utils/storage.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"
+#include "host_math.cuh"
+#include "ptx.cuh"
+#include "storage.cuh"
+
 #include <iomanip>
 #include <iostream>
 #include <random>
 #include <sstream>
 #include <string>

-#define HOST_INLINE        __host__ __forceinline__
-#define DEVICE_INLINE      __device__ __forceinline__
-#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
-
 template <class CONFIG>
 class Field
 {
@@ -130,7 +129,7 @@ public:
    {
      Field out{};
 #ifdef __CUDA_ARCH__
-#pragma unroll
+      UNROLL
 #endif
      for (unsigned i = 0; i < TLC; i++)
        out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
@@ -141,7 +140,7 @@ public:
    {
      Field out{};
 #ifdef __CUDA_ARCH__
-#pragma unroll
+      UNROLL
 #endif
      for (unsigned i = 0; i < TLC; i++)
        out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
@@ -152,7 +151,7 @@ public:
    {
      Field out{};
 #ifdef __CUDA_ARCH__
-#pragma unroll
+      UNROLL
 #endif
      for (unsigned i = 0; i < TLC; i++) {
 #ifdef __CUDA_ARCH__
@@ -244,14 +243,14 @@ public:
  }

  template <bool SUBTRACT, bool CARRY_OUT>
-  static constexpr __device__ __forceinline__ uint32_t
+  static constexpr DEVICE_INLINE uint32_t
  add_sub_u32_device(const uint32_t* x, const uint32_t* y, uint32_t* r, size_t n = (TLC >> 1))
  {
    r[0] = SUBTRACT ? ptx::sub_cc(x[0], y[0]) : ptx::add_cc(x[0], y[0]);
-    for (unsigned i = 1; i < (CARRY_OUT ? n : n - 1); i++)
+    for (unsigned i = 1; i < n; i++)
      r[i] = SUBTRACT ? ptx::subc_cc(x[i], y[i]) : ptx::addc_cc(x[i], y[i]);
    if (!CARRY_OUT) {
-      r[n - 1] = SUBTRACT ? ptx::subc(x[n - 1], y[n - 1]) : ptx::addc(x[n - 1], y[n - 1]);
+      ptx::addc(0, 0);
      return 0;
    }
    return SUBTRACT ? ptx::subc(0, 0) : ptx::addc(0, 0);
@@ -327,7 +326,7 @@ public:

  static DEVICE_INLINE void mul_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
  {
-#pragma unroll
+    UNROLL
    for (size_t i = 0; i < n; i += 2) {
      acc[i] = ptx::mul_lo(a[i], bi);
      acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -336,7 +335,7 @@ public:

  static DEVICE_INLINE void mul_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, size_t start_i = 0)
  {
-#pragma unroll
+    UNROLL
    for (size_t i = start_i; i < n; i += 2) {
      acc[i] = ptx::mul_lo(a[i], bi);
      acc[i + 1] = ptx::mul_hi(a[i], bi);
@@ -344,14 +343,14 @@ public:
  }

  template <bool CARRY_IN = false>
-  static __device__ __forceinline__ void
+  static DEVICE_INLINE void
  cmad_n(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC, uint32_t optional_carry = 0)
  {
    if (CARRY_IN) ptx::add_cc(UINT32_MAX, optional_carry);
    acc[0] = CARRY_IN ? ptx::madc_lo_cc(a[0], bi, acc[0]) : ptx::mad_lo_cc(a[0], bi, acc[0]);
    acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]);

-#pragma unroll
+    UNROLL
    for (size_t i = 2; i < n; i += 2) {
      acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
      acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
@@ -359,7 +358,7 @@ public:
  }

  template <bool EVEN_PHASE>
-  static __device__ __forceinline__ void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static DEVICE_INLINE void cmad_n_msb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
  {
    if (EVEN_PHASE) {
      acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -368,14 +367,14 @@ public:
      acc[1] = ptx::mad_hi_cc(a[0], bi, acc[1]);
    }

-#pragma unroll
+    UNROLL
    for (size_t i = 2; i < n; i += 2) {
      acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]);
      acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]);
    }
  }

-  static __device__ __forceinline__ void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static DEVICE_INLINE void cmad_n_lsb(uint32_t* acc, const uint32_t* a, uint32_t bi, size_t n = TLC)
  {
    if (n > 1)
      acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]);
@@ -383,7 +382,7 @@ public:
      acc[0] = ptx::mad_lo(a[0], bi, acc[0]);

    size_t i;
-#pragma unroll
+    UNROLL
    for (i = 1; i < n - 1; i += 2) {
      acc[i] = ptx::madc_hi_cc(a[i - 1], bi, acc[i]);
      if (i == n - 2)
@@ -395,7 +394,7 @@ public:
  }

  template <bool CARRY_OUT = false, bool CARRY_IN = false>
-  static __device__ __forceinline__ uint32_t mad_row(
+  static DEVICE_INLINE uint32_t mad_row(
    uint32_t* odd,
    uint32_t* even,
    const uint32_t* a,
@@ -420,8 +419,7 @@ public:
  }

  template <bool EVEN_PHASE>
-  static __device__ __forceinline__ void
-  mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static DEVICE_INLINE void mad_row_msb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
  {
    cmad_n_msb<!EVEN_PHASE>(odd, EVEN_PHASE ? a : (a + 1), bi, n - 2);
    odd[EVEN_PHASE ? (n - 1) : (n - 2)] = ptx::madc_lo_cc(a[n - 1], bi, 0);
@@ -430,8 +428,7 @@ public:
    odd[EVEN_PHASE ? n : (n - 1)] = ptx::addc(odd[EVEN_PHASE ? n : (n - 1)], 0);
  }

-  static __device__ __forceinline__ void
-  mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
+  static DEVICE_INLINE void mad_row_lsb(uint32_t* odd, uint32_t* even, const uint32_t* a, uint32_t bi, size_t n = TLC)
  {
    // bi here is constant so we can do a compile-time check for zero (which does happen once for bls12-381 scalar field
    // modulus)
@@ -442,12 +439,12 @@ public:
    return;
  }

-  static __device__ __forceinline__ uint32_t
+  static DEVICE_INLINE uint32_t
  mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
  {
    acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);

-#pragma unroll
+    UNROLL
    for (size_t i = 1; i < n - 1; i += 2) {
      acc[i] = ptx::madc_hi_cc(a[i - 1], bi, extra[i]);
      acc[i + 1] = ptx::madc_lo_cc(a[i + 1], bi, extra[i + 1]);
@@ -470,70 +467,86 @@ public:
   * \cdot b_0}{2^{32}}} + \dots + \floor{\frac{a_0 \cdot b_{TLC - 2}}{2^{32}}}) \leq 2^{64} + 2\cdot 2^{96} + \dots +
   * (TLC - 2) \cdot 2^{32(TLC - 1)} + (TLC - 1) \cdot 2^{32(TLC - 1)} \leq 2(TLC - 1) \cdot 2^{32(TLC - 1)}\f$.
   */
-  static __device__ __forceinline__ void
-  multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
+  static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
  {
-    const uint32_t* a = as.limbs;
-    const uint32_t* b = bs.limbs;
-    uint32_t* even = rs.limbs;
-    __align__(16) uint32_t odd[2 * TLC - 2];
+    if constexpr (TLC > 1) { // the row-wise scheme below reads a[TLC - 2], so it needs at least two limbs
+      const uint32_t* a = as.limbs;
+      const uint32_t* b = bs.limbs;
+      uint32_t* even = rs.limbs;
+      __align__(16) uint32_t odd[2 * TLC - 2];

-    even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
-    odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
-    odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
-    size_t i;
-#pragma unroll
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
-      mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
+      even[TLC - 1] = ptx::mul_hi(a[TLC - 2], b[0]);
+      odd[TLC - 2] = ptx::mul_lo(a[TLC - 1], b[0]);
+      odd[TLC - 1] = ptx::mul_hi(a[TLC - 1], b[0]);
+      size_t i;
+      UNROLL
+      for (i = 2; i < TLC - 1; i += 2) {
+        mad_row_msb<true>(&even[TLC - 2], &odd[TLC - 2], &a[TLC - i - 1], b[i - 1], i + 1);
+        mad_row_msb<false>(&odd[TLC - 2], &even[TLC - 2], &a[TLC - i - 2], b[i], i + 2);
+      }
+      mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]);
+
+      // merge |even| and |odd|
+      ptx::add_cc(even[TLC - 1], odd[TLC - 2]);
+      for (i = TLC - 1; i < 2 * TLC - 2; i++)
+        even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+      even[i + 1] = ptx::addc(even[i + 1], 0);
+    } else { // single limb: delegate to multiply_raw_device, which produces the full product
+      multiply_raw_device(as, bs, rs);
    }
-    mad_row(&even[TLC], &odd[TLC - 2], a, b[TLC - 1]);
-
-    // merge |even| and |odd|
-    ptx::add_cc(even[TLC - 1], odd[TLC - 2]);
-    for (i = TLC - 1; i < 2 * TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], 0);
  }

  /**
-   * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$.
+   * A function that computes the low half of the fused multiply-and-add \f$ rs = as \cdot bs + cs \f$ where
+   * \f$ bs = 2^{32 \cdot TLC} - p \f$ is the negated modulus returned by `get_neg_modulus()`.
   *
   * For efficiency, this method does not include terms that are too large. Namely, limb product \f$ a_i \cdot b_j \f$
   * is excluded if \f$ i + j > TLC - 1 \f$ and only the lower half is included if \f$ i + j = TLC - 1 \f$. All other
   * limb products are included.
   */
-  static __device__ __forceinline__ void
-  multiply_and_add_lsb_raw_device(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
+  static DEVICE_INLINE void
+  multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
  {
+    ff_storage bs = get_neg_modulus();
    const uint32_t* a = as.limbs;
    const uint32_t* b = bs.limbs;
+    uint32_t* c = cs.limbs;
    uint32_t* even = rs.limbs;
-    __align__(16) uint32_t odd[TLC - 1];
-    size_t i;
-    // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
-    // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
-    // less efficient SASS generated by nvcc, so this case needed separate handling.
-    if (b[0] == UINT32_MAX) {
-      add_sub_u32_device<true, false>(cs.limbs, a, even, TLC);
-      for (i = 0; i < TLC - 1; i++)
-        odd[i] = a[i];
-    } else {
-      mul_n_and_add(even, a, b[0], cs.limbs, TLC);
-      mul_n(odd, a + 1, b[0], TLC - 1);
-    }
-    mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
-#pragma unroll
-    for (i = 2; i < TLC - 1; i += 2) {
-      mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
-      mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
-    }

-    // merge |even| and |odd|
-    even[1] = ptx::add_cc(even[1], odd[0]);
-    for (i = 1; i < TLC - 2; i++)
-      even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
-    even[i + 1] = ptx::addc(even[i + 1], odd[i]);
+    if constexpr (TLC > 2) {
+      __align__(16) uint32_t odd[TLC - 1];
+      size_t i;
+      // `b[0]` is \f$ 2^{32} \f$ minus the last limb of prime modulus. Because most scalar (and some base) primes
+      // are necessarily NTT-friendly, `b[0]` often turns out to be \f$ 2^{32} - 1 \f$. This actually leads to
+      // less efficient SASS generated by nvcc, so this case needed separate handling.
+      if (b[0] == UINT32_MAX) {
+        add_sub_u32_device<true, false>(c, a, even, TLC);
+        for (i = 0; i < TLC - 1; i++)
+          odd[i] = a[i];
+      } else {
+        mul_n_and_add(even, a, b[0], c, TLC);
+        mul_n(odd, a + 1, b[0], TLC - 1);
+      }
+      mad_row_lsb(&even[2], &odd[0], a, b[1], TLC - 1);
+      UNROLL
+      for (i = 2; i < TLC - 1; i += 2) {
+        mad_row_lsb(&odd[i], &even[i], a, b[i], TLC - i);
+        mad_row_lsb(&even[i + 2], &odd[i], a, b[i + 1], TLC - i - 1);
+      }
+
+      // merge |even| and |odd|
+      even[1] = ptx::add_cc(even[1], odd[0]);
+      for (i = 1; i < TLC - 2; i++)
+        even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]);
+      even[i + 1] = ptx::addc(even[i + 1], odd[i]);
+    } else if (TLC == 2) {
+      even[0] = ptx::mad_lo_cc(a[0], b[0], c[0]);  // lo(a0*b0) + c0; the carry-out feeds the next limb
+      even[1] = ptx::madc_hi_cc(a[0], b[0], c[1]); // hi(a0*b0) + c1 + carry; its own carry-out is discarded
+      even[1] = ptx::mad_lo(a[0], b[1], even[1]);
+      even[1] = ptx::mad_lo(a[1], b[0], even[1]);
+    } else if (TLC == 1) {
+      even[0] = ptx::mad_lo(a[0], b[0], c[0]);
+    }
  }

  /**
@@ -545,7 +558,7 @@ public:
   * that the top bit of \f$ a_{hi} \f$ and \f$ b_{hi} \f$ are unset. This ensures correctness by allowing to keep the
   * result inside TLC limbs and ignore the carries from the highest limb.
   */
-  static __device__ __forceinline__ void
+  static DEVICE_INLINE void
  multiply_and_add_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even, uint32_t* in1, uint32_t* in2)
  {
    __align__(16) uint32_t odd[TLC - 2];
@@ -553,7 +566,7 @@ public:
    uint32_t carry = mul_n_and_add(odd, a + 1, b[0], &in2[1]);

    size_t i;
-#pragma unroll
+    UNROLL
    for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
      carry = mad_row<true, false>(
        &even[i], &odd[i - 2], a, b[i - 1], TLC >> 1, in1[(TLC >> 1) + i - 2], in1[(TLC >> 1) + i - 1], carry);
@@ -574,7 +587,7 @@ public:
   * This method multiplies `a` and `b` and writes the result into `even`. It assumes that `a` and `b` are TLC/2 limbs
   * long. The usual schoolbook algorithm is used.
   */
-  static __device__ __forceinline__ void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
+  static DEVICE_INLINE void multiply_short_raw_device(const uint32_t* a, const uint32_t* b, uint32_t* even)
  {
    __align__(16) uint32_t odd[TLC - 2];
    mul_n(even, a, b[0], TLC >> 1);
@@ -582,7 +595,7 @@ public:
    mad_row(&even[2], &odd[0], a, b[1], TLC >> 1);

    size_t i;
-#pragma unroll
+    UNROLL
    for (i = 2; i < ((TLC >> 1) - 1); i += 2) {
      mad_row(&odd[i], &even[i], a, b[i], TLC >> 1);
      mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC >> 1);
@@ -606,29 +619,47 @@ public:
    const uint32_t* a = as.limbs;
    const uint32_t* b = bs.limbs;
    uint32_t* r = rs.limbs;
-    // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \$f) and
-    // write the results into `r`.
-    multiply_short_raw_device(a, b, r);
-    multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
-    __align__(16) uint32_t middle_part[TLC];
-    __align__(16) uint32_t diffs[TLC];
-    // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \$f are written into `diffs`, signs written to
-    // `carry1` and `carry2`.
-    uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
-    uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
-    // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
-    // This is where the assumption about unset high bit of `a` and `b` is relevant.
-    multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
-    // Corrections that need to be performed when differences are negative.
-    // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
-    if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
-    if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
-    // Now that middle part is fully correct, it can be added to the result.
-    add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);
+    if constexpr (TLC > 2) {
+      // Next two lines multiply high and low halves of operands (\f$ a_{lo} \cdot b_{lo}; a_{hi} \cdot b_{hi} \f$) and
+      // write the results into `r`.
+      multiply_short_raw_device(a, b, r);
+      multiply_short_raw_device(&a[TLC >> 1], &b[TLC >> 1], &r[TLC]);
+      __align__(16) uint32_t middle_part[TLC];
+      __align__(16) uint32_t diffs[TLC];
+      // Differences of halves \f$ a_{hi} - a_{lo}; b_{lo} - b_{hi} \f$ are written into `diffs`, signs written to
+      // `carry1` and `carry2`.
+      uint32_t carry1 = add_sub_u32_device<true, true>(&a[TLC >> 1], a, diffs);
+      uint32_t carry2 = add_sub_u32_device<true, true>(b, &b[TLC >> 1], &diffs[TLC >> 1]);
+      // Compute the "middle part" of Karatsuba: \f$ a_{lo} \cdot b_{hi} + b_{lo} \cdot a_{hi} \f$.
+      // This is where the assumption about unset high bit of `a` and `b` is relevant.
+      multiply_and_add_short_raw_device(diffs, &diffs[TLC >> 1], middle_part, r, &r[TLC]);
+      // Corrections that need to be performed when differences are negative.
+      // Again, carry doesn't need to be propagated due to unset high bits of `a` and `b`.
+      if (carry1) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], &diffs[TLC >> 1], &middle_part[TLC >> 1]);
+      if (carry2) add_sub_u32_device<true, false>(&middle_part[TLC >> 1], diffs, &middle_part[TLC >> 1]);
+      // Now that middle part is fully correct, it can be added to the result.
+      add_sub_u32_device<false, true>(&r[TLC >> 1], middle_part, &r[TLC >> 1], TLC);

-    // Carry from adding middle part has to be propagated to the highest limb.
-    for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
-      r[i] = ptx::addc_cc(r[i], 0);
+      // Carry from adding middle part has to be propagated to the highest limb.
+      for (size_t i = TLC + (TLC >> 1); i < 2 * TLC; i++)
+        r[i] = ptx::addc_cc(r[i], 0);
+    } else if (TLC == 2) {
+      __align__(8) uint32_t odd[2];
+      r[0] = ptx::mul_lo(a[0], b[0]);
+      r[1] = ptx::mul_hi(a[0], b[0]);
+      r[2] = ptx::mul_lo(a[1], b[1]);
+      r[3] = ptx::mul_hi(a[1], b[1]);
+      odd[0] = ptx::mul_lo(a[0], b[1]);
+      odd[1] = ptx::mul_hi(a[0], b[1]);
+      odd[0] = ptx::mad_lo(a[1], b[0], odd[0]);
+      odd[1] = ptx::mad_hi(a[1], b[0], odd[1]);
+      r[1] = ptx::add_cc(r[1], odd[0]);
+      r[2] = ptx::addc_cc(r[2], odd[1]);
+      r[3] = ptx::addc(r[3], 0);
+    } else if (TLC == 1) {
+      r[0] = ptx::mul_lo(a[0], b[0]);
+      r[1] = ptx::mul_hi(a[0], b[0]);
+    }
  }

  static HOST_INLINE void multiply_raw_host(const ff_storage& as, const ff_storage& bs, ff_wide_storage& rs)
@@ -654,13 +685,13 @@ public:
  }

  static HOST_DEVICE_INLINE void
-  multiply_and_add_lsb_raw(const ff_storage& as, const ff_storage& bs, ff_storage& cs, ff_storage& rs)
+  multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
  {
 #ifdef __CUDA_ARCH__
-    return multiply_and_add_lsb_raw_device(as, bs, cs, rs);
+    return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
 #else
    Wide r_wide = {};
-    multiply_raw_host(as, bs, r_wide.limbs_storage);
+    multiply_raw_host(as, get_neg_modulus(), r_wide.limbs_storage);
    Field r = Wide::get_lower(r_wide);
    add_limbs<false>(cs, r.limbs_storage, rs);
 #endif
@@ -680,7 +711,7 @@ public:

  HOST_DEVICE_INLINE uint32_t* export_limbs() { return (uint32_t*)limbs_storage.limbs; }

-  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
+  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    const uint32_t limb_lsb_idx = (digit_num * digit_width) / 32;
    const uint32_t shift_bits = (digit_num * digit_width) % 32;
@@ -791,7 +822,7 @@ public:
    Field xs_lo = Wide::get_lower(xs);
    // Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
    // `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
-    multiply_and_add_lsb_raw(l_hi.limbs_storage, get_neg_modulus(), xs_lo.limbs_storage, r.limbs_storage);
+    multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
    ff_storage r_reduced = {};
    uint32_t carry;
    // As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
@@ -817,7 +848,7 @@ public:
    const uint32_t* x = xs.limbs_storage.limbs;
    const uint32_t* y = ys.limbs_storage.limbs;
    uint32_t limbs_or = x[0] ^ y[0];
-#pragma unroll
+    UNROLL
    for (unsigned i = 1; i < TLC; i++)
      limbs_or |= x[i] ^ y[i];
    return limbs_or == 0;
@@ -836,7 +867,7 @@ public:
    Field mul = multiplier;
    static bool is_u32 = true;
 #ifdef __CUDA_ARCH__
-#pragma unroll
+    UNROLL
 #endif
    for (unsigned i = 1; i < TLC; i++)
      is_u32 &= (mul.limbs_storage.limbs[i] == 0);
@@ -852,7 +883,7 @@ public:
    T temp = xs;
    bool is_zero = true;
 #ifdef __CUDA_ARCH__
-#pragma unroll
+    UNROLL
 #endif
    for (unsigned i = 0; i < 32; i++) {
      if (multiplier & (1 << i)) {
@@ -895,21 +926,24 @@ public:
    return rs;
  }

+  // Assumes the number is even!
  template <unsigned MODULUS_MULTIPLE = 1>
  static constexpr HOST_DEVICE_INLINE Field div2(const Field& xs)
  {
    const uint32_t* x = xs.limbs_storage.limbs;
    Field rs = {};
    uint32_t* r = rs.limbs_storage.limbs;
+    if constexpr (TLC > 1) {
 #ifdef __CUDA_ARCH__
-#pragma unroll
+      UNROLL
 #endif
-    for (unsigned i = 0; i < TLC - 1; i++) {
+      for (unsigned i = 0; i < TLC - 1; i++) {
 #ifdef __CUDA_ARCH__
-      r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
+        r[i] = __funnelshift_rc(x[i], x[i + 1], 1);
 #else
-      r[i] = (x[i] >> 1) | (x[i + 1] << 31);
+        r[i] = (x[i] >> 1) | (x[i + 1] << 31);
 #endif
+      }
    }
    r[TLC - 1] = x[TLC - 1] >> 1;
    return sub_modulus<MODULUS_MULTIPLE>(rs);
@@ -969,4 +1003,13 @@ struct std::hash<Field<CONFIG>> {
      hash ^= std::hash<uint32_t>()(key.limbs_storage.limbs[i]) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
    return hash;
  }
+};
+
+template <class CONFIG>
+struct SharedMemory<Field<CONFIG>> {
+  __device__ Field<CONFIG>* getPointer()
+  {
+    extern __shared__ Field<CONFIG> s_scalar_[];
+    return s_scalar_;
+  }
 };
--- a/icicle/include/fields/field_config.cuh
+++ b/icicle/include/fields/field_config.cuh
@@ -0,0 +1,51 @@
+#pragma once
+#ifndef FIELD_CONFIG_H
+#define FIELD_CONFIG_H
+
+#include "fields/id.h"
+#include "fields/field.cuh"
+
+#if FIELD_ID == BN254
+#include "fields/snark_fields/bn254_scalar.cuh"
+using bn254::fp_config;
+#elif FIELD_ID == BLS12_381
+#include "fields/snark_fields/bls12_381_scalar.cuh"
+using bls12_381::fp_config;
+#elif FIELD_ID == BLS12_377
+#include "fields/snark_fields/bls12_377_scalar.cuh"
+using bls12_377::fp_config;
+#elif FIELD_ID == BW6_761
+#include "fields/snark_fields/bls12_377_base.cuh"
+typedef bls12_377::fq_config fp_config;
+#elif FIELD_ID == GRUMPKIN
+#include "fields/snark_fields/bn254_base.cuh"
+typedef bn254::fq_config fp_config;
+
+#elif FIELD_ID == BABY_BEAR
+#include "fields/stark_fields/baby_bear.cuh"
+using baby_bear::fp_config;
+#ifdef EXT_FIELD
+#include "fields/quartic_extension.cuh"
+#endif
+#endif
+
+/**
+ * @namespace field_config
+ * Namespace with type definitions for finite fields. Concrete types follow the field chosen at build time
+ * (the `-DFIELD` build option); this header picks `fp_config` through the `FIELD_ID` checks above.
+ */
+namespace field_config {
+  /**
+   * Scalar field: `Field` instantiated with the selected `fp_config`. Is always a prime field.
+   */
+  typedef Field<fp_config> scalar_t;

+#ifdef EXT_FIELD
+  /**
+   * Extension field over `scalar_t`; only declared when `EXT_FIELD` is defined for the build (`-DEXT_FIELD`).
+   */
+  typedef ExtensionField<fp_config> extension_t;
+#endif
+} // namespace field_config
+
+#endif
--- a/icicle/include/fields/host_math.cuh
+++ b/icicle/include/fields/host_math.cuh
@@ -4,7 +4,7 @@

 #include <cstdint>
 #include <cuda_runtime.h>
-
+#include "gpu-utils/modifiers.cuh"
 namespace host_math {

  // return x + y with uint32_t operands
@@ -67,9 +67,9 @@ namespace host_math {
  struct carry_chain {
    unsigned index;

-    constexpr __host__ __forceinline__ carry_chain() : index(0) {}
+    constexpr HOST_INLINE carry_chain() : index(0) {}

-    __host__ __forceinline__ uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
+    HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
    {
      index++;
      if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
@@ -82,7 +82,7 @@ namespace host_math {
        return host_math::addc(x, y, carry);
    }

-    __host__ __forceinline__ uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
+    HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
    {
      index++;
      if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
--- a/icicle/include/fields/id.h
+++ b/icicle/include/fields/id.h
@@ -0,0 +1,13 @@
+#pragma once
+#ifndef FIELD_ID_H
+#define FIELD_ID_H
+
+#define BN254     1
+#define BLS12_381 2
+#define BLS12_377 3
+#define BW6_761   4
+#define GRUMPKIN  5
+
+#define BABY_BEAR 1001
+
+#endif
--- a/icicle/include/fields/ptx.cuh
+++ b/icicle/include/fields/ptx.cuh
--- a/icicle/include/fields/quadratic_extension.cuh
+++ b/icicle/include/fields/quadratic_extension.cuh
@@ -1,15 +1,15 @@
 #pragma once

 #include "field.cuh"
-
-#define HOST_INLINE        __host__ __forceinline__
-#define DEVICE_INLINE      __device__ __forceinline__
-#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"

 template <typename CONFIG>
 class ExtensionField
 {
 private:
+  friend Field<CONFIG>;
+
  typedef typename Field<CONFIG>::Wide FWide;

  struct ExtensionWide {
@@ -50,6 +50,12 @@ public:

  static HOST_INLINE ExtensionField rand_host() { return ExtensionField{FF::rand_host(), FF::rand_host()}; }

+  static void RandHostMany(ExtensionField* out, int size)
+  { // writes `size` fresh rand_host() elements to out[0], out[1], ... in ascending order; nothing if size <= 0
+    for (int remaining = size; remaining > 0; --remaining)
+      out[size - remaining] = rand_host();
+  }
+
  template <unsigned REDUCTION_SIZE = 1>
  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
  {
@@ -72,15 +78,47 @@ public:
    return ExtensionField{xs.real - ys.real, xs.imaginary - ys.imaginary};
  }

+  friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs + ys.real, ys.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
+  { // xs contributes only to the real part, so the imaginary coefficient of xs - ys is just -ys.imaginary
+    return ExtensionField{xs - ys.real, FF::neg(ys.imaginary)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real + ys, xs.imaginary};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real - ys, xs.imaginary};
+  }
+
  template <unsigned MODULUS_MULTIPLE = 1>
  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
  {
    FWide real_prod = FF::mul_wide(xs.real, ys.real);
    FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary);
    FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary);
-    FWide i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FWide::neg(i_sq_times_im) : i_sq_times_im;
-    return ExtensionWide{real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod};
+    FWide nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(imaginary_prod);
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FWide::neg(nonresidue_times_im) : nonresidue_times_im;
+    return ExtensionWide{real_prod + nonresidue_times_im, prod_of_sums - real_prod - imaginary_prod};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
+  {
+    return ExtensionWide{FF::mul_wide(xs.real, ys), FF::mul_wide(xs.imaginary, ys)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
+  {
+    return mul_wide(ys, xs);
  }

  template <unsigned MODULUS_MULTIPLE = 1>
@@ -90,7 +128,8 @@ public:
      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.imaginary)};
  }

-  friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys)
+  template <class T1, class T2>
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
  {
    ExtensionWide xy = mul_wide(xs, ys);
    return reduce(xy);
@@ -114,9 +153,9 @@ public:
    FF imaginary_prod = FF::template mul_const<mul_imaginary>(xs_imaginary);
    FF re_im = FF::template mul_const<mul_real>(xs_imaginary);
    FF im_re = FF::template mul_const<mul_imaginary>(xs_real);
-    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(imaginary_prod);
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
-    return ExtensionField{real_prod + i_sq_times_im, re_im + im_re};
+    FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(imaginary_prod);
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im;
+    return ExtensionField{real_prod + nonresidue_times_im, re_im + im_re};
  }

  template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
@@ -145,14 +184,23 @@ public:
    return ExtensionField{FF::neg(xs.real), FF::neg(xs.imaginary)};
  }

-  // inverse assumes that xs is nonzero
+  // inverse of zero is set to be zero which is what we want most of the time
  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
  {
    ExtensionField xs_conjugate = {xs.real, FF::neg(xs.imaginary)};
-    FF i_sq_times_im = FF::template mul_unsigned<CONFIG::i_squared>(FF::sqr(xs.imaginary));
-    i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im;
+    FF nonresidue_times_im = FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr(xs.imaginary));
+    nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im;
    // TODO: wide here
-    FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im;
+    FF xs_norm_squared = FF::sqr(xs.real) - nonresidue_times_im;
    return xs_conjugate * ExtensionField{FF::inverse(xs_norm_squared), FF::zero()};
  }
+};
+
+template <class CONFIG>
+struct SharedMemory<ExtensionField<CONFIG>> {
+  __device__ ExtensionField<CONFIG>* getPointer()
+  {
+    extern __shared__ ExtensionField<CONFIG> s_ext2_scalar_[];
+    return s_ext2_scalar_;
+  }
 };
--- a/icicle/include/fields/quartic_extension.cuh
+++ b/icicle/include/fields/quartic_extension.cuh
@@ -0,0 +1,257 @@
+#pragma once
+
+#include "field.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "gpu-utils/sharedmem.cuh"
+
+template <typename CONFIG>
+class ExtensionField
+{
+private:
+  typedef typename Field<CONFIG>::Wide FWide;
+
+  struct ExtensionWide {
+    FWide real;
+    FWide im1;
+    FWide im2;
+    FWide im3;
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
+    }
+
+    friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys)
+    {
+      return ExtensionWide{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
+    }
+  };
+
+public:
+  typedef Field<CONFIG> FF;
+  static constexpr unsigned TLC = 4 * CONFIG::limbs_count;
+
+  FF real;
+  FF im1;
+  FF im2;
+  FF im3;
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField zero()
+  {
+    return ExtensionField{FF::zero(), FF::zero(), FF::zero(), FF::zero()};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField one()
+  {
+    return ExtensionField{FF::one(), FF::zero(), FF::zero(), FF::zero()};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField ToMontgomery(const ExtensionField& xs)
+  {
+    return ExtensionField{
+      xs.real * FF{CONFIG::montgomery_r}, xs.im1 * FF{CONFIG::montgomery_r}, xs.im2 * FF{CONFIG::montgomery_r},
+      xs.im3 * FF{CONFIG::montgomery_r}};
+  }
+
+  static constexpr HOST_DEVICE_INLINE ExtensionField FromMontgomery(const ExtensionField& xs)
+  {
+    return ExtensionField{
+      xs.real * FF{CONFIG::montgomery_r_inv}, xs.im1 * FF{CONFIG::montgomery_r_inv},
+      xs.im2 * FF{CONFIG::montgomery_r_inv}, xs.im3 * FF{CONFIG::montgomery_r_inv}};
+  }
+
+  static HOST_INLINE ExtensionField rand_host()
+  {
+    return ExtensionField{FF::rand_host(), FF::rand_host(), FF::rand_host(), FF::rand_host()};
+  }
+
+  static void RandHostMany(ExtensionField* out, int size)
+  { // writes `size` fresh rand_host() elements to out[0], out[1], ... in ascending order; nothing if size <= 0
+    for (int remaining = size; remaining > 0; --remaining)
+      out[size - remaining] = rand_host();
+  }
+
+  template <unsigned REDUCTION_SIZE = 1> // forwarded to FF::sub_modulus for each of the four coefficients
+  static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField& xs)
+  { // coefficients are passed as FF objects (not addresses); `template` is needed for the dependent member call
+    return ExtensionField{
+      FF::template sub_modulus<REDUCTION_SIZE>(xs.real), FF::template sub_modulus<REDUCTION_SIZE>(xs.im1),
+      FF::template sub_modulus<REDUCTION_SIZE>(xs.im2), FF::template sub_modulus<REDUCTION_SIZE>(xs.im3)};
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs)
+  {
+    os << "{ Real: " << xs.real << " }; { Im1: " << xs.im1 << " }; { Im2: " << xs.im2 << " }; { Im3: " << xs.im3
+       << " };";
+    return os;
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real + ys.real, xs.im1 + ys.im1, xs.im2 + ys.im2, xs.im3 + ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs.real - ys.real, xs.im1 - ys.im1, xs.im2 - ys.im2, xs.im3 - ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(FF xs, const ExtensionField& ys)
+  {
+    return ExtensionField{xs + ys.real, ys.im1, ys.im2, ys.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(FF xs, const ExtensionField& ys)
+  { // xs contributes only to the real part, so im1, im2 and im3 of xs - ys are the negated coefficients of ys
+    return ExtensionField{xs - ys.real, FF::neg(ys.im1), FF::neg(ys.im2), FF::neg(ys.im3)};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator+(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real + ys, xs.im1, xs.im2, xs.im3};
+  }
+
+  friend HOST_DEVICE_INLINE ExtensionField operator-(ExtensionField xs, const FF& ys)
+  {
+    return ExtensionField{xs.real - ys, xs.im1, xs.im2, xs.im3};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1> // MODULUS_MULTIPLE is not referenced in the body
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys)
+  { // unreduced product: every output coefficient combines FF::mul_wide terms, no FF::reduce happens here
+    if (CONFIG::nonresidue_is_negative) // this branch subtracts the nonresidue terms, the else branch adds them
+      return ExtensionWide{ // initializers in member order: real, im1, im2, im3
+        FF::mul_wide(xs.real, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(
+            FF::mul_wide(xs.im1, ys.im3) + FF::mul_wide(xs.im2, ys.im2) + FF::mul_wide(xs.im3, ys.im1)),
+        FF::mul_wide(xs.real, ys.im1) + FF::mul_wide(xs.im1, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, ys.im3) + FF::mul_wide(xs.im3, ys.im2)),
+        FF::mul_wide(xs.real, ys.im2) + FF::mul_wide(xs.im1, ys.im1) + FF::mul_wide(xs.im2, ys.real) -
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, ys.im3)),
+        FF::mul_wide(xs.real, ys.im3) + FF::mul_wide(xs.im1, ys.im2) + FF::mul_wide(xs.im2, ys.im1) +
+          FF::mul_wide(xs.im3, ys.real)};
+    else
+      return ExtensionWide{ // initializers in member order: real, im1, im2, im3
+        FF::mul_wide(xs.real, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(
+            FF::mul_wide(xs.im1, ys.im3) + FF::mul_wide(xs.im2, ys.im2) + FF::mul_wide(xs.im3, ys.im1)),
+        FF::mul_wide(xs.real, ys.im1) + FF::mul_wide(xs.im1, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, ys.im3) + FF::mul_wide(xs.im3, ys.im2)),
+        FF::mul_wide(xs.real, ys.im2) + FF::mul_wide(xs.im1, ys.im1) + FF::mul_wide(xs.im2, ys.real) +
+          FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, ys.im3)),
+        FF::mul_wide(xs.real, ys.im3) + FF::mul_wide(xs.im1, ys.im2) + FF::mul_wide(xs.im2, ys.im1) +
+          FF::mul_wide(xs.im3, ys.real)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const FF& ys)
+  {
+    return ExtensionWide{
+      FF::mul_wide(xs.real, ys), FF::mul_wide(xs.im1, ys), FF::mul_wide(xs.im2, ys), FF::mul_wide(xs.im3, ys)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const FF& xs, const ExtensionField& ys)
+  {
+    return ExtensionWide{
+      FF::mul_wide(xs, ys.real), FF::mul_wide(xs, ys.im1), FF::mul_wide(xs, ys.im2), FF::mul_wide(xs, ys.im3)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs)
+  {
+    return ExtensionField{
+      FF::template reduce<MODULUS_MULTIPLE>(xs.real), FF::template reduce<MODULUS_MULTIPLE>(xs.im1),
+      FF::template reduce<MODULUS_MULTIPLE>(xs.im2), FF::template reduce<MODULUS_MULTIPLE>(xs.im3)};
+  }
+
+  template <class T1, class T2>
+  friend HOST_DEVICE_INLINE ExtensionField operator*(const T1& xs, const T2& ys)
+  {
+    ExtensionWide xy = mul_wide(xs, ys);
+    return reduce(xy);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys)
+  {
+    return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3);
+  }
+
+  friend HOST_DEVICE_INLINE bool operator!=(const ExtensionField& xs, const ExtensionField& ys) { return !(xs == ys); }
+
+  template <uint32_t multiplier, unsigned REDUCTION_SIZE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField mul_unsigned(const ExtensionField& xs)
+  {
+    return {
+      FF::template mul_unsigned<multiplier>(xs.real), FF::template mul_unsigned<multiplier>(xs.im1),
+      FF::template mul_unsigned<multiplier>(xs.im2), FF::template mul_unsigned<multiplier>(xs.im3)};
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionWide sqr_wide(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return mul_wide<MODULUS_MULTIPLE>(xs, xs);
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField sqr(const ExtensionField& xs)
+  {
+    // TODO: change to a more efficient squaring
+    return xs * xs;
+  }
+
+  template <unsigned MODULUS_MULTIPLE = 1>
+  static constexpr HOST_DEVICE_INLINE ExtensionField neg(const ExtensionField& xs)
+  {
+    return {FF::neg(xs.real), FF::neg(xs.im1), FF::neg(xs.im2), FF::neg(xs.im3)};
+  }
+
+  // Needs a single FF::inverse call; inverse of zero comes out as zero, which is what we want most of the time
+  static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs)
+  {
+    FF x, x0, x2; // x0, x2: intermediate FF values built from xs; x: the FF value that actually gets inverted
+    if (CONFIG::nonresidue_is_negative) { // the two branches differ only in the sign of the nonresidue terms
+      x0 = FF::reduce(
+        FF::sqr_wide(xs.real) +
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im1, xs.im3 + xs.im3) - FF::sqr_wide(xs.im2)));
+      x2 = FF::reduce(
+        FF::mul_wide(xs.real, xs.im2 + xs.im2) - FF::sqr_wide(xs.im1) +
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(xs.im3)));
+      x = FF::reduce(FF::sqr_wide(x0) + FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(x2)));
+    } else {
+      x0 = FF::reduce(
+        FF::sqr_wide(xs.real) -
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im1, xs.im3 + xs.im3) - FF::sqr_wide(xs.im2)));
+      x2 = FF::reduce(
+        FF::mul_wide(xs.real, xs.im2 + xs.im2) - FF::sqr_wide(xs.im1) -
+        FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(xs.im3)));
+      x = FF::reduce(FF::sqr_wide(x0) - FF::template mul_unsigned<CONFIG::nonresidue>(FF::sqr_wide(x2)));
+    }
+    FF x_inv = FF::inverse(x); // the only base-field inversion in this routine
+    x0 = x0 * x_inv; // x0 and x2 are pre-scaled by 1/x so the products below need no further division
+    x2 = x2 * x_inv;
+    return { // coefficients in member order: real, im1, im2, im3
+      FF::reduce(
+        (CONFIG::nonresidue_is_negative
+           ? (FF::mul_wide(xs.real, x0) + FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, x2)))
+           : (FF::mul_wide(xs.real, x0)) - FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im2, x2)))),
+      FF::reduce(
+        (CONFIG::nonresidue_is_negative
+           ? FWide::neg(FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, x2)))
+           : FF::template mul_unsigned<CONFIG::nonresidue>(FF::mul_wide(xs.im3, x2))) -
+        FF::mul_wide(xs.im1, x0)),
+      FF::reduce(FF::mul_wide(xs.im2, x0) - FF::mul_wide(xs.real, x2)),
+      FF::reduce(FF::mul_wide(xs.im1, x2) - FF::mul_wide(xs.im3, x0)),
+    };
+  }
+};
+
+template <class CONFIG>
+struct SharedMemory<ExtensionField<CONFIG>> { // SharedMemory specialization for the quartic ExtensionField
+  __device__ ExtensionField<CONFIG>* getPointer()
+  {
+    extern __shared__ ExtensionField<CONFIG> s_ext4_scalar_[]; // unsized `extern __shared__` array declaration
+    return s_ext4_scalar_; // returned to the caller as a plain element pointer
+  }
+};
--- a/icicle/include/fields/snark_fields/bls12_377_base.cuh
+++ b/icicle/include/fields/snark_fields/bls12_377_base.cuh
@@ -1,196 +1,10 @@
 #pragma once
-#ifndef BLS12_377_PARAMS_H
-#define BLS12_377_PARAMS_H
+#ifndef BLS12_377_BASE_PARAMS_H
+#define BLS12_377_BASE_PARAMS_H

-#include "../utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bls12_377 {
-  struct fp_config {
-    static constexpr unsigned limbs_count = 8;
-    static constexpr unsigned omegas_count = 47;
-    static constexpr unsigned modulus_bit_count = 253;
-    static constexpr unsigned num_of_reductions = 1;
-
-    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe,
-                                                     0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
-    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd,
-                                                       0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
-    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb,
-                                                       0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
-    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901,
-                                                         0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
-      0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
-      0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
-      0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
-
-    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b,
-                                               0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f,
-                                                          0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95,
-                                                              0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
-
-    static constexpr storage_array<omegas_count, limbs_count> omega = {
-      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
-       {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
-       {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
-       {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
-       {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
-       {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
-       {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
-       {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
-       {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
-       {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
-       {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
-       {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
-       {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
-       {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
-       {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
-       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
-       {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
-       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
-       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
-       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
-       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
-       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
-       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
-       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
-       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
-       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
-       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
-       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
-       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
-       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
-       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
-       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
-       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
-       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
-       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
-       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
-       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
-       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
-       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
-       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
-       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
-       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
-       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
-       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
-       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
-       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
-       {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
-
-    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
-      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
-       {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
-       {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
-       {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
-       {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
-       {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
-       {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
-       {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
-       {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
-       {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
-       {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
-       {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
-       {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
-       {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
-       {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
-       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
-       {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
-       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
-       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
-       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
-       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
-       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
-       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
-       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
-       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
-       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
-       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
-       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
-       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
-       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
-       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
-       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
-       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
-       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
-       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
-       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
-       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
-       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
-       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
-       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
-       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
-       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
-       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
-       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
-       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
-       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
-       {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
-
-    static constexpr storage_array<omegas_count, limbs_count> inv = {
-      {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
-       {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06},
-       {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2},
-       {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08},
-       {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33},
-       {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9},
-       {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93},
-       {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9},
-       {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab},
-       {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85},
-       {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1},
-       {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8},
-       {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003},
-       {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1},
-       {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007},
-       {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3},
-       {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08},
-       {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3},
-       {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309},
-       {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433},
-       {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9},
-       {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513},
-       {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539},
-       {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b},
-       {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555},
-       {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559},
-       {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c},
-       {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d},
-       {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
-       {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
-       {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
-       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
-       {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
-       {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
-       {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
-       {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
-       {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
-       {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
-       {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
-       {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
-       {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
-       {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
-       {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
-       {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
-       {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
-       {0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
-       {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
-  };
-
  struct fq_config {
    static constexpr unsigned limbs_count = 12;
    static constexpr unsigned omegas_count = 48;
@@ -521,41 +335,11 @@ namespace bls12_377 {
       {0xfffdebde, 0x0ff7ffff, 0x0fffa3d3, 0x8e4c751f, 0x6bcccc32, 0xb7275e5b, 0xdc08ab03, 0x0321276d, 0x28f6304f,
        0xdd22a6ac, 0x17c50a31, 0x01ae3a46}}};

-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 5;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
+    // nonresidue to generate the extension field
+    static constexpr uint32_t nonresidue = 5;
+    // true if nonresidue is negative
+    static constexpr bool nonresidue_is_negative = true;
  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xb21be9ef, 0xeab9b16e, 0xffcd394e, 0xd5481512,
-                                                               0xbd37cb5c, 0x188282c8, 0xaa9d41bb, 0x85951e2c,
-                                                               0xbf87ff54, 0xc8fc6225, 0xfe740a67, 0x008848de};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x559c8ea6, 0xfd82de55, 0x34a9591a, 0xc2fe3d36,
-                                                               0x4fb82305, 0x6d182ad4, 0xca3e52d9, 0xbd7fb348,
-                                                               0x30afeec4, 0x1f674f5d, 0xc5102eff, 0x01914a69};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0x7c005196, 0x74e3e48f, 0xbb535402, 0x71889f52,
-                                                                  0x57db6b9b, 0x7ea501f5, 0x203e5031, 0xc565f071,
-                                                                  0xa3841d01, 0xc89630a2, 0x71c785fe, 0x018480be};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x6ea16afe, 0xb26bfefa, 0xbff76fe6, 0x5cf89984,
-                                                                  0x0799c9de, 0xe7223ece, 0x6651cecb, 0x532777ee,
-                                                                  0xb1b140d5, 0x70dc5a51, 0xe7004031, 0x00ea6040};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x09fd4ddf, 0xf0940944, 0x6d8c7c2e, 0xf2cf8888,
-                                                                  0xf832d204, 0xe458c282, 0x74b49a58, 0xde03ed72,
-                                                                  0xcbb2efb4, 0xd960736b, 0x5d446f7b, 0x00690d66};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0x85eb8f93, 0xd9a1cdd1, 0x5e52270b, 0x4279b83f,
-                                                                  0xcee304c2, 0x2463b01a, 0x3d591bf1, 0x61ef11ac,
-                                                                  0x151a70aa, 0x9e549da3, 0xd2835518, 0x00f8169f};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x9999999a, 0x1c9ed999, 0x1ccccccd, 0x0dd39e5c, 0x3c6bf800, 0x129207b6,
-    0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6};
 } // namespace bls12_377

-#endif
+#endif
--- a/icicle/include/fields/snark_fields/bls12_377_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bls12_377_scalar.cuh
@@ -0,0 +1,195 @@
+#pragma once
+#ifndef BLS12_377_SCALAR_PARAMS_H
+#define BLS12_377_SCALAR_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bls12_377 {
+  struct fp_config { // compile-time constants for the bls12_377 field this header calls "scalar"
+    static constexpr unsigned limbs_count = 8; // words per element; each storage<limbs_count> below lists 8 32-bit hex words
+    static constexpr unsigned omegas_count = 47; // number of rows in the omega, omega_inv and inv tables
+    static constexpr unsigned modulus_bit_count = 253; // top word 0x12ab655e has 29 significant bits: 7 * 32 + 29 = 253
+    static constexpr unsigned num_of_reductions = 1; // NOTE(review): consumer not visible in this header; confirm what is counted
+
+    static constexpr storage<limbs_count> modulus = {0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, // field modulus, least-significant word first
+                                                     0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e};
+    static constexpr storage<limbs_count> modulus_2 = {0x00000002, 0x14230000, 0xa0000002, 0xb354edfd, // 2 * modulus
+                                                       0xb86f6002, 0xc1689a3c, 0x34594aac, 0x2556cabd};
+    static constexpr storage<limbs_count> modulus_4 = {0x00000004, 0x28460000, 0x40000004, 0x66a9dbfb, // 4 * modulus
+                                                       0x70dec005, 0x82d13479, 0x68b29559, 0x4aad957a};
+    static constexpr storage<limbs_count> neg_modulus = {0xffffffff, 0xf5ee7fff, 0x2ffffffe, 0xa6558901, // 2^256 - modulus (two's complement over 8 words)
+                                                         0xa3c84ffe, 0x9f4bb2e1, 0x65d35aa9, 0xed549aa1};
+    static constexpr storage<2 * limbs_count> modulus_wide = { // modulus zero-extended to 2 * limbs_count words
+      0x00000001, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<2 * limbs_count> modulus_squared = { // modulus^2 as a double-width value
+      0x00000001, 0x14230000, 0xe0000002, 0xc7dd4d2f, 0x8585d003, 0x08ee1bd4, 0xe57fc56e, 0x7e7557e3,
+      0x483a709d, 0x1fdebb41, 0x5678f4e6, 0x8ea77334, 0xc19c3ec5, 0xd717de29, 0xe2340781, 0x015c8d01};
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = { // 2 * modulus_squared
+      0x00000002, 0x28460000, 0xc0000004, 0x8fba9a5f, 0x0b0ba007, 0x11dc37a9, 0xcaff8adc, 0xfceaafc7,
+      0x9074e13a, 0x3fbd7682, 0xacf1e9cc, 0x1d4ee668, 0x83387d8b, 0xae2fbc53, 0xc4680f03, 0x02b91a03};
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = { // 4 * modulus_squared
+      0x00000004, 0x508c0000, 0x80000008, 0x1f7534bf, 0x1617400f, 0x23b86f52, 0x95ff15b8, 0xf9d55f8f,
+      0x20e9c275, 0x7f7aed05, 0x59e3d398, 0x3a9dccd1, 0x0670fb16, 0x5c5f78a7, 0x88d01e07, 0x05723407};
+
+    static constexpr storage<limbs_count> m = {0x151e79ea, 0xf5204c21, 0x8d69e258, 0xfd0a180b, // NOTE(review): meaning not shown here; presumably a precomputed reduction constant - confirm
+                                               0xfaa80548, 0xe4e51e49, 0xc40b2c9e, 0x36d9491e};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, // the integer 1
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, // the integer 0
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xfffffff3, 0x7d1c7fff, 0x6ffffff2, 0x7257f50f, // NOTE(review): by name the Montgomery radix R mod modulus; low word matches 2^256 - 13 * modulus - confirm
+                                                          0x512c0fee, 0x16d81575, 0x2bbb9a9d, 0x0d4bda32};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x1beeec02, 0x4122dd1a, 0x74fee875, 0xbd1eae95, // NOTE(review): by name the inverse of montgomery_r mod modulus; not verified here
+                                                              0x27b28e2f, 0x838557e2, 0x2290c02c, 0x07b30191};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega = { // first row is modulus - 1; NOTE(review): presumably row k is a root of unity of order 2^(k+1) - confirm
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000001, 0x8f1a4000, 0xb0000001, 0xcf664765, 0x970dec00, 0x23ed1347, 0x00000000, 0x00000000},
+       {0xfbfa0a01, 0x0f830f7e, 0xd75769a0, 0x20f8b46c, 0xf05d5033, 0x7108bd18, 0x0788de01, 0x07405e08},
+       {0x60b9bdae, 0xc78085a6, 0x789094f5, 0x3116ec22, 0xce87d660, 0x0a02a81d, 0xc2a94856, 0x0ead8236},
+       {0x3e83a7cc, 0x6ffc39d9, 0x958a0a74, 0x117d996e, 0x0b92e8c9, 0xc242289d, 0x29d977d6, 0x0484efb4},
+       {0x0111ec3f, 0x15455b00, 0xc5f6be6f, 0x6b62d7af, 0x337f2d07, 0xfcba0365, 0x43fccd26, 0x0f151842},
+       {0xc31ec69b, 0x57951b2e, 0x2a37ce1f, 0x3e0a4be7, 0xcf3b198a, 0x960aeb4a, 0x341fd5cd, 0x04fb0673},
+       {0xa921851f, 0x71c1b78e, 0x7808f239, 0x3c26340c, 0x976fb990, 0xbcc8f69b, 0xe880dc71, 0x06a5edb2},
+       {0xc0f5679e, 0x7619eab5, 0x0dc0b9cd, 0x1f4cd10e, 0xbf6a480a, 0x7e1b70aa, 0x7f5461bb, 0x0ffc66da},
+       {0xec5cbab2, 0x8159806d, 0x498264a3, 0x14ea1333, 0xe3abfaa6, 0x56bbe1d8, 0x02aa031f, 0x09d2b5c4},
+       {0xc010c48a, 0xd2aa9562, 0x3b004b60, 0x447e5c11, 0x11e243bb, 0xd5a21c13, 0x0ab418b1, 0x01eab23e},
+       {0xacff6986, 0x08715ee8, 0xa93924d0, 0xab01878a, 0x6e9ae5c4, 0xbfbc5e71, 0x26b08d6e, 0x0f8000bf},
+       {0x3ddbc679, 0x06bc13b0, 0x615256ce, 0x7269a1f1, 0x1f5221a2, 0xf7716fbf, 0x8c66c14f, 0x0fa1f02c},
+       {0x906f531f, 0xdd40f131, 0x30728eff, 0xb06b29c7, 0x88839294, 0xc891fd19, 0x646978e8, 0x04e88447},
+       {0x6e259cdc, 0xb1e4b769, 0x00514e5e, 0xbcb0b709, 0x05113e7f, 0x74edb7c0, 0xe92e22af, 0x10c88511},
+       {0x240ede5b, 0xebb2e898, 0x42cd84c6, 0xc2639185, 0x9408f956, 0xf79e8391, 0x94e87a7d, 0x06872fa1},
+       {0x260678ff, 0xf8522249, 0xa8de9973, 0x6148cb16, 0x5a4e8d56, 0x5750f3f4, 0xbaeaf0c3, 0x0e805156},
+       {0x3d766f80, 0x1b4b71cf, 0x1069012d, 0x47d21195, 0x9151ebec, 0x5635235f, 0x2b13c808, 0x093f7d91},
+       {0x4637701d, 0x0848f958, 0x4c8353af, 0x8a750076, 0x0ef6174a, 0x485f4e4f, 0xf38db632, 0x078d97a1},
+       {0x66a16869, 0x50c487c1, 0xd1fd4525, 0x380a66ab, 0x265e8539, 0xd455a01a, 0x064b5334, 0x0cd62875},
+       {0x3358eb25, 0xdbc547bc, 0x722037db, 0x8909d398, 0x5e705b6d, 0x8b7075b5, 0x9bdaf407, 0x02694bb2},
+       {0xf45b9621, 0x102fbfb0, 0xf04faac0, 0xe80f4241, 0x7ca61177, 0x0b830bfd, 0x7033169d, 0x10521892},
+       {0xcc943028, 0xed2576ad, 0xfa4c6090, 0x846e49bc, 0x0049d8e6, 0xc74c1865, 0x665d7be5, 0x0e9c5a12},
+       {0xafeb494b, 0x97319dcd, 0x1d78404c, 0xab30c83e, 0xf26ffe90, 0x452d8a48, 0xa36452c7, 0x0bfc2e92},
+       {0xedc626c3, 0xf30e312d, 0xcf1f3a94, 0x8367a7ca, 0x917a1b28, 0x621e15e1, 0xf2e93b82, 0x07cd59f8},
+       {0xf02ba42c, 0x553085d9, 0x1119b10d, 0x59662159, 0x6b8ea03f, 0xaa670958, 0x7ce92983, 0x066f6f5f},
+       {0x4dd87a5e, 0xf423a283, 0xd9a4c364, 0x1fe46601, 0xbfdc7e9b, 0xda4addbf, 0x3bf94b2b, 0x0a7f2bd8},
+       {0xe5f8848a, 0x270a2326, 0xa727567d, 0x97d14afa, 0x48746fc7, 0x1a3a5a4e, 0xa42f077a, 0x0044e4b1},
+       {0x20b7298a, 0xd7652451, 0x65013b06, 0xc7c9a0b7, 0xad0d8457, 0x479b82a9, 0x0c99f5ce, 0x0bef1e5a},
+       {0x1912f7fa, 0x77d7da1d, 0x299fd7d6, 0xbcb7a5b2, 0x142a4480, 0x705e45dd, 0xb492dbd8, 0x0dc835fd},
+       {0xa0234d2d, 0xe943054c, 0xe5f5be5e, 0x673b0ee0, 0x5048a19a, 0xcdd48e41, 0xabc3cb99, 0x0997d277},
+       {0xa9966ac4, 0x1ae0ea67, 0xda83fb3b, 0x4e2dbb1c, 0x0b51380e, 0xf77cf749, 0xb28a7670, 0x048b4b0e},
+       {0xb14361d4, 0x7f1db43f, 0x25ab6d51, 0x7927e578, 0x383bf21e, 0xb43e52a5, 0xd27fa99f, 0x077595e9},
+       {0xa90a2740, 0xfe3ca4f0, 0x512a7c7a, 0xd259ff36, 0xb41fe696, 0xbca3176a, 0xf33132ce, 0x05bd5ea3},
+       {0xf284f768, 0xdeee484b, 0xe26a0475, 0x2a02e015, 0x88d968c2, 0xf0eb4925, 0x82a391c9, 0x0620ce9e},
+       {0xbd83a3da, 0xd3b69b29, 0xe02ce197, 0x9543950f, 0xc2f87783, 0x80799665, 0xc15be215, 0x11ce8199},
+       {0x1b29736e, 0x8f267f19, 0x1d5a0c3a, 0xa2e04d58, 0x1ae99514, 0x76803064, 0x57f7c806, 0x12129439},
+       {0xf32d6bac, 0xa0b973d4, 0xf0d81b72, 0xae951889, 0x2e2daa0a, 0x51dbe098, 0x40d9af8f, 0x04679474},
+       {0x22df9f13, 0x56313de8, 0x599e7536, 0xe2e75200, 0x6d163e50, 0xa1b4fce7, 0xc8111763, 0x0aec2172},
+       {0x355dd694, 0x4258374d, 0x44c76a20, 0x5c31e8ac, 0xaa5fd062, 0x9b473969, 0x1a37b6b4, 0x0a693d77},
+       {0x44ddbbdc, 0xbafb92a6, 0x26b01974, 0x63c7a02d, 0x5f28a274, 0x0ff86e13, 0x867f2e29, 0x0a7b462a},
+       {0xd5fba57b, 0x90684fea, 0xe0defe98, 0xed237883, 0x030ae924, 0xc502b692, 0xe7a1ec2c, 0x08aa58e8},
+       {0x5e9020dd, 0xade9d4b4, 0x87db8813, 0x489259d2, 0x25051238, 0x5ddce740, 0xb5bc4d11, 0x0c775db1},
+       {0x293f8481, 0xd52cc17a, 0x6f133205, 0x041178fb, 0xb2961832, 0xbbc70d18, 0x481760cd, 0x073d34d1},
+       {0xfdacff58, 0x8215b91d, 0x98331645, 0xd8d9177d, 0x439e803c, 0xe85223ad, 0xcca42c1f, 0x04aa8ef0},
+       {0x01ab3a4d, 0x006f60fa, 0x814ba450, 0xe6600e15, 0xdf9eb147, 0xbde4df36, 0x33760d7b, 0x055d58fa},
+       {0xec2a895e, 0x476ef4a4, 0x63e3f04a, 0x9b506ee3, 0xd1a8a12f, 0x60c69477, 0x0cb92cc1, 0x11d4b7f6}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = { // first row is modulus - 1 (same as omega); NOTE(review): presumably row-wise inverses of omega - confirm
+      {{0x00000000, 0x0a118000, 0xd0000001, 0x59aa76fe, 0x5c37b001, 0x60b44d1e, 0x9a2ca556, 0x12ab655e},
+       {0x00000000, 0x7af74000, 0x1fffffff, 0x8a442f99, 0xc529c400, 0x3cc739d6, 0x9a2ca556, 0x12ab655e},
+       {0xd60fb046, 0xc9fa190c, 0xc5b4674e, 0xdb5c179b, 0xbc7b8726, 0x2b2bce0b, 0xbf6e69bf, 0x0e4eb338},
+       {0x8ffc4ed5, 0x74732d1f, 0xb7f2eefc, 0x42d9f590, 0xa24dd4dd, 0xf70461e5, 0xef64676f, 0x03b6eba4},
+       {0x102bbab0, 0x5a21f98a, 0x8d8e2efb, 0xa6a147a9, 0x7612906f, 0x0eb4f005, 0x47d8d2e3, 0x0e1a5481},
+       {0xd01e5aa8, 0x6e509add, 0x6e3f123d, 0xe1582468, 0x8274db24, 0xbd6313ee, 0xd173a634, 0x05d5836e},
+       {0xe975c0cf, 0x6aab3344, 0x6f1dc38e, 0xca362e0e, 0x1dd1743a, 0x2fe72cda, 0xc1b4c4c2, 0x0c1c956e},
+       {0xec89a64f, 0x59fe97a0, 0xe8de5d4c, 0x579617d7, 0xc9c1ea7b, 0x256a305b, 0x53fa131b, 0x01ffae4e},
+       {0x29bcb088, 0x463a73ff, 0xe1438e80, 0xee9e9a5e, 0x3c9369e4, 0x2a00951f, 0x80a32052, 0x09711183},
+       {0x4bec8dd2, 0xa36899db, 0x96393687, 0x2946872e, 0x842df3c8, 0xd4b5734f, 0x5f5cd8fb, 0x0834098f},
+       {0xe3c711b9, 0x4bc485f6, 0x648d1d7e, 0xf43a2598, 0xee88abaa, 0x7f981a0e, 0xec6a3f27, 0x0c88c9c3},
+       {0x49046b52, 0x42bcc6c2, 0x56ab9ecc, 0xcc77294a, 0xe4df3ddd, 0x02ecb41a, 0x67f76726, 0x0e567d22},
+       {0x91c64fc2, 0x1cc56cc3, 0xd16a490b, 0x8cb71e65, 0x14fac366, 0x984be37e, 0xa25d7ba5, 0x0a08e032},
+       {0xd4f5941e, 0x966d9739, 0xe5772a73, 0x5805deb6, 0x5c1f970c, 0xe4eb0d33, 0xbdf35409, 0x039715db},
+       {0xcc6518ac, 0x8419686c, 0x9c7a2366, 0x96dec3a8, 0x71724384, 0xefbfcac6, 0xaf34c239, 0x0c44b99a},
+       {0xc18ff4fd, 0xcb66fe1b, 0x86c8d586, 0x588e18b3, 0x1dfab57c, 0xc6e6d2a3, 0x7d7d4efd, 0x10918ad2},
+       {0x97a18f58, 0x56d6cf22, 0xd0d7abd9, 0x11710758, 0x5eb7a9c5, 0xd1a6608b, 0xc4937e38, 0x04059bdb},
+       {0x4b1b63a9, 0x12998cbc, 0xcf420c9f, 0x0f780c6c, 0x129289ad, 0xa5e48723, 0x240a141d, 0x0a3a1223},
+       {0x00db2b48, 0xa43c0e02, 0x933d10ee, 0x76585489, 0xc0ba6a80, 0x12d64af1, 0x2fad8d8e, 0x01940f43},
+       {0x1d75bec9, 0xe29ef6c0, 0xd4b0183b, 0xead287a2, 0xedfd3795, 0x75a017cf, 0x64427c8e, 0x107f8d0f},
+       {0xa26c8c12, 0xa6f4e1d1, 0xf6610f7e, 0x13571553, 0x56701caf, 0xd95e5df6, 0x2263d69d, 0x050e7b89},
+       {0xc161761f, 0x271d7caf, 0xc369a371, 0xf1001d6f, 0x00e60f51, 0x65286415, 0xb74d14b8, 0x00b918f9},
+       {0x03ad3139, 0x01d3f431, 0xa137ce16, 0xe56f6002, 0x1deb42e8, 0x97f53369, 0xaa37cddd, 0x033fa9ac},
+       {0x60cf1330, 0x840f913b, 0x1df5ed87, 0x5610cde6, 0x72b36ddf, 0x858381b0, 0x6f64e0b7, 0x109bf66c},
+       {0x930cee0b, 0x432d3626, 0xf26e8ba3, 0x55ed3efb, 0x14c5457f, 0x802eebcc, 0xe2310f22, 0x00d300e3},
+       {0x4b9ac952, 0x3d29f5ba, 0xc8ea8f94, 0x7c7f2662, 0xcefc3052, 0x736ccb63, 0x0981f3cb, 0x04bfce2f},
+       {0x5d4e643c, 0x3da791ea, 0x85bff013, 0xb6a956ef, 0xd73de6a3, 0x86c629a8, 0x6b8c48a9, 0x0a5a5f55},
+       {0x49c6284a, 0x9ba6aa00, 0xeacbdc63, 0x0b8429fb, 0xedafdf37, 0x9b9c6c5b, 0xad0c78c6, 0x009907e8},
+       {0x3e47b53f, 0x50380ce2, 0x3a9613fc, 0x6ea3c2d3, 0x4c87ab50, 0xfe743105, 0xd192221c, 0x07871979},
+       {0xe978594b, 0x4ddd3320, 0x3abe3f79, 0xe5f36fbe, 0xe4dcff8e, 0x5dba9ef2, 0x7105148f, 0x0bfc27e2},
+       {0x498fb549, 0xd5993cd5, 0x09da9272, 0x718adcee, 0x72bd5bc0, 0x9e03cbb4, 0xc592813f, 0x07206942},
+       {0x78fd3239, 0xaf29730b, 0x40c3e723, 0xbd907ac9, 0x77f214f7, 0x5dcc0aad, 0xb05fb3a1, 0x02d958da},
+       {0xdf80223d, 0x55f432c9, 0x11a2fed9, 0x23daf2f6, 0x41ae8c34, 0x9e43e003, 0x95f22373, 0x0d51533b},
+       {0x7998b62c, 0xbb53132b, 0x22c9b4aa, 0x064a9186, 0x71d61334, 0xd56de253, 0x04e416f6, 0x10fcf25f},
+       {0xdddb58ec, 0x41f8042f, 0x10886d85, 0x7dd54384, 0x622ff4b4, 0x19544f90, 0x050cc539, 0x02f0b49a},
+       {0xa39b02a3, 0x8a3de898, 0xdc94422c, 0x068b2992, 0xf493db31, 0x1c5f019a, 0x11b0f668, 0x066b1790},
+       {0x78500f1a, 0x98310dd7, 0x735ccb27, 0x1c6050bf, 0xb2081df4, 0x07b6fa7f, 0xfa0f1e20, 0x003edf24},
+       {0x89b0ca6f, 0xb4d938e2, 0x2c897570, 0x0214eb59, 0x2d4cf27a, 0x56c45327, 0x3ed546a4, 0x10a2f358},
+       {0xef01ed78, 0xf2828212, 0xf103c9ca, 0xa66094ac, 0x7a2d5573, 0xdceb481d, 0x8af46aab, 0x0190fcde},
+       {0x526bf9fc, 0x023031cc, 0x79c209ba, 0x0e4136c0, 0x3ec42e5c, 0xe5234df1, 0x1d455234, 0x00cb9592},
+       {0x33bf2a1c, 0x842b0c9c, 0xa29b9236, 0x1fd43c95, 0xc06795d3, 0x6b37a603, 0x0c1b712a, 0x00017b17},
+       {0xaf858193, 0x2b955be2, 0x5fb5e378, 0xa513d8be, 0xa326aeb9, 0x88c4ebeb, 0xf3d45990, 0x00c378e2},
+       {0x6464580f, 0x33e6c8c0, 0x3c4aa09f, 0x9d560eb3, 0xcc98f404, 0xb3f1a899, 0x8ca24b48, 0x012c1ea5},
+       {0xe3b4dc56, 0xa0594a67, 0x91b698e1, 0xc8e6b582, 0x8df78057, 0x711cadbf, 0x396466f8, 0x0049abdf},
+       {0x4ffa086a, 0xecc89610, 0xca06afc6, 0x4db82291, 0x8f3a6426, 0x9ae7c68c, 0x2a874432, 0x0b3dae8c},
+       {0x3b3625b6, 0x1e62401f, 0x28471e5a, 0xd0692164, 0x5cad6b77, 0xb85aa9ec, 0xaa95acf2, 0x063e4b66},
+       {0xb9112c51, 0x2542c2b2, 0x6e23b3ce, 0x36ead8da, 0x76476754, 0x9a268d13, 0xa1ad7cf1, 0x121f44ad}}};
+
+    static constexpr storage_array<omegas_count, limbs_count> inv = { // rows 0 and 1 are 1/2 and 1/4 mod modulus; NOTE(review): presumably row k is 2^-(k+1) - confirm
+      {{0x00000001, 0x8508c000, 0x68000000, 0xacd53b7f, 0x2e1bd800, 0x305a268f, 0x4d1652ab, 0x0955b2af},
+       {0x00000001, 0xc78d2000, 0x1c000000, 0x033fd93f, 0xc529c401, 0xc88739d6, 0xf3a17c00, 0x0e008c06},
+       {0x00000001, 0xe8cf5000, 0xf6000000, 0x2e75281e, 0x90b0ba01, 0x949dc37a, 0xc6e710ab, 0x1055f8b2},
+       {0x00000001, 0xf9706800, 0xe3000000, 0x440fcf8e, 0x76743501, 0xfaa9084c, 0xb089db00, 0x1180af08},
+       {0x00000001, 0x01c0f400, 0xd9800001, 0x4edd2346, 0x6955f281, 0xadaeaab5, 0xa55b402b, 0x12160a33},
+       {0x00000001, 0x05e93a00, 0xd4c00001, 0x5443cd22, 0xe2c6d141, 0x07317be9, 0x1fc3f2c1, 0x1260b7c9},
+       {0x00000001, 0x07fd5d00, 0xd2600001, 0x56f72210, 0x1f7f40a1, 0xb3f2e484, 0xdcf84c0b, 0x12860e93},
+       {0x00000001, 0x09076e80, 0xd1300001, 0x5850cc87, 0x3ddb7851, 0x0a5398d1, 0x3b9278b1, 0x1298b9f9},
+       {0x00000001, 0x098c7740, 0x50980001, 0x58fda1c3, 0xcd099429, 0xb583f2f7, 0xeadf8f03, 0x12a20fab},
+       {0x00000001, 0x09cefba0, 0x104c0001, 0x59540c61, 0x14a0a215, 0x0b1c200b, 0x42861a2d, 0x12a6ba85},
+       {0x00000001, 0x09f03dd0, 0xf0260001, 0x597f41af, 0xb86c290b, 0xb5e83694, 0xee595fc1, 0x12a90ff1},
+       {0x00000001, 0x0a00dee8, 0x60130001, 0x5994dc57, 0x8a51ec86, 0x0b4e41d9, 0x4443028c, 0x12aa3aa8},
+       {0x00000001, 0x0a092f74, 0x18098001, 0xd99fa9ab, 0xf344ce43, 0x3601477b, 0x6f37d3f1, 0x12aad003},
+       {0x00000001, 0x0a0d57ba, 0xf404c001, 0x99a51054, 0x27be3f22, 0xcb5aca4d, 0x04b23ca3, 0x12ab1ab1},
+       {0x00000001, 0x0a0f6bdd, 0xe2026001, 0xf9a7c3a9, 0xc1faf791, 0x16078bb5, 0xcf6f70fd, 0x12ab4007},
+       {0x80000001, 0x0a1075ee, 0x59013001, 0xa9a91d54, 0x0f1953c9, 0xbb5dec6a, 0x34ce0b29, 0x12ab52b3},
+       {0x40000001, 0x0a10faf7, 0x94809801, 0x81a9ca29, 0x35a881e5, 0x0e091cc4, 0xe77d5840, 0x12ab5c08},
+       {0xa0000001, 0x0a113d7b, 0x32404c01, 0x6daa2094, 0x48f018f3, 0x375eb4f1, 0xc0d4fecb, 0x12ab60b3},
+       {0xd0000001, 0x0a115ebd, 0x81202601, 0x63aa4bc9, 0xd293e47a, 0xcc098107, 0x2d80d210, 0x12ab6309},
+       {0xe8000001, 0x0a116f5e, 0x28901301, 0xdeaa6164, 0x1765ca3d, 0x965ee713, 0xe3d6bbb3, 0x12ab6433},
+       {0x74000001, 0x0a1177af, 0x7c480981, 0x9c2a6c31, 0xb9cebd1f, 0xfb899a18, 0x3f01b084, 0x12ab64c9},
+       {0xba000001, 0x0a117bd7, 0x262404c1, 0x7aea7198, 0x8b033690, 0xae1ef39b, 0xec972aed, 0x12ab6513},
+       {0xdd000001, 0x0a117deb, 0x7b120261, 0xea4a744b, 0xf39d7348, 0x0769a05c, 0x4361e822, 0x12ab6539},
+       {0xee800001, 0x0a117ef5, 0x25890131, 0x21fa75a5, 0xa7ea91a5, 0x340ef6bd, 0xeec746bc, 0x12ab654b},
+       {0xf7400001, 0x0a117f7a, 0xfac48099, 0x3dd27651, 0x021120d3, 0x4a61a1ee, 0x4479f609, 0x12ab6555},
+       {0x7ba00001, 0x0a117fbd, 0x6562404d, 0x4bbe76a8, 0x2f24686a, 0xd58af786, 0xef534daf, 0x12ab6559},
+       {0xbdd00001, 0x0a117fde, 0x9ab12027, 0xd2b476d3, 0x45ae0c35, 0x1b1fa252, 0x44bff983, 0x12ab655c},
+       {0x5ee80001, 0x0a117fef, 0x35589014, 0x962f76e9, 0x50f2de1b, 0xbde9f7b8, 0x6f764f6c, 0x12ab655d},
+       {0xaf740001, 0x8a117ff7, 0x02ac480a, 0x77ecf6f4, 0x5695470e, 0x8f4f226b, 0x04d17a61, 0x12ab655e},
+       {0xd7ba0001, 0xca117ffb, 0x69562405, 0xe8cbb6f9, 0xd9667b87, 0xf801b7c4, 0x4f7f0fdb, 0x12ab655e},
+       {0xebdd0001, 0x6a117ffd, 0x1cab1203, 0xa13b16fc, 0x9acf15c4, 0x2c5b0271, 0x74d5da99, 0x12ab655e},
+       {0xf5ee8001, 0x3a117ffe, 0x76558902, 0xfd72c6fd, 0xfb8362e2, 0xc687a7c7, 0x87813ff7, 0x12ab655e},
+       {0x7af74001, 0xa2117fff, 0x232ac481, 0x2b8e9efe, 0x2bdd8972, 0x139dfa73, 0x90d6f2a7, 0x12ab655e},
+       {0xbd7ba001, 0x56117fff, 0x79956241, 0xc29c8afe, 0xc40a9cb9, 0xba2923c8, 0x9581cbfe, 0x12ab655e},
+       {0xdebdd001, 0x30117fff, 0xa4cab121, 0x8e2380fe, 0x9021265d, 0x8d6eb873, 0x97d738aa, 0x12ab655e},
+       {0xef5ee801, 0x1d117fff, 0xba655891, 0x73e6fbfe, 0xf62c6b2f, 0x771182c8, 0x9901ef00, 0x12ab655e},
+       {0xf7af7401, 0x13917fff, 0xc532ac49, 0x66c8b97e, 0xa9320d98, 0x6be2e7f3, 0x99974a2b, 0x12ab655e},
+       {0xfbd7ba01, 0x0ed17fff, 0xca995625, 0xe039983e, 0x02b4decc, 0xe64b9a89, 0x99e1f7c0, 0x12ab655e},
+       {0xfdebdd01, 0x0c717fff, 0xcd4cab13, 0x1cf2079e, 0xaf764767, 0xa37ff3d3, 0x9a074e8b, 0x12ab655e},
+       {0xfef5ee81, 0x0b417fff, 0xcea6558a, 0x3b4e3f4e, 0x05d6fbb4, 0x021a2079, 0x9a19f9f1, 0x12ab655e},
+       {0xff7af741, 0x8aa97fff, 0xcf532ac5, 0xca7c5b26, 0xb10755da, 0xb16736cb, 0x9a234fa3, 0x12ab655e},
+       {0xffbd7ba1, 0x4a5d7fff, 0xcfa99563, 0x12136912, 0x069f82ee, 0x090dc1f5, 0x9a27fa7d, 0x12ab655e},
+       {0xffdebdd1, 0x2a377fff, 0xcfd4cab2, 0xb5def008, 0xb16b9977, 0xb4e10789, 0x9a2a4fe9, 0x12ab655e},
+       {0xffef5ee9, 0x9a247fff, 0xcfea6559, 0x87c4b383, 0x06d1a4bc, 0x0acaaa54, 0x9a2b7aa0, 0x12ab655e},
+       {0xfff7af75, 0x521affff, 0x4ff532ad, 0xf0b79541, 0x3184aa5e, 0x35bf7bb9, 0x9a2c0ffb, 0x12ab655e},
+       {0xfffbd7bb, 0x2e163fff, 0x0ffa9957, 0x25310620, 0xc6de2d30, 0xcb39e46b, 0x9a2c5aa8, 0x12ab655e},
+       {0xfffdebde, 0x1c13dfff, 0x6ffd4cac, 0xbf6dbe8f, 0x118aee98, 0x95f718c5, 0x9a2c7fff, 0x12ab655e}}};
+  };
+} // namespace bls12_377
+
+#endif
--- a/icicle/include/fields/snark_fields/bls12_381_base.cuh
+++ b/icicle/include/fields/snark_fields/bls12_381_base.cuh
@@ -0,0 +1,61 @@
+#pragma once
+#ifndef BLS12_381_BASE_PARAMS_H
+#define BLS12_381_BASE_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bls12_381 {
+  struct fq_config { // BLS12-381 base-field constants; multi-limb values list the least-significant limb first
+    static constexpr unsigned limbs_count = 12; // limbs per element (the limb literals below are 32-bit)
+    static constexpr unsigned modulus_bit_count = 381; // bit length of the modulus p
+    static constexpr unsigned num_of_reductions = 1;
+    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
+                                                     0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+                                                     0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea}; // p
+    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
+                                                       0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
+                                                       0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4}; // 2 * p
+    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
+                                                       0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
+                                                       0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8}; // 4 * p
+    static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
+                                                         0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
+                                                         0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15}; // 2^384 - p
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
+      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; // p, widened
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
+      0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
+      0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4}; // p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
+      0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
+      0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49}; // 2 * p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
+      0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
+      0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92}; // 4 * p^2
+    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
+                                               0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
+                                                          0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
+                                                          0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
+                                                              0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
+                                                              0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
+    // absolute value of the nonresidue that generates the extension field (the square of its imaginary unit)
+    static constexpr uint32_t nonresidue = 1;
+    // true if the nonresidue is negative, i.e. the value actually used is -nonresidue
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace bls12_381
+
+#endif
--- a/icicle/include/fields/snark_fields/bls12_381_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bls12_381_scalar.cuh
@@ -1,8 +1,8 @@
 #pragma once
-#ifndef BLS12_381_PARAMS_H
-#define BLS12_381_PARAMS_H
+#ifndef BLS12_381_SCALAR_PARAMS_H
+#define BLS12_381_SCALAR_PARAMS_H

-#include "../utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bls12_381 {
  struct fp_config {
@@ -145,88 +145,6 @@ namespace bls12_381 {
       {0x00000003, 0x00034801, 0x588313f9, 0x4079f3f8, 0xa32e27f5, 0xdffedd77, 0x41c22ea1, 0x73eda752},
       {0x00000002, 0x0001a400, 0xac40b7fc, 0x4a1bcbfd, 0xd667fffd, 0x099c5abf, 0xb5afd5f5, 0x73eda752}}};
  };
-
-  struct fq_config {
-    static constexpr unsigned limbs_count = 12;
-    static constexpr unsigned modulus_bit_count = 381;
-    static constexpr unsigned num_of_reductions = 1;
-    static constexpr storage<limbs_count> modulus = {0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe,
-                                                     0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
-                                                     0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea};
-    static constexpr storage<limbs_count> modulus_2 = {0xffff5556, 0x73fdffff, 0x62a7ffff, 0x3d57fffd,
-                                                       0xed61ec48, 0xce61a541, 0xe70a257e, 0xc8ee9709,
-                                                       0x869759ae, 0x96374f6c, 0x72ffcd34, 0x340223d4};
-    static constexpr storage<limbs_count> modulus_4 = {0xfffeaaac, 0xe7fbffff, 0xc54ffffe, 0x7aaffffa,
-                                                       0xdac3d890, 0x9cc34a83, 0xce144afd, 0x91dd2e13,
-                                                       0x0d2eb35d, 0x2c6e9ed9, 0xe5ff9a69, 0x680447a8};
-    static constexpr storage<limbs_count> neg_modulus = {0x00005555, 0x46010000, 0x4eac0000, 0xe1540001,
-                                                         0x094f09db, 0x98cf2d5f, 0x0c7aed40, 0x9b88b47b,
-                                                         0xbcb45328, 0xb4e45849, 0xc6801965, 0xe5feee15};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0xffffaaab, 0xb9feffff, 0xb153ffff, 0x1eabfffe, 0xf6b0f624, 0x6730d2a0, 0xf38512bf, 0x64774b84,
-      0x434bacd7, 0x4b1ba7b6, 0x397fe69a, 0x1a0111ea, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x1c718e39, 0x26aa0000, 0x76382eab, 0x7ced6b1d, 0x62113cfd, 0x162c3383, 0x3e71b743, 0x66bf91ed,
-      0x7091a049, 0x292e85a8, 0x86185c7b, 0x1d68619c, 0x0978ef01, 0xf5314933, 0x16ddca6e, 0x50a62cfd,
-      0x349e8bd0, 0x66e59e49, 0x0e7046b4, 0xe2dc90e5, 0xa22f25e9, 0x4bd278ea, 0xb8c35fc7, 0x02a437a4};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x38e31c72, 0x4d540000, 0xec705d56, 0xf9dad63a, 0xc42279fa, 0x2c586706, 0x7ce36e86, 0xcd7f23da,
-      0xe1234092, 0x525d0b50, 0x0c30b8f6, 0x3ad0c339, 0x12f1de02, 0xea629266, 0x2dbb94dd, 0xa14c59fa,
-      0x693d17a0, 0xcdcb3c92, 0x1ce08d68, 0xc5b921ca, 0x445e4bd3, 0x97a4f1d5, 0x7186bf8e, 0x05486f49};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x71c638e4, 0x9aa80000, 0xd8e0baac, 0xf3b5ac75, 0x8844f3f5, 0x58b0ce0d, 0xf9c6dd0c, 0x9afe47b4,
-      0xc2468125, 0xa4ba16a1, 0x186171ec, 0x75a18672, 0x25e3bc04, 0xd4c524cc, 0x5b7729bb, 0x4298b3f4,
-      0xd27a2f41, 0x9b967924, 0x39c11ad1, 0x8b724394, 0x88bc97a7, 0x2f49e3aa, 0xe30d7f1d, 0x0a90de92};
-    static constexpr storage<limbs_count> m = {0xd59646e8, 0xec4f881f, 0x8163c701, 0x4e65c59e, 0x80a19de7, 0x2f7d1dc7,
-                                               0x7fda82a5, 0xa46e09d0, 0x331e9ae8, 0x38a0406c, 0xcf327917, 0x2760d74b};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0x0002fffd, 0x76090000, 0xc40c0002, 0xebf4000b,
-                                                          0x53c758ba, 0x5f489857, 0x70525745, 0x77ce5853,
-                                                          0xa256ec6d, 0x5c071a97, 0xfa80e493, 0x15f65ec3};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x380b4820, 0xf4d38259, 0xd898fafb, 0x7fe11274,
-                                                              0x14956dc8, 0x343ea979, 0x58a88de9, 0x1797ab14,
-                                                              0x3c4f538b, 0xed5e6427, 0xe8fb0ce9, 0x14fec701};
-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 1;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
-  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0xdb22c6bb, 0xfb3af00a, 0xf97a1aef, 0x6c55e83f,
-                                                               0x171bac58, 0xa14e3a3f, 0x9774b905, 0xc3688c4f,
-                                                               0x4fa9ac0f, 0x2695638c, 0x3197d794, 0x17f1d3a7};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x46c5e7e1, 0x0caa2329, 0xa2888ae4, 0xd03cc744,
-                                                               0x2c04b3ed, 0x00db18cb, 0xd5d00af6, 0xfcf5e095,
-                                                               0x741d8ae4, 0xa09e30ed, 0xe3aaa0f1, 0x08b3f481};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xc121bdb8, 0xd48056c8, 0xa805bbef, 0x0bac0326,
-                                                                  0x7ae3d177, 0xb4510b64, 0xfa403b02, 0xc6e47ad4,
-                                                                  0x2dc51051, 0x26080527, 0xf08f0a91, 0x024aa2b2};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0x5d042b7e, 0xe5ac7d05, 0x13945d57, 0x334cf112,
-                                                                  0xdc7f5049, 0xb5da61bb, 0x9920b61a, 0x596bd0d0,
-                                                                  0x88274f65, 0x7dacd3a0, 0x52719f60, 0x13e02b60};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x08b82801, 0xe1935486, 0x3baca289, 0x923ac9cc,
-                                                                  0x5160d12c, 0x6d429a69, 0x8cbdd3a7, 0xadfd9baa,
-                                                                  0xda2e351a, 0x8cc9cdc6, 0x727d6e11, 0x0ce5d527};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xf05f79be, 0xaaa9075f, 0x5cec1da1, 0x3f370d27,
-                                                                  0x572e99ab, 0x267492ab, 0x85a763af, 0xcb3e287e,
-                                                                  0x2bc28b99, 0x32acd2b0, 0x2ea734cc, 0x0606c4a0};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
 } // namespace bls12_381

 #endif
--- a/icicle/include/fields/snark_fields/bn254_base.cuh
+++ b/icicle/include/fields/snark_fields/bn254_base.cuh
@@ -0,0 +1,49 @@
+#pragma once
+#ifndef BN254_BASE_PARAMS_H
+#define BN254_BASE_PARAMS_H
+
+#include "fields/storage.cuh"
+
+namespace bn254 {
+  struct fq_config { // BN254 base-field constants; multi-limb values list the least-significant limb first
+    static constexpr unsigned limbs_count = 8; // limbs per element (the limb literals below are 32-bit)
+    static constexpr unsigned modulus_bit_count = 254; // bit length of the modulus p
+    static constexpr unsigned num_of_reductions = 1;
+    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
+                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; // p
+    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
+                                                       0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5}; // 2 * p
+    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
+                                                       0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb}; // 4 * p
+    static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
+                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d}; // 2^256 - p
+    static constexpr storage<2 * limbs_count> modulus_wide = {
+      0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
+      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; // p, widened
+    static constexpr storage<2 * limbs_count> modulus_squared = {
+      0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
+      0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8}; // p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
+      0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
+      0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970}; // 2 * p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
+      0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
+      0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1}; // 4 * p^2
+    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
+                                               0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
+    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
+                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
+                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
+                                                              0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
+    // absolute value of the nonresidue that generates the extension field (the square of its imaginary unit)
+    static constexpr uint32_t nonresidue = 1;
+    // true if the nonresidue is negative, i.e. the value actually used is -nonresidue
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace bn254
+
+#endif
--- a/icicle/include/fields/snark_fields/bn254_scalar.cuh
+++ b/icicle/include/fields/snark_fields/bn254_scalar.cuh
@@ -1,8 +1,8 @@
 #pragma once
-#ifndef BN254_PARAMS_H
-#define BN254_PARAMS_H
+#ifndef BN254_SCALAR_PARAMS_H
+#define BN254_SCALAR_PARAMS_H

-#include "../utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bn254 {
  struct fp_config {
@@ -133,67 +133,6 @@ namespace bn254 {
       {0x73c14d83, 0x0cb3e36b, 0x733c6782, 0xf808dca3, 0x7778a18c, 0x921c407f, 0xd4a7d1cd, 0x30644e6c},
       {0xb1e0a6c2, 0xa84aec7f, 0xf67aec09, 0x101e6275, 0xfc7cfcf5, 0xa536431a, 0xdaecb8fb, 0x30644e6f}}};
  };
-
-  struct fq_config {
-    static constexpr unsigned limbs_count = 8;
-    static constexpr unsigned modulus_bit_count = 254;
-    static constexpr unsigned num_of_reductions = 1;
-    static constexpr storage<limbs_count> modulus = {0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91,
-                                                     0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72};
-    static constexpr storage<limbs_count> modulus_2 = {0xb0f9fa8e, 0x7841182d, 0xd0e3951a, 0x2f02d522,
-                                                       0x0302b0bb, 0x70a08b6d, 0xc2634053, 0x60c89ce5};
-    static constexpr storage<limbs_count> modulus_4 = {0x61f3f51c, 0xf082305b, 0xa1c72a34, 0x5e05aa45,
-                                                       0x06056176, 0xe14116da, 0x84c680a6, 0xc19139cb};
-    static constexpr storage<limbs_count> neg_modulus = {0x278302b9, 0xc3df73e9, 0x978e3572, 0x687e956e,
-                                                         0x7e7ea7a2, 0x47afba49, 0x1ece5fd6, 0xcf9bb18d};
-    static constexpr storage<2 * limbs_count> modulus_wide = {
-      0xd87cfd47, 0x3c208c16, 0x6871ca8d, 0x97816a91, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72,
-      0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<2 * limbs_count> modulus_squared = {
-      0x275d69b1, 0x3b5458a2, 0x09eac101, 0xa602072d, 0x6d96cadc, 0x4a50189c, 0x7a1242c8, 0x04689e95,
-      0x34c6b38d, 0x26edfa5c, 0x16375606, 0xb00b8551, 0x0348d21c, 0x599a6f7c, 0x763cbf9c, 0x0925c4b8};
-    static constexpr storage<2 * limbs_count> modulus_squared_2 = {
-      0x4ebad362, 0x76a8b144, 0x13d58202, 0x4c040e5a, 0xdb2d95b9, 0x94a03138, 0xf4248590, 0x08d13d2a,
-      0x698d671a, 0x4ddbf4b8, 0x2c6eac0c, 0x60170aa2, 0x0691a439, 0xb334def8, 0xec797f38, 0x124b8970};
-    static constexpr storage<2 * limbs_count> modulus_squared_4 = {
-      0x9d75a6c4, 0xed516288, 0x27ab0404, 0x98081cb4, 0xb65b2b72, 0x29406271, 0xe8490b21, 0x11a27a55,
-      0xd31ace34, 0x9bb7e970, 0x58dd5818, 0xc02e1544, 0x0d234872, 0x6669bdf0, 0xd8f2fe71, 0x249712e1};
-    static constexpr storage<limbs_count> m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17,
-                                               0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462};
-    static constexpr storage<limbs_count> one = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                 0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                                                  0x00000000, 0x00000000, 0x00000000, 0x00000000};
-    static constexpr storage<limbs_count> montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0x0a78eb28,
-                                                          0x7879462c, 0x666ea36f, 0x9a07df2f, 0x0e0a77c1};
-    static constexpr storage<limbs_count> montgomery_r_inv = {0x014afa37, 0xed84884a, 0x0278edf8, 0xeb202285,
-                                                              0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571};
-    // i^2, the square of the imaginary unit for the extension field
-    static constexpr uint32_t i_squared = 1;
-    // true if i^2 is negative
-    static constexpr bool i_squared_is_negative = true;
-  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {0x00000001, 0x00000000, 0x00000000, 0x00000000,
-                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {0x00000002, 0x00000000, 0x00000000, 0x00000000,
-                                                               0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_re = {0xd992f6ed, 0x46debd5c, 0xf75edadd, 0x674322d4,
-                                                                  0x5e5c4479, 0x426a0066, 0x121f1e76, 0x1800deef};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x_im = {0xaef312c2, 0x97e485b7, 0x35a9e712, 0xf1aa4933,
-                                                                  0x31fb5d25, 0x7260bfb7, 0x920d483a, 0x198e9393};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_re = {0x66fa7daa, 0x4ce6cc01, 0x0c43d37b, 0xe3d1e769,
-                                                                  0x8dcb408f, 0x4aab7180, 0xdb8c6deb, 0x12c85ea5};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y_im = {0xd122975b, 0x55acdadc, 0x70b38ef3, 0xbc4b3133,
-                                                                  0x690c3395, 0xec9e99ad, 0x585ff075, 0x090689d0};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000,
-                                                                    0x00000000, 0x00000000, 0x00000000, 0x00000000};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_re = {
-    0x24a138e5, 0x3267e6dc, 0x59dbefa3, 0xb5b4c5e5, 0x1be06ac3, 0x81be1899, 0xceb8aaae, 0x2b149d40};
-  static constexpr storage<fq_config::limbs_count> weierstrass_b_g2_im = {
-    0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 0xcd2cafad, 0x3af0fed4, 0x009713b0};
 } // namespace bn254

 #endif
--- a/icicle/include/fields/snark_fields/bw6_761_base.cuh
+++ b/icicle/include/fields/snark_fields/bw6_761_base.cuh
@@ -1,8 +1,8 @@
 #pragma once
-#ifndef BW6_761_PARAMS_H
-#define BW6_761_PARAMS_H
+#ifndef BW6_761_BASE_PARAMS_H
+#define BW6_761_BASE_PARAMS_H

-#include "../utils/storage.cuh"
+#include "fields/storage.cuh"

 namespace bw6_761 {
  struct fq_config {
@@ -74,33 +74,6 @@ namespace bw6_761 {
      0x7695ef18, 0x5e763565, 0x4fae56bb, 0x226022c2, 0xb70d7652, 0x80e7f067, 0x72116b89, 0x435a8b4a,
      0x5d84e0d4, 0xac258fd6, 0x4427c7b2, 0x47ee8ac5, 0xd04e621b, 0x478c4048, 0x2add3e93, 0x00e0aa7d};
  };
-
-  // G1 and G2 generators
-  static constexpr storage<fq_config::limbs_count> g1_gen_x = {
-    0x66e5b43d, 0x4088f3af, 0xa6af603f, 0x055928ac, 0x56133e82, 0x6750dd03, 0x280ca27f, 0x03758f9a,
-    0xc9ea0971, 0x5bd71fa0, 0x47729b90, 0xa17a54ce, 0x94c2e746, 0x11dbfcd2, 0xc15520ac, 0x79017ffa,
-    0x85f56fc7, 0xee05c54b, 0x551b27f0, 0xe6a0cfb7, 0xa477beae, 0xb277ce98, 0x0ea190c8, 0x01075b02};
-  static constexpr storage<fq_config::limbs_count> g1_gen_y = {
-    0xb4e95363, 0xbafc8f2d, 0x0b20d2a1, 0xad1cb2be, 0xcad0fb93, 0xb2b08119, 0xb3053253, 0x9f9df141,
-    0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1,
-    0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e};
-  static constexpr storage<fq_config::limbs_count> g2_gen_x = {
-    0xcd025f1c, 0xa830c194, 0xe1bf995b, 0x6410cf4f, 0xc2ad54b0, 0x00e96efb, 0x3cd208d7, 0xce6948cb,
-    0x00e1b6ba, 0x963317a3, 0xac70e7c7, 0xc5bbcae9, 0xf09feb58, 0x734ec3f1, 0xab3da268, 0x26b41c5d,
-    0x13890f6d, 0x4c062010, 0xc5a7115f, 0xd61053aa, 0x69d660f9, 0xc852a82e, 0x41d9b816, 0x01101332};
-  static constexpr storage<fq_config::limbs_count> g2_gen_y = {
-    0x28c73b61, 0xeb70a167, 0xf9eac689, 0x91ec0594, 0x3c5a02a5, 0x58aa2d3a, 0x504affc7, 0x3ea96fcd,
-    0xffa82300, 0x8906c170, 0xd2c712b8, 0x64f293db, 0x33293fef, 0x94c97eb7, 0x0b95a59c, 0x0a1d86c8,
-    0x53ffe316, 0x81a78e27, 0xcec2181c, 0x26b7cf9a, 0xe4b6d2dc, 0x8179eb10, 0x7761369f, 0x0017c335};
-
-  static constexpr storage<fq_config::limbs_count> weierstrass_b = {
-    0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2,
-    0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638,
-    0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824};
-  static constexpr storage<fq_config::limbs_count> g2_weierstrass_b = {
-    0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000};
 } // namespace bw6_761

-#endif
+#endif
--- a/icicle/include/fields/stark_fields/baby_bear.cuh
+++ b/icicle/include/fields/stark_fields/baby_bear.cuh
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "fields/storage.cuh"
+#include "fields/field.cuh"
+
+namespace baby_bear {
+  struct fp_config { // Baby Bear field constants, p = 0x78000001 = 15 * 2^27 + 1, stored in a single 32-bit limb
+    static constexpr unsigned limbs_count = 1;
+    static constexpr unsigned omegas_count = 27; // p - 1 = 15 * 2^27: no roots of unity beyond order 2^27 (27 entries)
+    static constexpr unsigned modulus_bit_count = 31;
+    static constexpr unsigned num_of_reductions = 1;
+
+    static constexpr storage<limbs_count> modulus = {0x78000001};   // p
+    static constexpr storage<limbs_count> modulus_2 = {0xf0000002}; // 2 * p
+    static constexpr storage<limbs_count> modulus_4 = {0x00000000}; // NOTE(review): 4 * p overflows one limb - confirm unused
+    static constexpr storage<limbs_count> neg_modulus = {0x87ffffff}; // 2^32 - p
+    static constexpr storage<2 * limbs_count> modulus_wide = {0x78000001, 0x00000000};      // p, widened
+    static constexpr storage<2 * limbs_count> modulus_squared = {0xf0000001, 0x38400000};   // p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_2 = {0xe0000002, 0x70800001}; // 2 * p^2
+    static constexpr storage<2 * limbs_count> modulus_squared_4 = {0xc0000004, 0xe1000003}; // 4 * p^2
+
+    static constexpr storage<limbs_count> m = {0x88888887};
+    static constexpr storage<limbs_count> one = {0x00000001};
+    static constexpr storage<limbs_count> zero = {0x00000000};
+    static constexpr storage<limbs_count> montgomery_r = {0x0ffffffe};     // 2^32 mod p
+    static constexpr storage<limbs_count> montgomery_r_inv = {0x38400000}; // inverse of 2^32 mod p
+    // roots of unity: omega[0] = p - 1 (order 2); presumably omega[i] has order 2^(i+1) - TODO confirm
+    static constexpr storage_array<omegas_count, limbs_count> omega = {
+      {{0x78000000}, {0x10faa3e0}, {0x6b615c47}, {0x21ceed5a}, {0x2c1c3348}, {0x36c54c86}, {0x701dd01c},
+       {0x56a9a28e}, {0x03e4cabf}, {0x5bacde79}, {0x1eb53838}, {0x1cd781af}, {0x0961a0b7}, {0x65098a87},
+       {0x77851a0b}, {0x5bcba331}, {0x053fc0f5}, {0x5bf816e5}, {0x4bb124ab}, {0x571e9d4e}, {0x313732cb},
+       {0x28aca172}, {0x4e319b52}, {0x45692d95}, {0x14ff4ba1}, {0x00004951}, {0x00000089}}};
+    // presumably the multiplicative inverses of the omega entries, index for index - TODO confirm
+    static constexpr storage_array<omegas_count, limbs_count> omega_inv = {
+      {{0x78000000}, {0x67055c21}, {0x5ee99486}, {0x0bb4c4e4}, {0x4ab33b27}, {0x044b4497}, {0x410e23aa},
+       {0x08a7ee2b}, {0x563cb93d}, {0x3d70b4b7}, {0x77d999f1}, {0x6ceb65b5}, {0x49e7f635}, {0x0eae3a8c},
+       {0x238b8a78}, {0x70d71b0a}, {0x0eaacc45}, {0x5af0f193}, {0x47303308}, {0x573cbfad}, {0x29ff72c0},
+       {0x05af9dac}, {0x00ef24df}, {0x26985530}, {0x22d1ce4b}, {0x08359375}, {0x2cabe994}}};
+    // inv[i] = 2^-(i+1) mod p: inv[0] = (p + 1) / 2, ..., inv[26] = p - 15
+    static constexpr storage_array<omegas_count, limbs_count> inv = {
+      {{0x3c000001}, {0x5a000001}, {0x69000001}, {0x70800001}, {0x74400001}, {0x76200001}, {0x77100001},
+       {0x77880001}, {0x77c40001}, {0x77e20001}, {0x77f10001}, {0x77f88001}, {0x77fc4001}, {0x77fe2001},
+       {0x77ff1001}, {0x77ff8801}, {0x77ffc401}, {0x77ffe201}, {0x77fff101}, {0x77fff881}, {0x77fffc41},
+       {0x77fffe21}, {0x77ffff11}, {0x77ffff89}, {0x77ffffc5}, {0x77ffffe3}, {0x77fffff2}}};
+
+    // absolute value of the nonresidue that generates the extension field
+    static constexpr uint32_t nonresidue = 11;
+    // true if the nonresidue is negative, i.e. the value actually used is -nonresidue.
+    // TODO: plonky3 and risc0 use different nonresidues (11 and -11 respectively); confirm which one is intended here
+    static constexpr bool nonresidue_is_negative = true;
+  };
+} // namespace baby_bear
--- a/icicle/include/fields/storage.cuh
+++ b/icicle/include/fields/storage.cuh
--- a/icicle/include/gpu-utils/device_context.cu
+++ b/icicle/include/gpu-utils/device_context.cu
--- a/icicle/include/gpu-utils/device_context.cuh
+++ b/icicle/include/gpu-utils/device_context.cuh
--- a/icicle/include/gpu-utils/error_handler.cuh
+++ b/icicle/include/gpu-utils/error_handler.cuh
--- a/icicle/include/gpu-utils/modifiers.cuh
+++ b/icicle/include/gpu-utils/modifiers.cuh
@@ -0,0 +1,11 @@
+#if defined(DEVMODE) || defined(DEBUG) // dev/debug builds: no forced inlining and no loop unrolling
+#define INLINE_MACRO
+#define UNROLL
+#else
+#define INLINE_MACRO __forceinline__
+#define UNROLL       _Pragma("unroll") // a macro cannot expand to a #pragma directive; _Pragma is the macro-safe form
+#endif
+
+#define HOST_INLINE        __host__ INLINE_MACRO
+#define DEVICE_INLINE      __device__ INLINE_MACRO
+#define HOST_DEVICE_INLINE __host__ __device__ INLINE_MACRO
--- a/icicle/include/gpu-utils/sharedmem.cuh
+++ b/icicle/include/gpu-utils/sharedmem.cuh
@@ -1,4 +1,3 @@
-// TODO: remove this file, seems working without it
 // based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/
 // may be outdated, but only worked like that

@@ -59,8 +58,6 @@
 #ifndef _SHAREDMEM_H_
 #define _SHAREDMEM_H_

-#include "../curves/curve_config.cuh"
-
 /** @brief Wrapper class for templatized dynamic shared memory arrays.
 *
 * This struct uses template specialization on the type \a T to declare
@@ -78,8 +75,8 @@ struct SharedMemory {
  //! @returns Pointer to runtime-sized shared memory array
  __device__ T* getPointer()
  {
-    extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
-    Error_UnsupportedType();
+    // Guard disabled: un-specialized types now compile and simply get the null pointer returned below.
+    // extern __device__ void Error_UnsupportedType(); Error_UnsupportedType();
    return (T*)0;
  }
  // TODO: Use operator overloading to make this class look like a regular array
@@ -215,24 +212,6 @@ struct SharedMemory<uchar4> {
  }
 };

-template <>
-struct SharedMemory<curve_config::scalar_t> {
-  __device__ curve_config::scalar_t* getPointer()
-  {
-    extern __shared__ curve_config::scalar_t s_scalar_[];
-    return s_scalar_;
-  }
-};
-
-template <>
-struct SharedMemory<curve_config::projective_t> {
-  __device__ curve_config::projective_t* getPointer()
-  {
-    extern __shared__ curve_config::projective_t s_projective_[];
-    return s_projective_;
-  }
-};
-
 #endif //_SHAREDMEM_H_

 // Leave this at the end of the file
--- a/icicle/include/hash/keccak/keccak.cuh
+++ b/icicle/include/hash/keccak/keccak.cuh
@@ -0,0 +1,56 @@
+#pragma once
+#ifndef KECCAK_H
+#define KECCAK_H
+
+#include <cstdint>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+
+namespace keccak {
+  /**
+   * @struct KeccakConfig
+   * Struct that encodes various Keccak parameters.
+   * The struct has no member initializers: the "default values" quoted below are the ones assigned by
+   * default_keccak_config(), not values a default-constructed KeccakConfig would hold.
+   */
+  struct KeccakConfig {
+    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
+    bool are_inputs_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
+    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
+    bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
+                    *   non-blocking and you'd need to synchronize it explicitly by running
+                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
+                    *   function will block the current CPU thread. Default value: false. */
+  };
+
+  /**
+   * Builds a [KeccakConfig](@ref KeccakConfig) bound to the default device context, with inputs and outputs on
+   * host and blocking (non-async) execution.
+   * Declared `inline` because it is defined in a header: otherwise every translation unit that includes this file
+   * emits its own external definition and linking fails with duplicate symbols.
+   * @return The default configuration.
+   */
+  inline KeccakConfig default_keccak_config()
+  {
+    device_context::DeviceContext ctx = device_context::get_default_device_context();
+    KeccakConfig config = {
+      ctx,   // ctx
+      false, // are_inputs_on_device
+      false, // are_outputs_on_device
+      false, // is_async
+    };
+    return config;
+  }
+
+  /**
+   * Compute the keccak hash over a sequence of preimages.
+   * Takes {number_of_blocks * input_block_size} u64s of input and computes {number_of_blocks} outputs, each of size {D
+   * / 64} u64
+   * NOTE(review): the summary above counts in u64 words, while both pointers are `uint8_t*` and `input_block_size`
+   * is documented in bytes -- confirm which unit the implementation expects.
+   * @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
+   * @tparam D - number of bits of output. Only multiples of 64 are supported.
+   * @param input a pointer to the input data. May be allocated on device or on host, regulated
+   * by the config. Must be of size input_block_size * number_of_blocks.
+   * @param input_block_size - size of each input block in bytes. Should be divisible by 8.
+   * @param number_of_blocks number of input and output blocks. One GPU thread processes one block
+   * @param output a pointer to the output data. May be allocated on device or on host, regulated
+   * by the config. Must hold number_of_blocks outputs of D / 64 u64 words (i.e. D / 8 bytes) each.
+   * @param config run-time options, see [KeccakConfig](@ref KeccakConfig). Passed by value.
+   * @return A `cudaError_t` status code (NOTE(review): presumably `cudaSuccess` on success -- the implementation is
+   * not part of this header).
+   */
+  template <int C, int D>
+  cudaError_t
+  keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
+} // namespace keccak
+
+#endif
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -4,12 +4,11 @@

 #include <cuda_runtime.h>

-#include "../../curves/curve_config.cuh"
-#include "../../primitives/affine.cuh"
-#include "../../primitives/field.cuh"
-#include "../../primitives/projective.cuh"
-#include "../../utils/device_context.cuh"
-#include "../../utils/error_handler.cuh"
+#include "curves/affine.cuh"
+#include "curves/projective.cuh"
+#include "fields/field.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"

 /**
 * @namespace msm
@@ -43,14 +42,18 @@ namespace msm {
                              *   variable is set equal to the MSM size. And if every MSM uses a distinct set of
                              *   points, it should be set to the product of MSM size and [batch_size](@ref
                              *   batch_size). Default value: 0 (meaning it's equal to the MSM size). */
-    int precompute_factor;   /**< The number of extra points to pre-compute for each point. Larger values decrease the
+    int precompute_factor;   /**< The number of extra points to pre-compute for each point. See the
+                              *   [PrecomputeMSMBases](@ref PrecomputeMSMBases) function, `precompute_factor` passed
+                              *   there needs to be equal to the one used here. Larger values decrease the
                              *   number of computations to make, on-line memory footprint, but increase the static
                              *   memory footprint. Default value: 1 (i.e. don't pre-compute). */
    int c;                   /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
                              *   method" that we use to solve the MSM problem. As a rule of thumb, larger value
                              *   means more on-line memory footprint but also more parallelism and less computational
-                              *   complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$
-                              *   is chosen automatically). */
+                              *   complexity (up to a certain point). Currently pre-computation is independent of
+                              *   \f$ c \f$; however, in the future the value of \f$ c \f$ here and the one passed to
+                              *   the [PrecomputeMSMBases](@ref PrecomputeMSMBases) function will need to be identical.
+                              *   Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
    int bitsize;             /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
                              *   but if a different (better) upper bound is known, it should be reflected in this
                              *   variable. Default value: 0 (set to the bitsize of scalar field). */
@@ -83,7 +86,7 @@ namespace msm {
   * @return Default value of [MSMConfig](@ref MSMConfig).
   */
  template <typename A>
-  MSMConfig DefaultMSMConfig();
+  MSMConfig DefaultMSMConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context());

  /**
   * A function that computes MSM: \f$ MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i \f$.
@@ -101,12 +104,39 @@ namespace msm {
   * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase.
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   *
-   * **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
-   * effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used).
-   * Also, it's currently better to use `batch_size=1` in most cases (except with dealing with very many MSMs).
   */
  template <typename S, typename A, typename P>
-  cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results);
+  cudaError_t MSM(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
+
+  /**
+   * A function that precomputes MSM bases by extending them with their shifted copies.
+   * e.g.:
+   * Original points: \f$ P_0, P_1, P_2, ... P_{size-1} \f$
+   * Extended points: \f$ P_0, P_1, P_2, ... P_{size-1}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size-1},
+   * 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size-1}, ... \f$
+   * @param bases Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
+   * @param bases_size Number of bases.
+   * @param precompute_factor The number of total precomputed points for each base (including the base itself).
+   * @param _c This is currently unused, but in the future precomputation will need to be aware of
+   * the `c` value used in MSM (see [MSMConfig](@ref MSMConfig)). So to avoid breaking your code with this
+   * upcoming change, make sure to use the same value of `c` in this function and in respective MSMConfig.
+   * @param are_bases_on_device Whether the bases are on device.
+   * @param ctx Device context specifying device id and stream to use.
+   * @param output_bases Device-allocated buffer of size bases_size * precompute_factor for the extended bases.
+   * @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
+   * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
+   * @tparam P Does not appear in the parameter list, so it cannot be deduced and must be specified explicitly
+   * (NOTE(review): presumably the projective point type used while computing the shifted copies -- confirm).
+   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
+   */
+  template <typename A, typename P>
+  cudaError_t PrecomputeMSMBases(
+    A* bases,
+    int bases_size,
+    int precompute_factor,
+    int _c,
+    bool are_bases_on_device,
+    device_context::DeviceContext& ctx,
+    A* output_bases);

 } // namespace msm

--- a/icicle/appUtils/ntt/ntt.cuh
+++ b/icicle/appUtils/ntt/ntt.cuh
@@ -4,10 +4,9 @@

 #include <cuda_runtime.h>

-#include "curves/curve_config.cuh"
-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
-#include "utils/sharedmem.cuh"
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/sharedmem.cuh"
 #include "utils/utils_kernels.cuh"
 #include "utils/utils.h"

@@ -40,6 +39,31 @@ namespace ntt {
  template <typename S>
  cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);

+  /**
+   * Releases and deallocates resources associated with the domain initialized for performing NTTs.
+   * This function should be called to clean up resources once they are no longer needed.
+   * It's important to note that after calling this function, any operation that relies on the released domain will
+   * fail unless InitDomain is called again to reinitialize the resources. Therefore, ensure that ReleaseDomain is
+   * only called when the operations requiring the NTT domain are completely finished and the domain is no longer
+   * needed.
+   * Note that only the domain associated with the device specified by `ctx` is released.
+   * @tparam S The type that was used for the primitive root in [InitDomain](@ref InitDomain). It does not appear in
+   * the parameter list, so it cannot be deduced and must be specified explicitly by the caller.
+   * @param ctx Details related to the device context such as its id and stream id.
+   * @return `cudaSuccess` if the resource release was successful, indicating that the domain and its associated
+   * resources have been properly deallocated. Returns an error code otherwise, indicating failure to release
+   * the resources. The error code can be used to diagnose the problem.
+   * */
+  template <typename S>
+  cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
+
+  /**
+   * Returns the basic root of unity Wn corresponding to the basic root used to initialize the domain.
+   * Useful when computing NTT on cosets. In that case we must use the root W_2n that is between W_n and W_n+1.
+   * @tparam S Type of the returned root. It appears only in the return type, so it cannot be deduced and must be
+   * specified explicitly by the caller.
+   * @param logn log size of the required root.
+   * @param ctx Details related to the device such as its id and stream id.
+   * @return Wn root of unity corresponding to logn and the basic root used for InitDomain(root)
+   */
+  template <typename S>
+  S GetRootOfUnity(uint64_t logn, device_context::DeviceContext& ctx);
+
  /**
   * @enum NTTDir
   * Whether to perform normal forward NTT, or inverse NTT (iNTT). Mathematically, forward NTT computes polynomial
@@ -95,6 +119,8 @@ namespace ntt {
    S coset_gen;                       /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()`
                                        *   (corresponding to no coset being used). */
    int batch_size;                    /**< The number of NTTs to compute. Default value: 1. */
+    bool columns_batch;                /**< True if the batches are the columns of an input matrix
+                                       (they are strided in memory with a stride of ntt size) Default value: false.  */
    Ordering ordering;          /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value:
                                 *   `Ordering::kNN`. */
    bool are_inputs_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
@@ -112,7 +138,8 @@ namespace ntt {
   * @return Default value of [NTTConfig](@ref NTTConfig).
   */
  template <typename S>
-  NTTConfig<S> DefaultNTTConfig();
+  NTTConfig<S>
+  DefaultNTTConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context());

  /**
   * A function that computes NTT or iNTT in-place. It's necessary to call [InitDomain](@ref InitDomain) with an
@@ -132,7 +159,7 @@ namespace ntt {
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename S, typename E>
-  cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
+  cudaError_t NTT(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);

 } // namespace ntt

--- a/icicle/appUtils/ntt/ntt_impl.cuh
+++ b/icicle/appUtils/ntt/ntt_impl.cuh
@@ -3,9 +3,9 @@
 #define _NTT_IMPL_H

 #include <stdint.h>
-#include "appUtils/ntt/ntt.cuh" // for enum Ordering
+#include "ntt/ntt.cuh" // for enum Ordering

-namespace ntt {
+namespace mxntt {

  template <typename S>
  cudaError_t generate_external_twiddles_generic(
@@ -27,7 +27,7 @@ namespace ntt {

  template <typename E, typename S>
  cudaError_t mixed_radix_ntt(
-    E* d_input,
+    const E* d_input,
    E* d_output,
    S* external_twiddles,
    S* internal_twiddles,
@@ -35,12 +35,13 @@ namespace ntt {
    int ntt_size,
    int max_logn,
    int batch_size,
+    bool columns_batch,
    bool is_inverse,
    bool fast_tw,
-    Ordering ordering,
+    ntt::Ordering ordering,
    S* arbitrary_coset,
    int coset_gen_index,
    cudaStream_t cuda_stream);

-} // namespace ntt
+} // namespace mxntt
 #endif //_NTT_IMPL_H
--- a/icicle/include/polynomials/cuda_backend/polynomial_cuda_backend.cuh
+++ b/icicle/include/polynomials/cuda_backend/polynomial_cuda_backend.cuh
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "gpu-utils/device_context.cuh"
+#include "fields/field_config.cuh"
+#include "polynomials/polynomials.h"
+
+using device_context::DeviceContext;
+
+namespace polynomials {
+  /**
+   * @brief AbstractPolynomialFactory implementation for the CUDA backend.
+   *
+   * Only declarations are given here; the constructor, destructor and both factory methods are defined elsewhere.
+   * With the defaulted template arguments, `CUDAPolynomialFactory<>` is the factory for polynomials whose
+   * coefficient, domain and image types are all `scalar_t`.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type (defaults to `C`).
+   * @tparam I Image type (defaults to `C`).
+   */
+  template <typename C = scalar_t, typename D = C, typename I = C>
+  class CUDAPolynomialFactory : public AbstractPolynomialFactory<C, D, I>
+  {
+    std::vector<DeviceContext> m_device_contexts; // device-id --> device context
+    std::vector<cudaStream_t> m_device_streams;   // device-id --> device stream. Storing the streams here as workaround
+                                                  // since DeviceContext has a reference to a stream.
+
+  public:
+    CUDAPolynomialFactory();
+    ~CUDAPolynomialFactory();
+    // Overrides of the AbstractPolynomialFactory creation methods.
+    std::shared_ptr<IPolynomialContext<C, D, I>> create_context() override;
+    std::shared_ptr<IPolynomialBackend<C, D, I>> create_backend() override;
+  };
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_abstract_factory.h
+++ b/icicle/include/polynomials/polynomial_abstract_factory.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "polynomial_context.h"
+#include "polynomial_backend.h"
+#include <memory> // For std::shared_ptr
+
+namespace polynomials {
+
+  /**
+   * @brief Abstract factory for creating polynomial contexts and backends.
+   *
+   * The `AbstractPolynomialFactory` serves as an interface for factories capable of creating
+   * instances of `IPolynomialContext` and `IPolynomialBackend`. This design allows for the
+   * decoupling of object creation from their usage, facilitating the implementation of various
+   * computational strategies (e.g., GPU, ZPU) without altering client code. Each concrete factory
+   * is expected to provide tailored implementations of polynomial contexts and backends that
+   * are optimized for specific computational environments.
+   *
+   * The interface is stateless: it declares two pure-virtual creation methods and a virtual destructor.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class AbstractPolynomialFactory
+  {
+  public:
+    /**
+     * @brief Creates and returns a shared pointer to an `IPolynomialContext` instance.
+     *
+     * @return std::shared_ptr<IPolynomialContext<C, D, I>> A shared pointer to the created
+     *         polynomial context instance.
+     */
+    virtual std::shared_ptr<IPolynomialContext<C, D, I>> create_context() = 0;
+
+    /**
+     * @brief Creates and returns a shared pointer to an `IPolynomialBackend` instance.
+     *
+     * @return std::shared_ptr<IPolynomialBackend<C, D, I>> A shared pointer to the created
+     *         polynomial backend instance.
+     */
+    virtual std::shared_ptr<IPolynomialBackend<C, D, I>> create_backend() = 0;
+
+    /**
+     * @brief Virtual destructor for the `AbstractPolynomialFactory`, so that a concrete factory can be
+     *        destroyed through a pointer to this interface.
+     */
+    virtual ~AbstractPolynomialFactory() = default;
+  };
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_backend.h
+++ b/icicle/include/polynomials/polynomial_backend.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <cstdint> // for uint64_t, int64_t
+#include <memory>  // for std::shared_ptr
+#include <tuple>   // for std::tuple
+
+#include "polynomial_context.h" // for IPolynomialContext and (transitively) IntegrityPointer
+
+namespace polynomials {
+
+  /**
+   * @brief Interface for the polynomial computational backend.
+   *
+   * The `IPolynomialBackend` interface defines the set of operations for polynomial arithmetic
+   * and manipulation that can be performed on a given computational device or platform (e.g., GPU, ZPU).
+   * This interface abstracts the computational logic, allowing for implementation-specific optimizations
+   * and hardware utilization. It interacts closely with `IPolynomialContext` to manage polynomial data
+   * states and perform computations.
+   *
+   * All operations are pure virtual except the two storage accessors, which forward to the context.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class IPolynomialBackend
+  {
+  public:
+    IPolynomialBackend() = default;
+    virtual ~IPolynomialBackend() = default;
+
+    // Shared handle to the context an operation reads from or writes to.
+    typedef std::shared_ptr<IPolynomialContext<C, D, I>> PolyContext;
+
+    // Initialization methods
+    // NOTE(review): the meaning of a null `coefficients` / `evaluations` pointer is defined by the implementation.
+    virtual void from_coefficients(PolyContext p, uint64_t nof_coefficients, const C* coefficients = nullptr) = 0;
+    virtual void from_rou_evaluations(PolyContext p, uint64_t nof_evaluations, const I* evaluations = nullptr) = 0;
+    virtual void clone(PolyContext out, PolyContext in) = 0;
+
+    // Arithmetic operations
+    // Note: `add` takes `out` by reference, whereas the other arithmetic operations take it by value.
+    virtual void add(PolyContext& out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void subtract(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void multiply(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void multiply(PolyContext out, PolyContext p, D scalar) = 0; // scalar multiplication
+    virtual void divide(PolyContext Quotient_out, PolyContext Remainder_out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void quotient(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void remainder(PolyContext out, PolyContext op_a, PolyContext op_b) = 0;
+    virtual void divide_by_vanishing_polynomial(PolyContext out, PolyContext op_a, uint64_t vanishing_poly_degree) = 0;
+
+    // Operations specific to monomials
+    virtual void add_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) = 0;
+    virtual void sub_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) = 0;
+
+    // Utility methods
+    virtual void slice(PolyContext out, PolyContext in, uint64_t offset, uint64_t stride, uint64_t size) = 0;
+    virtual int64_t degree(PolyContext op) = 0;
+
+    // Methods to access the storage within the context (mutable and read-only).
+    // They call the context's protected accessors, which IPolynomialContext opens to this class via a friend
+    // declaration; being non-virtual, they give derived backends a way to reach that storage.
+    void* get_context_storage_mutable(PolyContext ctxt) { return ctxt->get_storage_mutable(); }
+    const void* get_context_storage_immutable(PolyContext ctxt) { return ctxt->get_storage_immutable(); }
+
+    // Evaluation methods
+    virtual I evaluate(PolyContext op, const D& domain_x) = 0;
+    virtual void evaluate_on_domain(PolyContext op, const D* domain, uint64_t size, I* evaluations /*OUT*/) = 0;
+
+    // Methods to copy coefficients to host memory
+    virtual C copy_coefficient_to_host(PolyContext op, uint64_t coeff_idx) = 0;
+    virtual int64_t
+    copy_coefficients_to_host(PolyContext op, C* host_coeffs, int64_t start_idx = 0, int64_t end_idx = -1) = 0;
+
+    // Methods to get views of coefficients and evaluations, including device id
+    virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_coefficients_view(PolyContext p) = 0;
+    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+  };
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomial_context.h
+++ b/icicle/include/polynomials/polynomial_context.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <utility>   // for std::pair
+#include <tuple>     // for std::tuple
+#include <iostream>  // for std::ostream
+#include <algorithm> // for std::max
+#include <cstdint>   // for uint64_t, etc.
+#include <memory>
+#include "utils/integrity_pointer.h"
+
+namespace polynomials {
+
+  template <typename Coeff, typename Domain, typename Image>
+  class IPolynomialBackend;
+
+  /**
+   * @brief Interface for polynomial context, encapsulating state, memory, and device context.
+   *
+   * This interface is designed to manage the state of polynomials including their coefficients and
+   * evaluations in both natural and reversed order. It supports operations for converting between
+   * these forms, allocating and releasing resources, and accessing the underlying data. The context
+   * abstracts over the specifics of memory and execution context, allowing polynomials to be managed
+   * in a way that is agnostic to the underlying hardware or software infrastructure.
+   *
+   * @tparam C Type of the coefficients.
+   * @tparam D Domain type, representing the input space of the polynomial.
+   * @tparam I Image type, representing the output space of the polynomial.
+   */
+  template <typename C, typename D, typename I>
+  class IPolynomialContext
+  {
+  public:
+    // The backend is a friend so that it can reach the protected storage accessors declared below.
+    friend class IPolynomialBackend<C, D, I>;
+
+    // Enumerates the possible states of a polynomial context.
+    enum State { Invalid, Coefficients, EvaluationsOnRou_Natural, EvaluationsOnRou_Reversed };
+
+    // The size of the largest element among coefficients and evaluations.
+    static constexpr size_t ElementSize = std::max(sizeof(C), sizeof(I));
+
+    /**
+     * @brief Construct a new IPolynomialContext object.
+     *
+     * Takes the current value of the static counter `s_id` as this context's `m_id` and increments the counter.
+     */
+    IPolynomialContext() : m_id{s_id++} {}
+
+    /**
+     * @brief Virtual destructor for IPolynomialContext.
+     */
+    virtual ~IPolynomialContext() = default;
+
+    // Methods for initializing the context from coefficients or evaluations.
+    // NOTE(review): the meaning of a null data pointer is defined by the implementation.
+    virtual void from_coefficients(uint64_t nof_coefficients, const C* coefficients = nullptr) = 0;
+    virtual void from_rou_evaluations(uint64_t nof_evaluations, const I* evaluations = nullptr) = 0;
+
+    // Method for cloning the context from another instance.
+    virtual void clone(IPolynomialContext& from) = 0;
+
+    // Methods for resource management.
+    virtual void allocate(uint64_t nof_elements, State init_state = State::Coefficients, bool memset_zeros = true) = 0;
+    virtual void release() = 0;
+
+    // Methods for transforming between coefficients and evaluations.
+    virtual void transform_to_coefficients(uint64_t nof_coefficients = 0) = 0;
+    virtual void transform_to_evaluations(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+
+    // Accessors for the state and number of elements.
+    virtual State get_state() const = 0;
+    virtual uint64_t get_nof_elements() const = 0;
+
+    // Methods to get direct access to coefficients and evaluations.
+    // Non-const: NOTE(review): presumably because an implementation may first have to transform its state.
+    virtual std::pair<const C*, uint64_t> get_coefficients() = 0;
+    virtual std::pair<const I*, uint64_t> get_rou_evaluations() = 0;
+
+    // Methods to get views of coefficients and evaluations, including device id.
+    virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view() = 0;
+    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
+
+    // Method for printing the context state to an output stream.
+    virtual void print(std::ostream& os) = 0;
+
+  protected:
+    // Provides mutable access to the underlying storage for backend computations.
+    virtual void* get_storage_mutable() = 0;
+    // Read-only counterpart of get_storage_mutable().
+    virtual const void* get_storage_immutable() = 0;
+
+    // Static and instance variables for debug id management.
+    // One counter exists per template instantiation. NOTE(review): it is a plain (non-atomic) integer, so
+    // constructing contexts concurrently from several threads would race on it.
+    static inline uint64_t s_id = 0; // Global id counter.
+
+  public:
+    // Debug id of this context, fixed at construction.
+    const uint64_t m_id;
+  };
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomials.h
+++ b/icicle/include/polynomials/polynomials.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include "utils/integrity_pointer.h"
+#include "fields/field_config.cuh"
+
+#include "polynomial_context.h"
+#include "polynomial_backend.h"
+#include "polynomial_abstract_factory.h"
+
+using namespace field_config;
+
+namespace polynomials {
+
+  /**
+   * @brief Represents a polynomial and provides operations for polynomial arithmetic, evaluation, and manipulation.
+   *
+   * This class models a polynomial with coefficients of type `Coeff`, defined over a domain `Domain` and producing
+   * outputs of type `Image`. It supports a range of operations including basic arithmetic (addition, subtraction,
+   * multiplication, division), evaluation at points or over domains, and manipulation (slicing, adding monomials).
+   * The implementation abstracts over the specifics of computation and storage through the use of an abstract factory,
+   * contexts, and backends, allowing for efficient execution across various computational environments.
+   *
+   * A factory must be installed with initialize() before it can be used to create contexts and backends. Instances
+   * are movable but not copyable; use clone() to obtain an independent polynomial.
+   *
+   * @tparam Coeff Type of the coefficients of the polynomial.
+   * @tparam Domain Type representing the input space of the polynomial (defaults to `Coeff`).
+   * @tparam Image Type representing the output space of the polynomial (defaults to `Coeff`).
+   */
+  template <typename Coeff, typename Domain = Coeff, typename Image = Coeff>
+  class Polynomial
+  {
+  public:
+    // Initialization (coefficients/evaluations can reside on host or device)
+    static Polynomial from_coefficients(const Coeff* coefficients, uint64_t nof_coefficients);
+    static Polynomial from_rou_evaluations(const Image* evaluations, uint64_t nof_evaluations);
+
+    // Clone the polynomial
+    Polynomial clone() const;
+
+    // Arithmetic ops
+    Polynomial operator+(const Polynomial& rhs) const;
+    Polynomial& operator+=(const Polynomial& rhs);
+
+    Polynomial operator-(const Polynomial& rhs) const;
+
+    Polynomial operator*(const Polynomial& rhs) const;
+    Polynomial operator*(const Domain& scalar) const; // scalar multiplication
+    template <typename C, typename D, typename I>
+    friend Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs);
+
+    std::pair<Polynomial, Polynomial> divide(const Polynomial& rhs) const; //  returns (Q(x), R(x))
+    Polynomial operator/(const Polynomial& rhs) const; // returns Quotient Q(x) for A(x) = Q(x)B(x) + R(x)
+    Polynomial operator%(const Polynomial& rhs) const; // returns Remainder R(x) for A(x) = Q(x)B(x) + R(x)
+    Polynomial divide_by_vanishing_polynomial(uint64_t degree) const;
+
+    // arithmetic ops with monomial
+    Polynomial& add_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+    Polynomial& sub_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
+
+    // Slicing and selecting even or odd components.
+    Polynomial slice(uint64_t offset, uint64_t stride, uint64_t size = 0 /*0 means take all elements*/);
+    Polynomial even();
+    Polynomial odd();
+
+    // Note: Following ops cannot be traced. Calling them invokes polynomial evaluation
+
+    // Evaluation methods
+    Image operator()(const Domain& x) const;
+    Image evaluate(const Domain& x) const;
+    void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+
+    // Method to obtain the degree of the polynomial
+    int64_t degree();
+
+    // Methods for copying coefficients to host memory.
+    Coeff copy_coefficient_to_host(uint64_t idx) const; // single coefficient
+    // caller is allocating output memory. If coeff==nullptr, returning nof_coeff only
+    int64_t copy_coefficients_to_host(Coeff* host_coeffs = nullptr, int64_t start_idx = 0, int64_t end_idx = -1) const;
+
+    // Methods for obtaining a view of the coefficients or evaluations
+    std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
+    std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
+    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
+
+    // Overload stream insertion operator for printing. Delegates to the context's print().
+    friend std::ostream& operator<<(std::ostream& os, Polynomial& poly)
+    {
+      poly.m_context->print(os);
+      return os;
+    }
+
+    // Static method to initialize the polynomial class with a factory for context and backend creation.
+    // May be called again to replace the factory.
+    static void initialize(std::shared_ptr<AbstractPolynomialFactory<Coeff, Domain, Image>> factory)
+    {
+      // Register cleanup() with atexit only once per instantiation: the function-local static is initialized on the
+      // first call only, so installing another factory later does not consume an additional atexit slot.
+      static const int s_atexit_status = std::atexit(cleanup);
+      (void)s_atexit_status;
+      s_factory = factory;
+    }
+
+    // Cleanup method for releasing factory resources. Drops this class's reference to the factory.
+    static void cleanup() { s_factory = nullptr; }
+
+  private:
+    // The context of the polynomial, encapsulating its state.
+    std::shared_ptr<IPolynomialContext<Coeff, Domain, Image>> m_context = nullptr;
+    // The computational backend for the polynomial operations.
+    std::shared_ptr<IPolynomialBackend<Coeff, Domain, Image>> m_backend = nullptr;
+
+    // Factory for constructing the context and backend instances.
+    static inline std::shared_ptr<AbstractPolynomialFactory<Coeff, Domain, Image>> s_factory = nullptr;
+
+  public:
+    Polynomial();
+    ~Polynomial() = default;
+
+    // Ensures polynomials can be moved but not copied, to manage resources efficiently.
+    Polynomial(Polynomial&&) = default;
+    Polynomial& operator=(Polynomial&&) = default;
+    Polynomial(const Polynomial&) = delete;
+    Polynomial& operator=(const Polynomial&) = delete;
+
+    // Returns the shared handle to this polynomial's context.
+    std::shared_ptr<IPolynomialContext<Coeff, Domain, Image>> get_context() { return m_context; }
+  };
+
+  // Scalar-times-polynomial operator with the scalar on the left-hand side (declared as a friend inside Polynomial).
+  // The default template arguments are supplied on this namespace-scope declaration.
+  template <typename C = scalar_t, typename D = C, typename I = C>
+  Polynomial<C, D, I> operator*(const D& scalar, const Polynomial<C, D, I>& rhs);
+
+  // Explicit instantiation declaration: including this header does not instantiate Polynomial<scalar_t>; the
+  // matching explicit instantiation definition is expected in a source file (not visible here).
+  extern template class Polynomial<scalar_t>;
+
+} // namespace polynomials
--- a/icicle/include/polynomials/polynomials_c_api.h
+++ b/icicle/include/polynomials/polynomials_c_api.h
@@ -0,0 +1,247 @@
+#pragma once
+
+#include "polynomials.h"
+#include "fields/field_config.cuh"
+#include "utils/utils.h"
+#include "utils/integrity_pointer.h"
+
+namespace polynomials {
+  extern "C" {
+
+  // Concrete polynomial type used throughout this C API: Polynomial over the configured scalar_t.
+  using PolynomialInst = Polynomial<scalar_t>;
+
+  // Builds a polynomial from an array of coefficients.
+  // coeffs: Pointer to the coefficient array.
+  // size: Number of coefficients in the array.
+  // Returns a newly allocated polynomial; release it with the matching polynomial_delete function.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_coefficients)(scalar_t* coeffs, size_t size)
+  {
+    return new PolynomialInst(PolynomialInst::from_coefficients(coeffs, size));
+  }
+
+  // Builds a polynomial from its evaluations at the roots of unity.
+  // evals: Pointer to the evaluations array.
+  // size: Number of evaluations in the array.
+  // Returns a newly allocated polynomial; release it with the matching polynomial_delete function.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_create_from_rou_evaluations)(scalar_t* evals, size_t size)
+  {
+    return new PolynomialInst(PolynomialInst::from_rou_evaluations(evals, size));
+  }
+
+  // Creates a copy of an existing polynomial through its clone() method.
+  // p: Pointer to the polynomial instance to clone; it is not modified.
+  // Returns a newly allocated polynomial; release it with the matching polynomial_delete function.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_clone)(const PolynomialInst* p)
+  {
+    return new PolynomialInst(p->clone());
+  }
+
+  // Destroys a polynomial instance that was allocated by this API and frees its memory.
+  // instance: Pointer to the polynomial instance to delete (deleting a null pointer does nothing).
+  void CONCAT_EXPAND(FIELD, polynomial_delete)(PolynomialInst* instance)
+  {
+    delete instance;
+  }
+
+  // Adds two polynomials.
+  // a, b: Pointers to the polynomial instances to add; neither is modified.
+  // Returns a newly allocated polynomial holding the sum; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_add)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    // The sum is already a temporary, so it is passed to the constructor directly. Wrapping it in
+    // std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a + *b);
+  }
+
+  // Accumulates one polynomial into another (a += b).
+  // a: Pointer to the polynomial that is updated in place.
+  // b: Pointer to the polynomial to add; it is not modified.
+  void CONCAT_EXPAND(FIELD, polynomial_add_inplace)(PolynomialInst* a, const PolynomialInst* b)
+  {
+    *a += *b;
+  }
+
+  // Subtracts one polynomial from another.
+  // a, b: Pointers to the polynomial instances (minuend and subtrahend, respectively); neither is modified.
+  // Returns a newly allocated polynomial holding a - b; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_subtract)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    // The difference is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a - *b);
+  }
+
+  // Multiplies two polynomials.
+  // a, b: Pointers to the polynomial instances to multiply; neither is modified.
+  // Returns a newly allocated polynomial holding the product; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    // The product is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a * *b);
+  }
+
+  // Multiplies a polynomial by a scalar coefficient.
+  // a: Pointer to the polynomial instance; it is not modified.
+  // coeff: Pointer to the scalar coefficient to multiply by.
+  // Returns a newly allocated polynomial holding the product; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_multiply_by_coeff)(const PolynomialInst* a, const scalar_t* coeff)
+  {
+    // The product is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a * *coeff);
+  }
+
+  // Performs polynomial division and hands back both the quotient and the remainder.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively); neither is modified.
+  // q: Output parameter receiving a newly allocated quotient polynomial.
+  // r: Output parameter receiving a newly allocated remainder polynomial.
+  // Both outputs should be released with the matching polynomial_delete function.
+  void CONCAT_EXPAND(FIELD, polynomial_division)(
+    const PolynomialInst* a, const PolynomialInst* b, PolynomialInst** q /*OUT*/, PolynomialInst** r /*OUT*/)
+  {
+    auto [quotient, remainder] = a->divide(*b);
+    // The bindings are named objects, so they must be moved explicitly (polynomials are not copyable).
+    *q = new PolynomialInst(std::move(quotient));
+    *r = new PolynomialInst(std::move(remainder));
+  }
+
+  // Calculates the quotient of dividing one polynomial by another.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively); neither is modified.
+  // Returns a newly allocated quotient polynomial; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_quotient)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    // The quotient is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a / *b);
+  }
+
+  // Calculates the remainder of dividing one polynomial by another.
+  // a, b: Pointers to the polynomial instances (dividend and divisor, respectively); neither is modified.
+  // Returns a newly allocated remainder polynomial; release it with the matching polynomial_delete.
+  PolynomialInst* CONCAT_EXPAND(FIELD, polynomial_remainder)(const PolynomialInst* a, const PolynomialInst* b)
+  {
+    // The remainder is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(*a % *b);
+  }
+
+  // Divides a polynomial by a vanishing polynomial of a given degree, over rou domain.
+  // p: Pointer to the polynomial instance; it is not modified.
+  // vanishing_poly_degree: Degree of the vanishing polynomial.
+  // Returns a newly allocated polynomial holding the result; release it with the matching polynomial_delete.
+  PolynomialInst*
+  CONCAT_EXPAND(FIELD, polynomial_divide_by_vanishing)(const PolynomialInst* p, uint64_t vanishing_poly_degree)
+  {
+    // The result is already a temporary, so it is passed to the constructor directly. Wrapping it
+    // in std::move adds nothing and only prevents the temporary from being elided.
+    return new PolynomialInst(p->divide_by_vanishing_polynomial(vanishing_poly_degree));
+  }
+
+  // In-place addition of a single monomial to a polynomial.
+  // p: Pointer to the polynomial instance that is updated.
+  // monomial_coeff: Pointer to the coefficient of the monomial to add.
+  // monomial: Degree of the monomial to add.
+  void CONCAT_EXPAND(FIELD, polynomial_add_monomial_inplace)(
+    PolynomialInst* p, const scalar_t* monomial_coeff, uint64_t monomial)
+  {
+    const scalar_t& coeff = *monomial_coeff;
+    p->add_monomial_inplace(coeff, monomial);
+  }
+
+  // In-place subtraction of a single monomial from a polynomial.
+  // p: Pointer to the polynomial instance that is updated.
+  // monomial_coeff: Pointer to the coefficient of the monomial to subtract.
+  // monomial: Degree of the monomial to subtract.
+  void CONCAT_EXPAND(FIELD, polynomial_sub_monomial_inplace)(
+    PolynomialInst* p, const scalar_t* monomial_coeff, uint64_t monomial)
+  {
+    const scalar_t& coeff = *monomial_coeff;
+    p->sub_monomial_inplace(coeff, monomial);
+  }
+
+  // Computes the value of a polynomial at a single point.
+  // p: Pointer to the polynomial instance; it is not modified.
+  // x: Point at which to evaluate the polynomial (taken by reference, unlike the pointer-based siblings).
+  // Returns the evaluation result by value.
+  scalar_t CONCAT_EXPAND(FIELD, polynomial_evaluate)(const PolynomialInst* p, const scalar_t& x)
+  {
+    return p->evaluate(x);
+  }
+
+  // Computes the values of a polynomial over a whole domain of points.
+  // p: Pointer to the polynomial instance; it is not modified.
+  // domain: Array of points constituting the domain.
+  // domain_size: Number of points in the domain.
+  // evals: Output array that receives the evaluations.
+  void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_domain)(
+    const PolynomialInst* p, scalar_t* domain, uint64_t domain_size, scalar_t* evals /*OUT*/)
+  {
+    p->evaluate_on_domain(domain, domain_size, evals);
+  }
+
+  // Queries the degree of a polynomial.
+  // p: Pointer to the polynomial instance.
+  // Returns the value reported by the polynomial's degree() method, as a signed 64-bit integer.
+  int64_t CONCAT_EXPAND(FIELD, polynomial_degree)(PolynomialInst* p)
+  {
+    return p->degree();
+  }
+
+  // Fetches one coefficient of a polynomial into host memory.
+  // p: Pointer to the polynomial instance.
+  // idx: Index of the coefficient to copy.
+  // Returns the coefficient value by value.
+  scalar_t CONCAT_EXPAND(FIELD, polynomial_copy_single_coeff_to_host)(PolynomialInst* p, uint64_t idx)
+  {
+    return p->copy_coefficient_to_host(idx);
+  }
+
+  // Copies a range of polynomial coefficients into host memory.
+  // p: Pointer to the polynomial instance.
+  // host_memory: Array to copy the coefficients into. If NULL, nothing is copied.
+  // start_idx: Start index of the range to copy.
+  // end_idx: End index of the range to copy.
+  // Returns the number of coefficients copied. If host_memory is NULL, returns the number of coefficients.
+  int64_t CONCAT_EXPAND(FIELD, polynomial_coeffs_to_host)(
+    PolynomialInst* p, scalar_t* host_memory, uint64_t start_idx, uint64_t end_idx)
+  {
+    // Simply forwards to the polynomial; the NULL handling described above is left to that call.
+    return p->copy_coefficients_to_host(host_memory, start_idx, end_idx);
+  }
+
+  // Obtains a device-memory view of the polynomial coefficients.
+  // p: Pointer to the polynomial instance.
+  // size: Output parameter for the size of the view.
+  // device_id: Output parameter for the device ID.
+  // Returns a newly allocated integrity pointer wrapping the coefficients view; release it with the
+  // matching polynomial_intergrity_ptr_destroy function.
+  IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_coeff_view)(
+    PolynomialInst* p, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
+  {
+    auto [view, view_size, view_device_id] = p->get_coefficients_view();
+    *size = view_size;
+    *device_id = view_device_id;
+    return new IntegrityPointer<scalar_t>(std::move(view));
+  }
+
+  // Obtains a device-memory view of the polynomial's evaluations on the roots of unity.
+  // p: Pointer to the polynomial instance.
+  // nof_evals: Number of evaluations.
+  // is_reversed: Whether the evaluations are in reversed order.
+  // size: Output parameter for the size of the view.
+  // device_id: Output parameter for the device ID.
+  // Returns a newly allocated integrity pointer wrapping the evaluations view; release it with the
+  // matching polynomial_intergrity_ptr_destroy function.
+  IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
+    PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
+  {
+    auto [view, view_size, view_device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
+    *size = view_size;
+    *device_id = view_device_id;
+    return new IntegrityPointer<scalar_t>(std::move(view));
+  }
+
+  // Extracts the raw pointer held by an integrity pointer.
+  // p: Pointer to the integrity pointer.
+  // Returns the raw pointer if still valid, otherwise NULL.
+  const scalar_t* CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_get)(IntegrityPointer<scalar_t>* p)
+  {
+    const scalar_t* raw = p->get();
+    return raw;
+  }
+
+  // Reports whether an integrity pointer still refers to valid data.
+  // p: Pointer to the integrity pointer.
+  // Returns true if the pointer is valid, false otherwise.
+  bool CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_is_valid)(IntegrityPointer<scalar_t>* p)
+  {
+    return p->isValid();
+  }
+
+  // Deletes an integrity pointer that was allocated by this API, freeing its resources.
+  // p: Pointer to the integrity pointer to destroy (deleting a null pointer does nothing).
+  void CONCAT_EXPAND(FIELD, polynomial_intergrity_ptr_destroy)(IntegrityPointer<scalar_t>* p)
+  {
+    delete p;
+  }
+
+  } // extern "C"
+
+} // namespace polynomials
--- a/icicle/appUtils/poseidon/constants/bls12_377_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bls12_377_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bls12_381_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bls12_381_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bn254_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bn254_poseidon.h
--- a/icicle/appUtils/poseidon/constants/bw6_761_poseidon.h
+++ b/icicle/appUtils/poseidon/constants/bw6_761_poseidon.h
--- a/icicle/include/poseidon/constants/constants_template.h
+++ b/icicle/include/poseidon/constants/constants_template.h
@@ -0,0 +1,39 @@
+#pragma once
+#ifndef CURVE_POSEIDON_H
+#define CURVE_POSEIDON_H
+
+namespace poseidon_constants_curve {
+  /**
+   * This inner namespace contains optimized constants for running Poseidon.
+   * These constants were generated using an algorithm defined at
+   * https://spec.filecoin.io/algorithms/crypto/poseidon/
+   * The number in each name corresponds to the arity of the hash function.
+   * Each array contains:
+   * RoundConstants | MDSMatrix | Non-sparse matrix | Sparse matrices
+   *
+   * In this template every value is a placeholder: the round counts are 0 and
+   * each array holds a single 0x00 byte.
+   * NOTE(review): these are non-const, non-inline definitions in a header, so including
+   * it from more than one translation unit would define them more than once; confirm the
+   * generated headers are each included from a single source file.
+  */
+
+  // Placeholder number of partial rounds for arity 2.
+  int partial_rounds_2 = 0;
+
+  // Placeholder number of partial rounds for arity 4.
+  int partial_rounds_4 = 0;
+
+  // Placeholder number of partial rounds for arity 8.
+  int partial_rounds_8 = 0;
+
+  // Placeholder number of partial rounds for arity 11.
+  int partial_rounds_11 = 0;
+
+    // Placeholder constants blob for arity 2.
+    unsigned char poseidon_constants_2[] = {
+        0x00
+    };
+
+    // Placeholder constants blob for arity 4.
+    unsigned char poseidon_constants_4[] = {
+        0x00
+    };
+
+    // Placeholder constants blob for arity 8.
+    unsigned char poseidon_constants_8[] = {
+        0x00
+    };
+
+    // Placeholder constants blob for arity 11.
+    unsigned char poseidon_constants_11[] = {
+        0x00
+    };
+} // namespace poseidon_constants_curve
+#endif
--- a/Show More
+++ b/Show More