Compare commits

..

224 Commits

Author SHA1 Message Date
Jeremy Felder
e6035698b5 Release v1.5.0 (#393)
# Contents of this release

Examples: multi-gpu example #381
Examples: update the example comparing Radix2 and MixedRadix NTTs #383
Feat: add vector operations bindings to Rust #384 
Examples: update examples with new vec ops #388 
Feat: Grumpkin curve implementation #379 
Feat: mixed-radix NTT fast twiddles mode #382 
Docs: Update README.md #385 #387 
README: Update Hall of Fame section #394 
Examples: add rust poseidon example #392 
Feat: GoLang bindings for v1.x #386
2024-02-23 10:15:18 +02:00
Jeremy Felder
e8cd2d7a98 GoLang bindings for v1.x (#386) 2024-02-22 20:52:48 +02:00
ChickenLover
efda4de48f add rust poseidon example (#392)
add rust poseidon example
2024-02-22 19:47:40 +02:00
ChickenLover
402c9dfb53 Update Hall of Fame section (#394)
Add nonam3e as contributor
2024-02-22 19:39:58 +02:00
ChickenLover
9a6ab924c2 Update README.md (#385) (#387) 2024-02-22 11:39:15 +07:00
yshekel
275b2f4958 feature: mixed-radix NTT fast twiddles mode (#382)
- this mode allocates an additional 4N twiddle factors to achieve faster computation
- enabled by a flag on initDomain(); defaults to false.

Co-authored-by: hadaringonyama <hadar@ingonyama.com>
2024-02-22 00:02:02 +02:00
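A minimal sketch of the trade-off described above, using illustrative names (`init_domain`, `fast_twiddles_mode`) rather than the exact ICICLE v1.5 API: opting in pre-computes roughly 4N extra twiddle factors once, trading extra memory for faster NTTs, and the flag is off by default.

```cpp
// Illustrative sketch only; the function and field names are assumptions, not the ICICLE API.
#include <cstdint>
#include <cstdio>
#include <vector>

struct NttDomain {
  std::vector<uint64_t> twiddles;      // the usual twiddle-factor table
  std::vector<uint64_t> fast_twiddles; // extra ~4N factors, allocated only in fast mode
};

NttDomain init_domain(uint64_t max_size, bool fast_twiddles_mode = false) {
  NttDomain d;
  d.twiddles.resize(max_size);
  if (fast_twiddles_mode) d.fast_twiddles.resize(4 * max_size); // the additional 4N allocation
  return d;
}

int main() {
  auto default_domain = init_domain(1 << 20);    // flag defaults to false: no extra memory
  auto fast_domain = init_domain(1 << 20, true); // opt in: more memory, faster computation
  std::printf("%zu vs %zu extra twiddles\n", default_domain.fast_twiddles.size(), fast_domain.fast_twiddles.size());
  return 0;
}
```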
nonam3e
4b221e9665 Grumpkin curve implementation (#379) 2024-02-21 23:20:28 +07:00
ChickenLover
965bf757f9 update examples with new vec ops (#388) 2024-02-21 22:30:40 +07:00
ChickenLover
f9755980f0 add vector operations bindings to Rust (#384)
* add vector operations bindings to Rust
2024-02-21 21:17:10 +07:00
Stas
bb62e716b4 Temp/stas/muli gpu example (#381)
## Describe the changes

This PR adds Multi-GPU Poseidon example

## Linked Issues

Some minor on-device memory issues require attention from devs, please
help
2024-02-20 19:45:44 -06:00
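A generic sketch of the multi-GPU pattern such an example relies on (one host thread per GPU, each calling `cudaSetDevice` inside the thread, as a later commit in this range also notes). The Poseidon calls themselves are omitted; this is not the example's actual code.

```cpp
// Multi-GPU scaffolding sketch; only the per-thread device selection is shown.
#include <cuda_runtime.h>
#include <cstdio>
#include <thread>
#include <vector>

void worker(int device_id) {
  // The current CUDA device is per-thread state, so select it inside the thread.
  cudaSetDevice(device_id);
  // ... allocate device memory, run Poseidon kernels, copy digests back ...
  std::printf("thread for GPU %d done\n", device_id);
}

int main() {
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  std::vector<std::thread> threads;
  for (int d = 0; d < device_count; ++d)
    threads.emplace_back(worker, d); // one host thread per GPU
  for (auto& t : threads)
    t.join();
  return 0;
}
```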
stas
c046fd17c6 removed my comments from poseidon.cuh 2024-02-20 20:43:59 -05:00
stas
82d1ff4769 fixed spelling in poseidon.cuh 2024-02-20 20:40:45 -05:00
Stas
d1f19af64d Merge branch 'dev' into temp/stas/muli-gpu-example 2024-02-20 19:07:48 -06:00
stas
b1af193f6f fixed spelling 2024-02-20 19:14:25 -05:00
Stas
49c7fb0db1 updates example compares Radix2 and MixedRadix NTTs (#383)
## Describe the changes

Update to cover new NTT algorithms
2024-02-20 18:05:39 -06:00
stas
4664cfded5 complied with reviewer's comments 2024-02-19 15:59:49 -05:00
ChickenLover
fc6badcb35 Update README.md (#385) 2024-02-19 18:54:10 +07:00
stas
fb9e5c8511 updates example compares Radix2 and MixedRadix NTTs 2024-02-18 18:40:01 -05:00
Stas Polonsky
518a3ad9b6 ready for PR 2024-02-17 00:18:21 +00:00
Stas Polonsky
6681be549a fixed on-device memory issue 2024-02-16 19:43:58 +00:00
Stas Polonsky
319358427f cudaSetDevice in the thread function 2024-02-16 16:35:04 +00:00
Stas Polonsky
8dd52306dc update README 2024-02-15 23:07:33 +00:00
Stas Polonsky
418c3d844b completed example 2024-02-15 22:10:15 +00:00
DmytroTym
15a63cc549 Release v1.4.0 (#378)
## Contents of this release

[FEAT]: support for multi-device execution:
https://github.com/ingonyama-zk/icicle/pull/356
[FEAT]: full support for new mixed-radix NTT:
https://github.com/ingonyama-zk/icicle/pull/367,
https://github.com/ingonyama-zk/icicle/pull/368 and
https://github.com/ingonyama-zk/icicle/pull/371
[FEAT]: examples for Poseidon hash and tree builder based on it
(currently only on C++ side):
https://github.com/ingonyama-zk/icicle/pull/375
[PERF]: MSM performance upgrades & zero point handling:
https://github.com/ingonyama-zk/icicle/pull/372
2024-02-15 22:32:56 +02:00
ImmanuelSegol
275eaa9904 bump version 2024-02-15 19:36:18 +00:00
DmytroTym
a91397e2c1 MSM improvements (#372)
* Improved MSM

* Zero point handling in large buckets

* Fixed affine zero point conversion for arkworks

* cargo fmt

* Addressed comments

* MSM comments

* All zero scalars case handled

* clang format
2024-02-15 20:02:10 +02:00
ImmanuelSegol
29675bb40d executes without errors 2024-02-15 16:45:33 +00:00
ChickenLover
3b48af55d7 fix declarator (#376)
Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-02-15 22:46:57 +07:00
ImmanuelSegol
481f144dc8 debug 2024-02-15 15:11:20 +00:00
ImmanuelSegol
086d36dd42 Merge remote-tracking branch 'origin/dev' into temp/stas/muli-gpu-example 2024-02-15 15:03:41 +00:00
DmytroTym
6854dbf06a Fix conflicts main (#377)
This PR fixes the conflicts between main and dev
2024-02-15 15:37:15 +02:00
ChickenLover
0929161a26 Merge branch 'dev' into fix-conflicts-main 2024-02-15 16:10:09 +07:00
ChickenLover
4f471ffa2a remove rust toolchains 2024-02-15 16:08:22 +07:00
DmytroTym
27e85d400a Poseidon examples C++ only (#375)
https://github.com/ingonyama-zk/icicle/pull/365, but without the Rust part, which doesn't work for now.
2024-02-15 10:46:09 +02:00
Yuval Shekel
ba6c3ae59c merge NTT part 2024-02-15 10:28:02 +02:00
ChickenLover
fd08925ed4 merge WIP 2024-02-15 14:57:09 +07:00
ChickenLover
66018f2367 rename example folder 2024-02-15 14:13:18 +07:00
stas
62cf733c5f answers Roman's comments 2024-02-15 14:07:48 +07:00
DmytroTym
76c3b4ba01 Merge branch 'dev' into poseidon-examples-no-cuda 2024-02-15 08:27:28 +02:00
ImmanuelSegol
4d75fbac93 issue with init_optimized_poseidon_constants 2024-02-14 22:28:17 +00:00
VitaliiH
774250926c multi card support (#356)
multi-GPU support
2024-02-14 22:29:30 +01:00
DmytroTym
2008259adc Merge branch 'dev' into poseidon-examples-no-cuda 2024-02-14 21:10:25 +02:00
DmytroTym
303a3b8770 Temporarily removed Rust Poseidon example 2024-02-14 21:06:10 +02:00
yshekel
0d70a0c003 fix: verify NTT size is a power of two (#374) 2024-02-14 17:54:31 +02:00
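For reference, the standard bit-trick check for "size is a power of two" looks like the sketch below; this only illustrates the kind of validation being added, not necessarily the exact code in #374.

```cpp
#include <cstdint>
#include <cstdio>

// A non-zero n is a power of two exactly when it has a single set bit.
bool is_power_of_two(uint64_t n) { return n != 0 && (n & (n - 1)) == 0; }

int main() {
  std::printf("%d %d %d\n", is_power_of_two(1u << 16), is_power_of_two(12), is_power_of_two(0)); // 1 0 0
  return 0;
}
```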
yshekel
c9e1d96b65 fix: wrong type in comment (#373)
NTTConfig -> PoseidonConfig
2024-02-14 15:51:25 +02:00
yshekel
a02459c64d Mixed-radix NTT support all orderings (#371)
- Mixed-radix NTT orderings support
- radix-2 small refactor: split core logic to function and renamed ct_butterfly to dit
- testing both radix2 and mixed-radix algs for all ntt tests
2024-02-13 15:49:24 +02:00
yshekel
ae060313db Mixed radix NTT coset support (#368) 2024-02-12 18:30:09 +02:00
yshekel
e16ce1026d Mixed-radix NTT batch support (#367)
Co-authored-by: hadaringonyama <hadar@ingonyama.com>
2024-02-12 14:50:22 +02:00
Leon Hibnik
d84ffd2679 Release/1.3.0 (#370)
Release 1.3.0:
* [FEAT] Mixed-radix NTT design
* [FEAT] example paths changed
2024-02-09 13:53:27 +02:00
ChickenLover
a65f44ad31 fix versioning problems 2024-02-09 15:55:36 +07:00
ChickenLover
8c1750ea97 Feat/roman/display functions (#366)
* fix display and debug traits

* leave only one impl for printing scalars
2024-02-09 14:40:07 +07:00
stas
582107fc7c added c++ example Poseidon-hash 2024-02-08 17:36:56 -05:00
Jeremy Felder
b5923a1791 Add concurrency group to examples workflow (#361) 2024-02-08 20:43:12 +00:00
Jeremy Felder
18fdd059da Fix: examples path deps (#363)
Change rust example deps to use paths

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-02-08 20:43:12 +00:00
yshekel
382bec4ad3 Mixed-radix NTT algorithm
Co-authored-by: hadaringonyama <hadar@ingonyama.com>
2024-02-08 20:43:12 +00:00
Jeremy Felder
d367a8c1e0 Bump for release 2024-02-08 20:43:12 +00:00
Jeremy Felder
3cbdfe7f07 Add concurrency group to examples workflow (#361) 2024-02-08 16:35:48 +02:00
Jeremy Felder
e77173f266 Fix: examples path deps (#363)
Change rust example deps to use paths

Co-authored-by: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
2024-02-08 16:23:44 +02:00
yshekel
3582df2669 Mixed-radix NTT algorithm
Co-authored-by: hadaringonyama <hadar@ingonyama.com>
2024-02-08 13:52:00 +02:00
ImmanuelSegol
04b1b3dda5 refactor: add a basic example 2024-02-07 20:07:21 -04:00
Jeremy Felder
b6dded89cd Release v1.2.0 (#364)
Release v1.2.0:

- [FEAT] Add Poseidon hash as primitive
- [FEAT] Add Merkle tree using poseidon hash
- [BUG] Fix NTT overflow when using large cosets
2024-02-07 17:01:09 +02:00
Jeremy Felder
5a138367f8 (chore): bump rust crate versions (#362)
bump rust crate versions
2024-02-07 14:49:54 +02:00
ChickenLover
a3fc01d88d Implement Poseidon and TreeBuilder (#352)
* BW scalar field is now the same as BLS base field

* add poseidon

* add merkle tree builder

* poseidon rust bindings

* implement rust bindings

* add doc comments

* remove global poseidon constants

* add custom constants API and script for generating new constants

* add the rest of the curves for poseidon

* add all the curves for real

* misname bls12-377

* typo

* partial rounds

* minor fixes

* small tweak for big performance boost

* add CHK_INIT_IF_RETURN

---------

Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
2024-02-07 14:49:54 +02:00
DmytroTym
d84cab79fd Changed long to int64 2024-02-07 14:49:54 +02:00
DmytroTym
46d7b88f6e Fixed overflow in large coset NTTs 2024-02-07 14:49:54 +02:00
Jeremy Felder
b20ef93c2d Merge branch 'main' into dev 2024-02-07 14:34:59 +02:00
Jeremy Felder
6b1b735576 (chore): bump rust crate versions (#362)
bump rust crate versions
2024-02-07 14:29:21 +02:00
ChickenLover
b2eecd02af Implement Poseidon and TreeBuilder (#352)
* BW scalar field is now the same as BLS base field

* add poseidon

* add merkle tree builder

* poseidon rust bindings

* implement rust bindings

* add doc comments

* remove global poseidon constants

* add custom constants API and script for generating new constants

* add the rest of the curves for poseidon

* add all the curves for real

* misname bls12-377

* typo

* partial rounds

* minor fixes

* small tweak for big performance boost

* add CHK_INIT_IF_RETURN

---------

Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
2024-02-07 00:31:49 +07:00
DmytroTym
b13d993f5d Fixed overflow in large coset NTTs (#358)
If the domain has size 2^17, an NTT on a coset of size 2^16 generated by `-1` fails. This happens because an index in `BatchMulKernel` overflows; fixed by using `long` instead of `int`.
2024-02-05 18:53:26 +02:00
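The arithmetic behind the failure, sketched with illustrative names (this is not the actual `BatchMulKernel` code): in a domain of size 2^17 the coset generator `-1` is the 2^16-th power of the root of unity, so the twiddle index grows like element_index * 2^16; for a 2^16-sized coset that product approaches 2^32 and no longer fits in a 32-bit `int`, which is why widening the index type fixes it.

```cpp
// Illustration of the 32-bit index overflow and the 64-bit fix.
#include <cstdint>
#include <cstdio>

int main() {
  int element_index = (1 << 16) - 1; // last element of a 2^16-sized coset NTT
  int generator_power = 1 << 16;     // power of the 2^17-th root of unity equal to -1

  int32_t narrow = static_cast<int32_t>(int64_t{element_index} * generator_power); // wraps (typically negative)
  int64_t wide = int64_t{element_index} * generator_power;                         // the "long instead of int" fix

  std::printf("32-bit index: %d, 64-bit index: %lld\n", narrow, static_cast<long long>(wide));
  return 0;
}
```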
DmytroTym
4f6b4f7dcf Merge branch 'dev' into coset_overflow_fix 2024-02-05 16:44:06 +02:00
Jeremy Felder
19721cfab6 Add missed bump to dev (#359)
Add missed bump to dev
2024-02-05 15:44:37 +02:00
DmytroTym
3c068ae4e7 Changed long to int64 2024-02-05 14:17:00 +02:00
Jeremy Felder
bfd510b3bb Bump for release 2024-02-05 13:33:00 +02:00
DmytroTym
d2b9ec1908 Fixed overflow in large coset NTTs 2024-02-05 13:14:03 +02:00
Jeremy Felder
77a7613aa2 Release v1.1.0 (#357)
Release v1.1.0:
- Updated examples to use the new API
- [c++] Curve specific functions using macros
- [c++] Consolidate MSM and Batch MSM to single function
- [CI] Add codespell in CI
- [FIX] Windows rust build
- [FIX] G2 on rust bindings
- [FIX] Bw6 using bls12377
2024-01-31 20:30:44 +02:00
Jeremy Felder
5a96f9937d Bump for release 2024-01-31 16:34:14 +02:00
Stas
aaa3808c81 Update examples for new api (#355)
## Describe the changes

Make sure the examples comply with new API
2024-01-31 08:59:21 +02:00
stas
759b7b26d6 Update examples to use latest API 2024-01-31 08:59:21 +02:00
Leon Hibnik
1874ade68a update readme links (#346)
Update README.md
2024-01-24 12:26:38 +02:00
DmytroTym
96fe5bf283 G2 fix and BW6 scalar field on the Rust side (#341)
* BW scalar field is now the same as BLS base field in Rust

* G2 fixed and added into Rust
2024-01-24 11:51:22 +02:00
Jeremy Felder
f0a0bb5974 Fix/windows build failing (#345)
- Make the curve config's omegas_count conditionally accessed when creating fields
- Remove the extern C function that returns a UDT containing non-POD types and replace it with a default_config function on the Rust bindings side
2024-01-23 10:51:16 +02:00
Jeremy Felder
69af0bef91 [FEAT]: Add codespell to CI and pre-commit hooks (#344)
Add codespell to pre-commit hook/CI and fix typos
2024-01-22 14:27:52 +02:00
Stas
45f6db666b c++ msm uses new API (#338)
## Describe the changes

This PR allows c++ MSM example to compile with new API

## Linked Issues

Resolves #
2024-01-18 11:31:30 -05:00
Stas
4c235bf5f5 Merge branch 'dev' into examples-cpp-msm-new-API 2024-01-18 11:29:54 -05:00
stas
a0f35ea8cd comply with reviewer's comments 2024-01-18 11:13:12 -05:00
yshekel
56fcd2e156 refactor: consolidate msm and batch-msm implementations to one function (#342)
refactor: consolidate msm and batch-msm implementations to one function
    - batch-msm now supports parallel BM accumulation in addition to large triangle accumulation
2024-01-18 14:20:34 +02:00
Otsar
c1a32a9879 Update README.md (#339)
Added badge
2024-01-11 18:34:32 +02:00
yshekel
67586e01f8 refactor: generate curve-specific function names with macro (#337)
For exposed functions that are curve-specific, the expected symbol is
based on the curve name. For example, MSMCuda becomes bn254MSMCuda for CURVE=bn254.

Currently this is implemented via objcopy, by redefining symbols after
compilation. For shared objects, objcopy doesn't work (since it cannot modify the
dynsym section).

This PR modifies the function names at preprocessing time instead.
2024-01-11 15:04:40 +02:00
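A sketch of the token-pasting approach the PR describes; the macro names here are illustrative, not the exact ICICLE ones. The curve chosen at build time (e.g. `-DCURVE=bn254`) is pasted onto each exposed symbol during preprocessing, so no objcopy pass is needed after compilation, and shared objects, whose dynsym section objcopy cannot rewrite, also get the prefixed names.

```cpp
#include <cstdio>

#ifndef CURVE
#define CURVE bn254 // normally passed on the compiler command line, e.g. -DCURVE=bn254
#endif

#define CONCAT_DIRECT(a, b) a##b
#define CONCAT(a, b) CONCAT_DIRECT(a, b) // extra level so CURVE expands before pasting
#define CURVE_SYMBOL(name) CONCAT(CURVE, name)

// Expands to: extern "C" void bn254MSMCuda()
extern "C" void CURVE_SYMBOL(MSMCuda)() { std::printf("bn254MSMCuda called\n"); }

int main()
{
  bn254MSMCuda(); // the curve-prefixed symbol is what linkers and language bindings see
  return 0;
}
```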
Yuval Shekel
9823f87fe2 fix: enable ci for dev branch 2024-01-11 13:49:41 +02:00
Yuval Shekel
5fda751e93 refactor: generate curve-specific function names with macro instead of using objcopy to modify the symbols 2024-01-11 10:54:31 +02:00
stas
0c58fe2b83 c++ msm uses new API 2024-01-09 11:06:10 -05:00
Leon Hibnik
b184befc99 Update readme v1.0.0 (#335)
* Update readme v1.0.0

* update

* update example readme
2024-01-08 23:48:24 +02:00
DmytroTym
91471fbbc6 New API for Version 1.0.0 (#326) 2024-01-08 20:42:16 +02:00
Jeremy Felder
4cc30474d4 Added conditional PIC for compilation 2024-01-08 18:22:50 +02:00
Jeremy Felder
ad932c6b4a Merge branch 'main' into dev 2024-01-08 17:44:03 +02:00
Jeremy Felder
392fff4e8f fix: remove examples directory from clang formatting 2024-01-08 17:40:40 +02:00
DmytroTym
dc3518c0de Smart pointers and documentation (#333)
* Safer smart pointer that covers host and device

* Fixed MSM test

* Scalars and points in MSM are non-mutable in all cases

* change mont API (#332)

* Some Rust doc comments

---------

Co-authored-by: ChickenLover <Romangg81@gmail.com>
2024-01-08 17:35:27 +02:00
Jeremy Felder
79b86e0edf Added examples for rust and c++ 2024-01-08 17:31:46 +02:00
Jeremy Felder
709009a96a Formatting, fixes tests, and general cleanup 2024-01-08 17:31:46 +02:00
Jeremy Felder
b92bc707d1 Fix main build badge to be on push and not pr 2024-01-08 17:28:11 +02:00
Jeremy Felder
a87f2251da Moved ingo_<curve> output to same directory as original library output allowing rust build script to find it when crate is used as a dependency 2024-01-08 17:28:03 +02:00
ImmanuelSegol
31108ee31d Update READMEs and docs (#303)
---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2024-01-08 17:22:36 +02:00
DmytroTym
5f7b36d427 Rust NTT updates (#305)
* NTT and MSM bugs fixed and functionality extended

* More Rust tests, refactored Rust wrappers

* Reworked MSM and NTT public functions on the Rust side

---------

Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: BigSky77 <simonjudd2@gmail.com>
Co-authored-by: yanziseeker <153156292+AdventureSeeker987@users.noreply.github.com>
Co-authored-by: BigSky <77446076+bigsky77@users.noreply.github.com>
2024-01-08 17:20:39 +02:00
ChickenLover
41affcdadf Add Montgomery conversions API (#321) 2024-01-08 17:19:46 +02:00
VitaliiH
a22c046410 Adds proper error handling for GPU errors and c++ icicle errors 2024-01-08 17:19:45 +02:00
ChickenLover
0eb8560c63 Feat/roman/msm ntt generics (#299)
Add curves to Rust, use generics for NTT, MSM and tests
2024-01-08 17:19:02 +02:00
Jeremy Felder
605c25f9d2 Fix main build badge to be on push and not pr (#325)
## Describe the changes

This PR fixes the build badge to show the status of the `main` branch
build status instead of the status of a PR branch against `main`
2024-01-01 18:45:14 +02:00
Jeremy Felder
5ccf7ff378 Fix main build badge to be on push and not pr 2023-12-31 12:34:17 +02:00
Jeremy Felder
4beda3a900 Fix curve crates not building when used as dependency (#320)
## Describe the changes

This PR moves `ingo_<curve>` archive output to the same directory as the
original library output allowing rust build scripts to find it when a
curve crate is used as a dependency
2023-12-25 14:19:07 +02:00
Jeremy Felder
da122e7ec9 Moved ingo_<curve> output to same directory as original library output allowing rust build script to find it when crate is used as a dependency 2023-12-25 10:11:50 +02:00
Jeremy Felder
f9e46d158c Fix typos (#319)
## Describe the changes
Fix some typos in comment
2023-12-24 09:25:07 +02:00
BigSky
65d4f2ba2e Add Nix Shell Environment Configuration for CUDA (#318)
This PR introduces a Nix Shell configuration for easing the development
of the ICICLE Core using Nix or NixOS.

Changes:

1. Added instructions on how to use Nix Shell to create a development
environment with all required dependencies and environmental variables.
2. Created `cuda-shell.nix` to configure Nix Shell with necessary CUDA
related packages and environmental variables. Specifically, setting the
`PATH`, `LD_LIBRARY_PATH`, `CUDA_PATH`, `CPATH`, `LIBRARY_PATH`, and
`CMAKE_CUDA_COMPILER` associated with the CUDA toolkit.

This configuration allows us to build and run ICICLE Core more
efficiently in Nix environment.
2023-12-22 10:30:36 -07:00
yanziseeker
263b5d95b5 Fix typos 2023-12-22 06:30:28 +00:00
BigSky77
64c8c89424 Update Nix Shell, add Nix instructions 2023-12-19 12:23:04 -07:00
ImmanuelSegol
dcaa0b4513 update readme (#303)
* refactor

* refactor

* refactor: add rust docs

* Update README.md

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>

* Update README.md

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>

* refactor

* Update icicle/README.md

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>

* Update wrappers/rust/README.md

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>

* Update README.md

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>

* refactor

* remove

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-12-19 09:17:30 -04:00
Jeremy Felder
d5dd16dd9b Fix the need for a dummy bindings.rs file included in the repo (#296) 2023-12-13 12:45:00 +02:00
Jeremy Felder
644aa93d3f New API (#293)
This PR revamps the API and implements the rust bindings for the new API.
2023-12-12 22:30:53 +02:00
Jeremy Felder
73e39c60f6 Fix bw6 test in CI 2023-12-12 21:33:23 +02:00
Jeremy Felder
6d992a9909 Fix CI format workflow 2023-12-12 21:12:20 +02:00
Jeremy Felder
0df6262961 Update CI and prepush hooks 2023-12-12 20:49:57 +02:00
DmytroTym
401ec5afac Merge in 'main' and fmt 2023-12-12 20:08:38 +02:00
DmytroTym
629d8cb309 Update Rust APIs (#292)
* NTT reworked on CUDA side, Rust API updated

* Updates to the Rust-side CUDA runtime wrapper

* Merged in main
2023-12-12 19:31:36 +02:00
DmytroTym
f8610dd5b6 Improved modular multiplier (#289)
Our implementation of Barrett modular multiplication is improved by utilising Karatsuba multiplication and more careful optimisation of the lsb and msb multipliers in the reduction stage
2023-12-05 13:11:44 +02:00
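The speed-up rests on the textbook Karatsuba identity: splitting each operand into halves, a = a1·2^k + a0 and b = b1·2^k + b0, the full product needs only three half-width multiplications instead of four. The check below verifies the identity numerically; it is not the ICICLE reduction code itself.

```cpp
// a*b = a1*b1*2^(2k) + ((a1+a0)*(b1+b0) - a1*b1 - a0*b0)*2^k + a0*b0
#include <cstdint>
#include <cstdio>

int main() {
  const int k = 16;
  uint64_t a = 0x12345678u, b = 0x9abcdef0u;
  uint64_t a1 = a >> k, a0 = a & 0xffffu;
  uint64_t b1 = b >> k, b0 = b & 0xffffu;
  uint64_t hi = a1 * b1, lo = a0 * b0;
  uint64_t mid = (a1 + a0) * (b1 + b0) - hi - lo; // one multiplication covers both cross terms
  uint64_t karatsuba = (hi << (2 * k)) + (mid << k) + lo;
  std::printf("identity holds: %d\n", karatsuba == a * b ? 1 : 0);
  return 0;
}
```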
DmytroTym
dfa5b10adb Update Rust apis (#262)
* fix memory error in single_stage_multi_reduction_kernel (#235)

* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu

* Added separate device context struct, returned lde

* wip - msm and eq

* added lde to cmake

* Montgomery param added in lde.cu mul function

* fixed on_device for ntt and lde

* CamelCase

* fixed msm_test, int unification, google guide

* wip - ntt crash debugging

* async MSM with a rust wrapper

* wip ntt tests with correctness

* hotfix for correctness > 2^9

* wip on device inout mixing with correctness

* cleanup

* preserving twiddles after first call

* fixed twiddles preserving

* formatting

* removed some printing

* disable ecntt temporarily

* format

* rust fmt

* exclude target from format

* passing ntt after merge

* hotfix for linking issue

* format

* format

* draft of pr comments + correctness restored

* wip refactor + format

* domain wip

* rust format

* Merged feature branch in and Rust MSM correctness

* rust build for correct curve

* Slowdown fixed by passing release flag to cmake

* WIP field and curve

* still wip field and curve

* field and curve in rust 1.0

* Refactored rust into several crates

* Arkworks is now an option, bn254 crate created

* Rust msm and ntt wip

* A version of rust msm done, cuda runtime wrapped

* refactor rust by creating a curve folder

* vec_ops instead of lde for now

* format

---------

Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
Co-authored-by: Vitalii <vitalii@ingonyama.com>
2023-12-03 13:32:50 +02:00
BigSky
fad317ac77 Docs: Clarify enabling tests in cmake build process (#288)
Docs: Emphasize enabling tests in cmake build process
2023-11-28 14:43:29 +02:00
Jeremy Felder
856629d6c8 Rust/large bucket factor msm (#271)
* Update rust bindings to support large_bucket_factor parameter
* Special treatment of ones in MSM removed

---------
Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
2023-11-26 14:10:30 +02:00
Jeremy Felder
2790f180d6 Update feature_request.md (#278) 2023-11-26 14:09:58 +02:00
Jeremy Felder
5813f2955a Update bug_issue.md
Update bug issue template to apply correct label
2023-11-26 12:22:38 +02:00
VitaliiH
7baea7cc5a separable compilation for Rust (#244)
separable compilation for Rust #244
2023-11-16 08:44:56 +01:00
ImmanuelSegol
29cad66ba6 include colab (#261) 2023-11-13 08:16:41 +02:00
DmytroTym
e4e9130340 Two curve NTT correctness issue hotfix (#254) 2023-11-05 08:24:12 +02:00
omahs
5c868abcf9 Fix typos (#257) 2023-11-02 16:57:55 +02:00
vuittont60
a469fb577b fix typos in logs (#255) 2023-11-02 16:57:34 +02:00
Vitalii
133a1b28bc format 2023-10-24 23:49:19 +02:00
Vitalii
c666786fa1 exclude target from format 2023-10-24 23:47:32 +02:00
Vitalii
b108234ce4 formatting 2023-10-24 22:11:06 +02:00
liuxiao
fd62fe5ae8 Support bw6-761 (#188)
Resolves #191 and #113

---------

Co-authored-by: DmytroTym <dmytrotym1@gmail.com>
Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
2023-10-21 18:49:06 +03:00
Jeremy Felder
09d8c5da6a Fix readme badge links, fix CI cpp formatter (#249) 2023-10-17 17:06:11 +03:00
Jeremy Felder
88c9c8584f [CI]: Add concurrency groups at workflow level (#238)
Add concurrency groups at workflow level for CI. Remove dev CI since we no longer use dev branch. Resolves #180
2023-10-17 16:02:31 +03:00
Jeremy Felder
1cf7b2e4ba Exclude target directory from format checks (#247) 2023-10-16 15:56:58 +03:00
DmytroTym
a646c81aaa Linting as per google guide (#241)
* fixed msm_test, int unification, google guide

* Fixed warnings
2023-10-12 20:22:24 +03:00
DmytroTym
028bed11fa Hotfix to go regression when 2 curves are imported (#245)
Hotfix to slowdown in go when more than one curve is imported.
2023-10-12 15:53:20 +03:00
DmytroTym
d4dd6d3923 Cuda refactoring (#240)
* fix memory error in single_stage_multi_reduction_kernel (#235)

* Added separate device context struct, returned lde

* Montgomery param added in lde.cu mul function

---------

Co-authored-by: ImmanuelSegol <3ditds@gmail.com>
2023-10-04 14:51:59 +03:00
ImmanuelSegol
9114ecb269 fix memory error in single_stage_multi_reduction_kernel (#235)
* refactor

* refactor

* revert

* refactor: clang format

* Update icicle/appUtils/msm/msm.cu
2023-10-03 15:22:28 +03:00
DmytroTym
fc73f274fa remove boost from cmake 2023-10-02 12:13:37 +03:00
DmytroTym
c1dfbd28ad Merge branch 'main' into cuda_refactoring 2023-10-02 12:06:23 +03:00
DmytroTym
2d3ab73eca error codes and extern ntt functions added 2023-09-29 23:10:36 +03:00
DmytroTym
9f884f3022 functional NTT 1.0 2023-09-28 22:28:42 +03:00
Jeremy Felder
97f0079e5c Fix: div by 0 when number of Elements is 1 (#230) 2023-09-28 16:11:17 +03:00
ImmanuelSegol
9f6707581e Make dependency instructions more clear (#227) 2023-09-27 12:25:26 +03:00
DmytroTym
a9c846c44c Draft ntt.cuh 2023-09-27 10:28:02 +03:00
Jeremy Felder
413e1d8b60 [CI]: Adds C++/CUDA CI and conditional workflow runs (#223)
* Add cmake tests for cpp primitives

* Add cpp/cuda formatting

* Add conditional steps based on files changed for faster required checks

* Update runs-on for check files changed
2023-09-26 14:41:06 +03:00
ImmanuelSegol
8612975e36 update docs (#222)
* refactor

* lint

* Typo and spacing

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-09-26 11:46:11 +03:00
Leon Hibnik
0a639783e2 add mont_r and mont_r_inv and fix args (#187)
Resolves #186
2023-09-26 09:45:10 +03:00
DmytroTym
8068f16d82 Cosmetic changes to the MSM API and build process 2023-09-23 16:50:15 +03:00
ImmanuelSegol
e368f00bb8 Fix CI - dont use deprecated package name (#216)
Update icicle crate name in examples and benches
2023-09-21 16:03:44 +07:00
DmytroTym
82743a098f 1.0 MSM refactoring on CUDA side, cmake build 2023-09-20 19:39:09 +03:00
ImmanuelSegol
08862134e8 fix cuda test - remove boost (#197)
* refactor: remove boost
2023-09-20 10:00:34 +03:00
DmytroTym
d8d2df1cd0 Merge branch 'main' into cuda_refactoring 2023-09-13 15:59:05 +03:00
DmytroTym
323430cddc MSM API prototype 2023-09-13 15:58:48 +03:00
Jeremy Felder
04e5ff5d1a Update root of unity and regenerate omegas (#181)
updates the root of unity and regenerates the omegas for BLS12377
adds an option for the new_curve script to only update the params.cuh file instead of regenerating everything
2023-09-07 08:23:43 +03:00
Jeremy Felder
81e40a1001 Merge pull request #172 from ingonyama-zk/dev 2023-09-05 10:52:10 +03:00
Leon Hibnik
a8c539740a Merge pull request #176 from ingonyama-zk/omershlo-patch-1
Update README.md
2023-09-03 11:09:22 +03:00
Leon Hibnik
1cf711215b Merge pull request #173 from ingonyama-zk/Otsar-Raikou-patch-1
Update README.md
2023-09-03 11:03:08 +03:00
Shlomtz
3d370a2be3 Update README.md [skip ci] 2023-09-01 15:01:39 +03:00
Shlomtz
49212c540c Update README.md
update hall of fame
2023-09-01 13:47:21 +03:00
Otsar
be68cddd1a Update README.md 2023-08-31 14:09:44 +03:00
Jeremy Felder
5667f32bfe Merge branch 'main' into dev 2023-08-31 09:19:44 +03:00
ImmanuelSegol
9ea6bc0bad Update Cargo.toml (#161) 2023-08-31 09:05:21 +03:00
Jeremy Felder
fb52650bbc CI: Additional checks (#155)
adds CI checks for building and testing Golang bindings
adds CI checks for formatting Rust and Golang files
Fixes Golang tests for BN254
Splits Actions checks for PR against main into multiple files

Resolves #108
Resolves #107
Resolves #138
2023-08-31 09:04:53 +03:00
Jeremy Felder
ca8961501e Adding changes from main to dev for clean merge back into main (#170) 2023-08-29 15:53:24 +03:00
DmytroTym
78e20f9add Minimal correct MSM (#162) 2023-08-28 10:14:54 +03:00
Jeremy Felder
dc6893732b Merge pull request #166 from weijiekoh/fix/readme-links
Fix broken links in the readme, resolves #164
2023-08-28 09:22:57 +03:00
Koh Wei Jie
175109a070 Fix link to CRV_CONFIG
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-08-27 15:48:41 -07:00
Jeremy Felder
b216287de1 Merge pull request #163 from weijiekoh/main
Support cmake versions below 3.24.0
2023-08-27 14:25:28 +03:00
ImmanuelSegol
7a2fa20da7 Remove decimation from API (#165)
Resolves #154
2023-08-27 14:08:56 +03:00
Koh Wei Jie
8e7799b632 fixed broken CRV_TEMPLATE and CRV_CONFIG links in readme 2023-08-24 12:55:15 -07:00
Koh Wei Jie
6dd7722f5d changed minimum cmake version in readme 2023-08-24 10:03:17 -07:00
ImmanuelSegol
27627ed2c1 refactor (#158) 2023-08-24 12:02:43 +03:00
Koh Wei Jie
3284cd8dce updated readme with prerequisites section; added conditional in icicle/CMakeLists.txt to support cmake versions below 3.24 2023-08-23 14:23:46 -07:00
Jeremy Felder
8c7b2bb24a Add citation to repo (#156) 2023-08-21 09:25:24 +03:00
Jeremy Felder
b6c87c3fd8 Fix formatting for all files (#153) 2023-08-20 11:35:28 +03:00
Leon Hibnik
e04bd928e6 Merge pull request #145 from ingonyama-zk/fix/goicicle-setup-script
setup.sh update
2023-08-17 12:08:24 +03:00
Leon Hibnik
cb6ed6af59 Merge branch 'dev' into fix/goicicle-setup-script 2023-08-17 12:07:07 +03:00
Jeremy Felder
9ea3350589 Add language formatters (#132) 2023-08-17 09:41:58 +03:00
Leon Hibnik
f38a9a322c Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 16:43:32 +03:00
ImmanuelSegol
ad1e482252 missing functions (#152) 2023-08-16 16:38:20 +03:00
Jeremy Felder
273bd536db Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 16:14:36 +03:00
Jeremy Felder
1463edc413 CI: Run linux on self-hosted, Make windows download smaller and remove caching (#150) (#151) 2023-08-16 16:14:22 +03:00
Jeremy Felder
db93204dc7 CI: Run linux on self-hosted, Make windows download smaller and remove caching (#150) 2023-08-16 15:16:32 +03:00
ImmanuelSegol
e1b692b8ed Merge branch 'dev' into fix/goicicle-setup-script 2023-08-16 12:59:44 +03:00
Leon Hibnik
e6416f4110 Merge pull request #146 from ingonyama-zk/fix/zeroedgecase
bucket_method_msm - address 0 edge case
2023-08-16 12:00:46 +03:00
ImmanuelSegol
96facd58d5 refactor: dont throw error when all scalars are 0 2023-08-15 19:53:45 +03:00
Leon Hibnik
11fe11b071 Merge branch 'dev' into fix/goicicle-setup-script 2023-08-15 14:16:54 +03:00
DmytroTym
19d0730aad Correct MSM for weird scalar distributions (#143) 2023-08-15 13:14:46 +03:00
LeonHibnik
36133ba26c setup.sh update 2023-08-15 12:14:06 +03:00
ImmanuelSegol
a1d9fa6648 fix - add missing go wrappers for all curves + add missing constants for curves (#130) 2023-08-14 13:15:56 +03:00
Vitalii Hnatyk
2f21ec4aa7 large_msm compilation hotfix (#131)
hotfix for missing parameter in large_msm
2023-07-27 10:05:45 +02:00
Jeremy Felder
5b504c44b7 Fix badges 2023-07-20 08:57:50 +03:00
Jeremy Felder
d13143506e writing .so file requires sudo 2023-07-19 21:44:12 +03:00
ImmanuelSegol
94c73e637c Fix/cudacodegoicile (#128)
* refactor

* refactor

* Refactor

* Refactor

* refactor: add sh script

* refactor

* refactor

* refactor: fix path
2023-07-19 21:30:20 +03:00
Jeremy Felder
b71b041561 Integrate msm performance improvements (#129) 2023-07-19 16:32:59 +03:00
ImmanuelSegol
a9d6ac0e27 move header file import (#127) 2023-07-19 08:33:05 +03:00
ImmanuelSegol
8a11a2f60e some minor changes (#125)
Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
2023-07-18 20:17:16 +03:00
ImmanuelSegol
ab69139ade Goicicle (#77) 2023-07-16 14:31:41 +03:00
Vitalii Hnatyk
7a8191bcb4 NTT improvements (shared mem + inplace) (#116)
Resolves #112
2023-07-16 13:56:20 +03:00
Jeremy Felder
e3f089f0f3 Fix: Docs, Curve generation script (#102)
Fix CUDA compilation docs and update new curve script to generate correct cu(h) files. Resolves #101
2023-06-28 15:55:33 +03:00
Jeremy Felder
cb61755c8b Add streams to poseidon (#105)
Adds streams to the poseidon implementation for BLS12-381. Resolves #91
2023-06-20 12:22:16 +03:00
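A generic sketch of what adding streams buys: independent hash batches are enqueued on separate CUDA streams so copies and kernels can overlap. The real Poseidon kernels are omitted; only the stream plumbing is shown and the kernel here is a stand-in.

```cpp
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_hash_kernel(const unsigned* in, unsigned* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * 2654435761u; // placeholder for the real hash round
}

int main() {
  const int n = 1 << 16;
  unsigned *in, *out;
  cudaMalloc(&in, n * sizeof(unsigned));
  cudaMalloc(&out, n * sizeof(unsigned));
  cudaMemset(in, 0, n * sizeof(unsigned));

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);

  // Two independent halves of the batch run on different streams and may overlap.
  dummy_hash_kernel<<<(n / 2 + 255) / 256, 256, 0, s1>>>(in, out, n / 2);
  dummy_hash_kernel<<<(n / 2 + 255) / 256, 256, 0, s2>>>(in + n / 2, out + n / 2, n / 2);

  cudaStreamSynchronize(s1);
  cudaStreamSynchronize(s2);
  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  cudaFree(in);
  cudaFree(out);
  std::printf("done\n");
  return 0;
}
```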
Jeremy Felder
34a556ac85 Update build workflow (#104) 2023-06-19 14:21:46 +03:00
Jeremy Felder
2a3f5a258a Merge pull request #106 from gkigiermo/fix/cuda-test-suite
Resolves #103
2023-06-15 11:48:36 +03:00
Guillermo Oyarzun
9023daeb4f Add c++17 requirement to cmake 2023-06-15 10:06:02 +02:00
Guillermo Oyarzun
4d83ba101c Fix curve config location and link to some namespace struct members 2023-06-13 23:52:48 +02:00
ChickenLover
26f2f5c76c reduce memory consumption in hash_blocks (#100)
* reduce memory consumption in hash_blocks
2023-06-08 20:51:41 +07:00
Jeremy Felder
434ab70305 Fixed omega retrieval issue (#99)
Resolves #71
2023-06-08 11:47:41 +03:00
Jeremy Felder
cd7b81102c Merge pull request #95 from ingonyama-zk/dev-v2 2023-06-01 14:45:49 +03:00
Jeremy Felder
2917024ef7 Update build action to include dev branch (#96) 2023-06-01 13:56:30 +03:00
ChickenLover
24096b6cf7 Fix windows build (#97) 2023-06-01 13:19:51 +03:00
Jeremy Felder
20de60fd43 Add streams capability (#89) 2023-06-01 10:44:11 +03:00
DmytroTym
b5a24d8e4d G2 CUDA (#78)
G2 arithmetic and MSM implemented in CUDA
2023-06-01 10:44:05 +03:00
HadarIngonyama
9ebf3d4f34 MSM for large sizes (#88)
* bugs fixed

* bugs fixed

* remove short msm from extern call

* code cleaning
2023-06-01 10:40:13 +03:00
ChickenLover
43f8c01afe Feat/poseidon (#75)
Adds poseidon function to compute hashes over multiple preimages in parallel
2023-06-01 10:40:13 +03:00
Vitalii Hnatyk
a4c676e5b6 fix compilation - rust bench (#80)
Co-authored-by: Vitalii <vitalii@ingonyama.com>
2023-05-29 10:07:18 +03:00
guy-ingo
a467c9f56e Supporting Additional Curves (#72)
* init commit - changes for supporting new curves

* refactor + additional curve (bls12-377 works, bn254 - not yet)

* general refactor + curves script + fixing bn245

* revert unnecessary changes + refactor new curve script

* add README and fix limbs_p=limbs_q case in python script
2023-05-15 15:31:18 +03:00
Vitalii Hnatyk
20da109a92 Fix for local machines GoogleTest and CMake (#70)
GoogleTest fix, updated readme
2023-05-10 13:56:25 +03:00
438 changed files with 87681 additions and 16743 deletions

.clang-format (new file)

@@ -0,0 +1,39 @@
Language: Cpp
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: false
BreakBeforeBraces: Custom
BraceWrapping:
  AfterClass: true
  AfterFunction: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DisableFormat: false
IndentFunctionDeclarationAfterType: false
IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
PointerAlignment: Left
SortIncludes: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
Standard: c++17
UseTab: Never

.codespellignore (new file)

@@ -0,0 +1,5 @@
inout
crate
lmit
mut
uint


@@ -2,7 +2,7 @@
name: ":bug: Bug Report"
about: Create a bug report to help us improve the repo
title: "[BUG]: "
labels: bug
labels: type:bug
---
## Description


@@ -2,7 +2,7 @@
name: ":sparkles: Feature Request"
about: Request the inclusion of a new feature or functionality
title: "[FEAT]: "
labels: enhancement
labels: type:feature
---
## Description

.github/changed-files.yml (new file)

@@ -0,0 +1,15 @@
golang:
- wrappers/golang/**/*.go'
- wrappers/golang/**/*.h'
- wrappers/golang/**/*.tmpl'
- go.mod
rust:
- wrappers/rust
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh
- icicle/**/*.cpp
- icicle/**/*.hpp
- icicle/**/*.c
- icicle/**/*.h
- icicle/CMakeLists.txt


@@ -4,4 +4,4 @@ This PR...
## Linked Issues
Closes #
Resolves #


@@ -1,46 +0,0 @@
name: Build
on:
  pull_request:
    branches: [ "main" ]
    paths:
      - "icicle/**"
      - "src/**"
      - "Cargo.toml"
      - "build.rs"
env:
  CARGO_TERM_COLOR: always
  ARCH_TYPE: sm_70
jobs:
  build-linux:
    runs-on: ubuntu-latest
    steps:
      # Checkout code
      - uses: actions/checkout@v3
      # Download (or from cache) and install CUDA Toolkit 12.1.0
      - uses: Jimver/cuda-toolkit@v0.2.9
        id: cuda-toolkit
        with:
          cuda: '12.1.0'
          use-github-cache: true
      # Build from cargo - Rust utils are preinstalled on latest images
      # https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
      - name: Build
        run: cargo build --release --verbose
  build-windows:
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v3
      - uses: Jimver/cuda-toolkit@v0.2.9
        id: cuda-toolkit
        with:
          cuda: '12.1.0'
          use-github-cache: true
      - name: Build
        run: cargo build --release --verbose

.github/workflows/codespell.yml (new file)

@@ -0,0 +1,20 @@
name: Check Spelling
on:
  pull_request:
    branches:
      - main
      - dev
jobs:
  spelling-checker:
    name: Check Spelling
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: codespell-project/actions-codespell@v2
        with:
          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
          skip: ./**/target,./**/build
          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-ignore_words_file
          ignore_words_file: .codespellignore

.github/workflows/examples.yml (new file)

@@ -0,0 +1,54 @@
# This workflow is a demo of how to run all examples in the Icicle repository.
# For each language directory (c++, Rust, etc.) the workflow
# (1) loops over all examples (msm, ntt, etc.) and
# (2) runs ./compile.sh and ./run.sh in each directory.
# The script ./compile.sh should compile the example and ./run.sh should run it.
# Each script should return 0 for success and 1 otherwise.
name: Examples
on:
  pull_request:
    branches:
      - main
      - dev
  push:
    branches:
      - main
      - dev
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  test-examples:
    runs-on: [self-hosted, Linux, X64, icicle, examples]
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: c++ examples
        working-directory: ./examples/c++
        run: |
          # loop over all directories in the current directory
          for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
            if [ -d "$dir" ]; then
              echo "Running command in $dir"
              cd $dir
              ./compile.sh
              ./run.sh
              cd -
            fi
          done
      - name: Rust examples
        working-directory: ./examples/rust
        run: |
          # loop over all directories in the current directory
          for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
            if [ -d "$dir" ]; then
              echo "Running command in $dir"
              cd $dir
              cargo run --release
              cd -
            fi
          done

.github/workflows/main-build.yml (new file)

@@ -0,0 +1,119 @@
name: Build
on:
  pull_request:
    branches:
      - main
      - dev
  push:
    branches:
      - main
      - dev
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
env:
  CARGO_TERM_COLOR: always
  ARCH_TYPE: native
jobs:
  check-changed-files:
    name: Check Changed Files
    runs-on: ubuntu-22.04
    outputs:
      golang: ${{ steps.changed_files.outputs.golang }}
      rust: ${{ steps.changed_files.outputs.rust }}
      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Get all changed files
        id: changed-files-yaml
        uses: tj-actions/changed-files@v39
        # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
        with:
          files_yaml_from_source_file: .github/changed-files.yml
      - name: Run Changed Files script
        id: changed_files
        # https://github.com/tj-actions/changed-files#outputs-
        run: |
          echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
  build-rust-linux:
    name: Build Rust on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: check-changed-files
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Build Rust
        working-directory: ./wrappers/rust
        if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        # Building from the root workspace will build all members of the workspace by default
        run: cargo build --release --verbose
  build-rust-windows:
    name: Build Rust on Windows
    runs-on: windows-2022
    needs: check-changed-files
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Download and Install Cuda
        if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        id: cuda-toolkit
        uses: Jimver/cuda-toolkit@v0.2.11
        with:
          cuda: '12.0.0'
          method: 'network'
          # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
          sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
      - name: Build Rust Targets
        working-directory: ./wrappers/rust
        if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        env:
          CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
        # Building from the root workspace will build all members of the workspace by default
        run: cargo build --release --verbose
  build-golang-linux:
    name: Build Golang on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: check-changed-files
    strategy:
      matrix:
        curve: [bn254, bls12_381, bls12_377, bw6_761]
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Build CUDA libs
        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        working-directory: ./wrappers/golang
        run: |
          export CPATH=$CPATH:/usr/local/cuda/include
          ./build.sh ${{ matrix.curve }} ON
  # TODO: Add once Golang make file supports building for Windows
  # build-golang-windows:
  #   name: Build Golang on Windows
  #   runs-on: windows-2022
  #   needs: check-changed-files
  #   steps:
  #     - name: Checkout Repo
  #       uses: actions/checkout@v3
  #     - name: Download and Install Cuda
  #       if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
  #       uses: Jimver/cuda-toolkit@v0.2.11
  #       with:
  #         cuda: '12.0.0'
  #         method: 'network'
  #         # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
  #         sub-packages: '["cudart", "nvcc", "thrust"]'
  #     - name: Build cpp libs
  #       if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
  #       run: make all
  #       working-directory: ./goicicle

.github/workflows/main-format.yml (new file)

@@ -0,0 +1,47 @@
name: Format
on:
  pull_request:
    branches:
      - main
      - dev
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  formatting-rust:
    name: Check Rust Code Formatting
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Check rustfmt
        working-directory: ./wrappers/rust
        # "-name target -prune" removes searching in any directory named "target"
        # Formatting by single file is necessary due to generated files not being present
        # before building the project.
        # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
        # causing rustfmt to fail.
        run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
      # - name: Check clippy
      #   run: cargo clippy --no-deps --all-features --all-targets
  formatting-golang:
    name: Check Golang Code Formatting
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Check gofmt
        run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
  formatting-cpp-cuda:
    name: Check C++/CUDA Code Formatting
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Check clang-format
        run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi

.github/workflows/main-test.yml (new file)

@@ -0,0 +1,99 @@
name: Test
on:
  pull_request:
    branches:
      - main
      - dev
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
env:
  CARGO_TERM_COLOR: always
  ARCH_TYPE: native
jobs:
  check-changed-files:
    name: Check Changed Files
    runs-on: ubuntu-22.04
    outputs:
      golang: ${{ steps.changed_files.outputs.golang }}
      rust: ${{ steps.changed_files.outputs.rust }}
      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Get all changed files
        id: changed-files-yaml
        uses: tj-actions/changed-files@v39
        # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
        with:
          files_yaml_from_source_file: .github/changed-files.yml
      - name: Run Changed Files script
        id: changed_files
        # https://github.com/tj-actions/changed-files#outputs-
        run: |
          echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
  test-rust-linux:
    name: Test Rust on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: check-changed-files
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Run Rust Tests
        working-directory: ./wrappers/rust
        if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        # Running tests from the root workspace will run all workspace members' tests by default
        # We need to limit the number of threads to avoid running out of memory on weaker machines
        run: cargo test --release --verbose --features=g2 -- --test-threads=2
  test-cpp-linux:
    name: Test C++ on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: check-changed-files
    strategy:
      matrix:
        curve: [bn254, bls12_381, bls12_377, bw6_761]
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Build C++
        working-directory: ./icicle
        if: needs.check-changed-files.outputs.cpp_cuda == 'true'
        run: |
          mkdir -p build
          cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
          cmake --build build
      - name: Run C++ Tests
        working-directory: ./icicle/build
        if: needs.check-changed-files.outputs.cpp_cuda == 'true'
        run: ctest
  test-golang-linux:
    name: Test Golang on Linux
    runs-on: [self-hosted, Linux, X64, icicle]
    needs: check-changed-files
    # strategy:
    #   matrix:
    #     curve: [bn254, bls12_381, bls12_377, bw6_761]
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v3
      - name: Build CUDA libs
        working-directory: ./wrappers/golang
        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        # builds all curves with g2 ON
        run: |
          export CPATH=$CPATH:/usr/local/cuda/include
          ./build.sh all ON
      - name: Run Golang Tests
        if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
        run: |
          export CPATH=$CPATH:/usr/local/cuda/include
          go test --tags=g2 ./... -count=1 -timeout 60m

.gitignore

@@ -5,6 +5,9 @@
*.cubin
*.bin
*.fatbin
*.so
*.nsys-rep
*.ncu-rep
**/target
**/.vscode
**/.*lock*csv#
@@ -12,3 +15,7 @@
**/.DS_Store
**/Cargo.lock
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt

.rustfmt.toml (new file)

@@ -0,0 +1,10 @@
# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md
# Stable Configs
chain_width = 0
max_width = 120
merge_derives = true
use_field_init_shorthand = true
use_try_shorthand = true
# Unstable Configs

CITATION.cff (new file)

@@ -0,0 +1,8 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Ingonyama"
title: "ICICLE: GPU Library for ZK Acceleration"
version: 1.0.0
date-released: 2024-01-04
url: "https://github.com/ingonyama-zk/icicle"


@@ -1,45 +0,0 @@
[package]
name = "icicle-utils"
version = "0.1.0"
edition = "2021"
authors = [ "Ingonyama" ]
description = "An implementation of the Ingonyama CUDA Library"
homepage = "https://www.ingonyama.com"
repository = "https://github.com/ingonyama-zk/icicle"
[[bench]]
name = "ntt"
path = "benches/ntt.rs"
harness = false
[[bench]]
name = "msm"
path = "benches/msm.rs"
harness = false
[dependencies]
hex = "*"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = "0.3.0"
ark-bls12-377 = "0.3.0"
ark-bn254 = "0.3.0"
rustacuda = "0.1"
rustacuda_core = "0.1"
rustacuda_derive = "0.1"
rand = "*" #TODO: move rand and ark dependencies to dev once random scalar/point generation is done "natively"
[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
[dev-dependencies]
"criterion" = "0.4.0"
[features]
default = ["bls12_381"]
bls12_381 = ["ark-bls12-381/curve"]
g2 = []

Dockerfile (new file)

@@ -0,0 +1,28 @@
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install Golang
ENV GOLANG_VERSION 1.21.1
RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
ENV PATH="/usr/local/go/bin:${PATH}"
# Set the working directory in the container
WORKDIR /app
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]

README.md

@@ -1,150 +1,125 @@
# ICICLE
<div align="center">Icicle is a library for ZK acceleration using CUDA-enabled GPUs.</div>
![image (4)](https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png)
**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
<p align="center">
<img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
</p>
<p align="center">
<a href="https://discord.gg/EVVXTdt6DF">
<img src="https://img.shields.io/discord/1063033227788423299?logo=discord" alt="Chat with us on Discord">
</a>
<a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
<img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
</a>
<img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
</p>
## Background
Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
If we want ZK hardware today we have FPGAs or GPUs which are relatively inexpensive. However, the biggest selling point of GPUs is the software; we talk in particular about CUDA, which makes it easy to write code running on Nvidia GPUs, taking advantage of their highly parallel architecture. Together with the widespread availability of these devices, if we can get GPUs to work on ZK workloads, then we have made a giant step towards accessible and efficient ZK provers.
We believe GPUs are as important for ZK as for AI.
## Zero Knowledge on GPU
- GPUs are a perfect match for ZK compute - around 97% of ZK protocol runtime is parallel by nature.
- GPUs are simple for developers to use and scale compared to other hardware platforms.
- GPUs are extremely competitive in terms of power / performance and price (3x cheaper).
- GPUs are popular and readily available.
ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE currently provides support for MSM, NTT, and ECNTT, with plans to support Hash functions soon.
## Getting Started
### Supported primitives
ICICLE is a CUDA implementation of general functions widely used in ZKP.
- Fields
- Scalars
- Points
- Projective: {x, y, z}
- Affine: {x, y}
- Curves
- [BLS12-381]
- [BLS12-377]
- [BN254]
> [!NOTE]
> Developers: We highly recommend reading our [documentation]
## Build and usage
> [!TIP]
> Try out ICICLE by running some [examples] using ICICLE in C++ and our Rust bindings
> NOTE: [NVCC] is a prerequisite for building.
### Prerequisites
1. Define or select a curve for your application; we've provided a [template][CRV_TEMPLATE] for defining a curve
2. Include the curve in [`curve_config.cuh`][CRV_CONFIG]
3. Now you can build the ICICLE library using nvcc
- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 12.0 or newer.
- [CMake]((https://cmake.org/files/)), version 3.18 and above. Latest version is recommended.
- [GCC](https://gcc.gnu.org/install/download.html) version 9, latest version is recommended.
- Any Nvidia GPU (which supports CUDA Toolkit version 12.0 or above).
```sh
mkdir -p build
nvcc -o build/<ENTER_DIR_NAME> ./icicle/appUtils/ntt/ntt.cu ./icicle/appUtils/msm/msm.cu ./icicle/appUtils/vector_manipulation/ve_mod_mult.cu ./icicle/primitives/projective.cu -lib -arch=native
```
> [!NOTE]
> It is possible to use CUDA 11 for cards which don't support CUDA 12, however we don't officially support this version and in the future there may be issues.
### Testing the CUDA code
### Accessing Hardware
We are using [googletest] library for testing. To build and run [the test suite](./icicle/README.md) for finite field and elliptic curve arithmetic, run from the `icicle` folder:
If you don't have access to a Nvidia GPU we have some options for you.
```sh
mkdir -p build
cmake -S . -B build
cmake --build build
cd build && ctest
```
Check out [Google Colab](https://colab.google/). Google Colab offers a free [T4 GPU](https://www.nvidia.com/en-us/data-center/tesla-t4/) instance and ICICLE can be used with it; reference this guide for setting up your [Google Colab workspace][GOOGLE-COLAB-ICICLE].
### Rust Bindings
If you require more compute and have an interesting research project, we have [bounty and grant programs][GRANT_PROGRAM].
For convenience, we also provide rust bindings to the ICICLE library for the following primitives:
- MSM
- NTT
- Forward NTT
- Inverse NTT
- ECNTT
- Forward ECNTT
- Inverse NTT
- Scalar Vector Multiplication
- Point Vector Multiplication
### Build systems
A custom [build script][B_SCRIPT] is used to compile and link the ICICLE library. The environment variable `ARCH_TYPE` is used to determine which GPU type the library should be compiled for; it defaults to `native` when it is not set, allowing the compiler to detect the installed GPU type.
ICICLE has three build systems.
> NOTE: A GPU must be detectable and therefore installed if the `ARCH_TYPE` is not set.
- [ICICLE core][ICICLE-CORE], C++ and CUDA
- [ICICLE Rust][ICICLE-RUST] bindings, requires [Rust](https://www.rust-lang.org/) version 1.70 and above
- [ICICLE Golang][ICICLE-GO] bindings, requires [Go](https://go.dev/) version 1.20 and above
Once you have your parameters set, run:
ICICLE core always needs to be built as part of the other build systems as it contains the core ICICLE primitives implemented in CUDA. Reference these guides for the different build systems, [ICICLE core guide][ICICLE-CORE-README], [ICICLE Rust guide][ICICLE-RUST-README] and [ICICLE Golang guide][ICICLE-GO-README].
```sh
cargo build --release
```
### Compiling ICICLE
You'll find a release ready library at `target/release/libicicle_utils.rlib`.
Running ICICLE via Rust bindings is highly recommended and simple:
- Clone this repo
- go to our [Rust bindings][ICICLE-RUST]
- Enter a [curve](./wrappers/rust/icicle-curves) implementation
- run `cargo build --release` to build or `cargo test -- --test-threads=1` to build and execute tests
To benchmark and test the functionality available in RUST, run:
In any case, if you want to compile and run the core icicle c++ tests, just follow these steps:
- Clone this repo
- go to [ICICLE core][ICICLE-CORE]
- execute the small [script](https://github.com/ingonyama-zk/icicle/tree/main/icicle#running-tests) to compile via cmake and run c++ and cuda tests
## Docker
We offer a simple Docker container so you can simply run ICICLE without setting everything up locally.
```
cargo bench
cargo test -- --test-threads=1
docker build -t <name_of_your_choice> .
docker run --gpus all -it <name_of_your_choice> /bin/bash
```
The flag `--test-threads=1` is needed because currently some tests might interfere with one another inside the GPU.
### Example Usage
An example of using the Rust bindings library can be found in our [fast-danksharding implementation][FDI]
### Supporting Additional Curves
Supporting additional curves can be done as follows:
Create a JSON file with the curve parameters. The curve is defined by the following parameters:
- ``curve_name`` - e.g. ``bls12_381``.
- ``modolus_p`` - scalar field modulus (in decimal).
- ``bit_count_p`` - number of bits needed to represent `` modolus_p`` .
- ``limb_p`` - number of bytes needed to represent `` modolus_p`` (rounded).
- ``ntt_size`` - log of the maximal size subgroup of the scalar field.
- ``modolus_q`` - base field modulus (in decimal).
- ``bit_count_q`` - number of bits needed to represent `` modolus_q`` .
- ``limb_q`` - number of bytes needed to represent ``modolus_q`` (rounded).
- ``weierstrass_b`` - Weierstrass constant of the curve.
- ``gen_x`` - x-value of a generator element for the curve.
- ``gen_y`` - y-value of a generator element for the curve.
Here's an example for BLS12-381.
```
{
"curve_name" : "bls12_381",
"modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"bit_count_p" : 255,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"bit_count_q" : 381,
"limb_q" : 12,
"weierstrass_b" : 4,
"gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
"gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
}
```
Save the parameters JSON file in the ``curve_parameters`` directory.
Then run the Python script ``new_curve_script_rust.py`` from the main icicle folder:
```
python3 ./curve_parameters/new_curve_script_rust.py ./curve_parameters/bls12_381.json
```
The script does the following:
- Creates a folder in ``icicle/curves`` with the curve name, which contains all of the files needed for the supported operations in CUDA (a sketch of the generated parameters header appears after this list).
- Adds the curve exported operations to ``icicle/curves/index.cu``.
- Creates a file with the curve name in ``src/curves`` with the relevant objects for the curve.
- Creates a test file with the curve name in ``src``.
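For orientation, the parameters header the script generates at ``icicle/curves/<curve_name>/params.cuh`` has roughly the following shape; this is an illustrative excerpt for BLS12-381 based on the script's string templates, with the limb values elided:
```c++
#pragma once
#include "../../utils/storage.cuh"
namespace PARAMS_BLS12_381 {
  struct fp_config {
    static constexpr unsigned limbs_count = 8;            // limb_p from the JSON
    static constexpr storage<limbs_count> modulus = { /* modolus_p as 32-bit limbs */ };
    static constexpr unsigned modulus_bits_count = 255;   // bit_count_p from the JSON
    // ... precomputed multiples of the modulus, the Barrett constant m, one, zero,
    // and omega1..omega32 / omega_inv1..omega_inv32 / inv1..inv32 for NTT sizes up to 2^ntt_size
  };
  struct fq_config { /* same layout for the base field, without the NTT constants */ };
  static constexpr unsigned weierstrass_b = 4;
  struct group_generator { /* generator_x, generator_y as 32-bit limbs */ };
}
```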
Testing the new curve could be done by running the tests in ``tests_curve_name`` (e.g. ``tests_bls12_381``).
## Contributions
Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
If you require more compute and have an interesting research project, we have [bounty and grant programs][GRANT_PROGRAM].
### Development Contributions
If you are changing code, please make sure to change your [git hooks path][HOOKS_DOCS] to the repo's [hooks directory][HOOKS_PATH] by running the following command:
```sh
git config core.hooksPath ./scripts/hooks
```
In case `clang-format` is missing on your system, you can install it using the following command:
```sh
sudo apt install clang-format
```
You will also need to install [codespell](https://github.com/codespell-project/codespell?tab=readme-ov-file#installation) to check for typos.
This will ensure our custom hooks are run and will make it easier to follow our coding guidelines.
### Hall of Fame
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
## Help & Support
For help and support talk to our devs in our discord channel ["ICICLE"](https://discord.gg/EVVXTdt6DF)
## License
@@ -153,13 +128,26 @@ ICICLE is distributed under the terms of the MIT License.
See [LICENSE-MIT][LMIT] for details.
<!-- Begin Links -->
[BLS12-381]: ./icicle/curves/
[BLS12-377]: ./icicle/curves/
[BN254]: ./icicle/curves/
[BW6-671]: ./icicle/curves/
[NVCC]: https://docs.nvidia.com/cuda/#installation-guides
[CRV_TEMPLATE]: ./icicle/curves/curve_template.cuh
[CRV_CONFIG]: ./icicle/curves/curve_config.cuh
[B_SCRIPT]: ./build.rs
[FDI]: https://github.com/ingonyama-zk/fast-danksharding
[LMIT]: ./LICENSE
[DISCORD]: https://discord.gg/Y4SkbDf2Ff
[googletest]: https://github.com/google/googletest/
[HOOKS_DOCS]: https://git-scm.com/docs/githooks
[HOOKS_PATH]: ./scripts/hooks/
[CMAKELISTS]: https://github.com/ingonyama-zk/icicle/blob/f0e6b465611227b858ec4590f4de5432e892748d/icicle/CMakeLists.txt#L28
[GOOGLE-COLAB-ICICLE]: https://dev.ingonyama.com/icicle/colab-instructions
[GRANT_PROGRAM]: https://medium.com/@ingonyama/icicle-for-researchers-grants-challenges-9be1f040998e
[ICICLE-CORE]: ./icicle/
[ICICLE-RUST]: ./wrappers/rust/
[ICICLE-GO]: ./wrappers/golang/
[ICICLE-CORE-README]: ./icicle/README.md
[ICICLE-RUST-README]: ./wrappers/rust/README.md
[ICICLE-GO-README]: ./wrappers/golang/README.md
[documentation]: https://dev.ingonyama.com/icicle/overview
[examples]: ./examples/
<!-- End Links -->


@@ -1,50 +0,0 @@
extern crate criterion;
use criterion::{criterion_group, criterion_main, Criterion};
use icicle_utils::{set_up_scalars, generate_random_points, commit_batch, get_rng, field::BaseField};
#[cfg(feature = "g2")]
use icicle_utils::{commit_batch_g2, field::ExtensionField};
use rustacuda::prelude::*;
const LOG_MSM_SIZES: [usize; 1] = [12];
const BATCH_SIZES: [usize; 2] = [128, 256];
fn bench_msm(c: &mut Criterion) {
let mut group = c.benchmark_group("MSM");
for log_msm_size in LOG_MSM_SIZES {
for batch_size in BATCH_SIZES {
let msm_size = 1 << log_msm_size;
let (scalars, _, _) = set_up_scalars(msm_size, 0, false);
let batch_scalars = vec![scalars; batch_size].concat();
let mut d_scalars = DeviceBuffer::from_slice(&batch_scalars[..]).unwrap();
let points = generate_random_points::<BaseField>(msm_size, get_rng(None));
let batch_points = vec![points; batch_size].concat();
let mut d_points = DeviceBuffer::from_slice(&batch_points[..]).unwrap();
#[cfg(feature = "g2")]
let g2_points = generate_random_points::<ExtensionField>(msm_size, get_rng(None));
#[cfg(feature = "g2")]
let g2_batch_points = vec![g2_points; batch_size].concat();
#[cfg(feature = "g2")]
let mut d_g2_points = DeviceBuffer::from_slice(&g2_batch_points[..]).unwrap();
group.sample_size(30).bench_function(
&format!("MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch(&mut d_points, &mut d_scalars, batch_size))
);
#[cfg(feature = "g2")]
group.sample_size(10).bench_function(
&format!("G2 MSM of size 2^{} in batch {}", log_msm_size, batch_size),
|b| b.iter(|| commit_batch_g2(&mut d_g2_points, &mut d_scalars, batch_size))
);
}
}
}
criterion_group!(msm_benches, bench_msm);
criterion_main!(msm_benches);


@@ -1,33 +0,0 @@
extern crate criterion;
use criterion::{criterion_group, criterion_main, Criterion};
use icicle_utils::{interpolate_scalars_batch, interpolate_points_batch, set_up_scalars, set_up_points};
const LOG_NTT_SIZES: [usize; 1] = [15];
const BATCH_SIZES: [usize; 2] = [8, 16];
fn bench_ntt(c: &mut Criterion) {
let mut group = c.benchmark_group("NTT");
for log_ntt_size in LOG_NTT_SIZES {
for batch_size in BATCH_SIZES {
let ntt_size = 1 << log_ntt_size;
let (_, mut d_evals, mut d_domain) = set_up_scalars(ntt_size * batch_size, log_ntt_size, true);
let (_, mut d_points_evals, _) = set_up_points(ntt_size * batch_size, log_ntt_size, true);
group.sample_size(100).bench_function(
&format!("Scalar NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_scalars_batch(&mut d_evals, &mut d_domain, batch_size))
);
group.sample_size(10).bench_function(
&format!("EC NTT of size 2^{} in batch {}", log_ntt_size, batch_size),
|b| b.iter(|| interpolate_points_batch(&mut d_points_evals, &mut d_domain, batch_size))
);
}
}
}
criterion_group!(ntt_benches, bench_ntt);
criterion_main!(ntt_benches);


@@ -1,29 +0,0 @@
use std::env;
fn main() {
//TODO: check cargo features selected
//TODO: can conflict/duplicate with make ?
println!("cargo:rerun-if-env-changed=CXXFLAGS");
println!("cargo:rerun-if-changed=./icicle");
let arch_type = env::var("ARCH_TYPE").unwrap_or(String::from("native"));
let mut arch = String::from("-arch=");
arch.push_str(&arch_type);
let mut nvcc = cc::Build::new();
println!("Compiling icicle library using arch: {}", &arch);
if cfg!(feature = "g2") {
nvcc.define("G2_DEFINED", None);
}
nvcc.cuda(true);
nvcc.debug(false);
nvcc.flag(&arch);
nvcc.files([
"./icicle/curves/index.cu",
]);
nvcc.compile("ingo_icicle"); //TODO: extension??
}


@@ -1,13 +0,0 @@
{
"curve_name" : "bls12_377",
"modolus_p" : 8444461749428370424248824938781546531375899335154063827935233455917409239041,
"bit_count_p" : 253,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177,
"bit_count_q" : 377,
"limb_q" : 12,
"weierstrass_b" : 1,
"gen_x" : 81937999373150964239938255573465948239988671502647976594219695644855304257327692006745978603320413799295628339695,
"gen_y" : 241266749859715473739788878240585681733927191168601896383759122102112907357779751001206799952863815012735208165030
}


@@ -1,13 +0,0 @@
{
"curve_name" : "bls12_381",
"modolus_p" : 52435875175126190479447740508185965837690552500527637822603658699938581184513,
"bit_count_p" : 255,
"limb_p" : 8,
"ntt_size" : 32,
"modolus_q" : 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787,
"bit_count_q" : 381,
"limb_q" : 12,
"weierstrass_b" : 4,
"gen_x" : 3685416753713387016781088315183077757961620795782546409894578378688607592378376318836054947676345821548104185464507,
"gen_y" : 1339506544944476473020471379941921221584933875938349620426543736416511423956333506472724655353366534992391756441569
}


@@ -1,13 +0,0 @@
{
"curve_name" : "bn254",
"modolus_p" : 21888242871839275222246405745257275088548364400416034343698204186575808495617,
"bit_count_p" : 254,
"limb_p" : 8,
"ntt_size" : 16,
"modolus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583,
"bit_count_q" : 254,
"limb_q" : 8,
"weierstrass_b" : 3,
"gen_x" : 1,
"gen_y" : 2
}


@@ -1,203 +0,0 @@
import json
import math
import os
from sympy.ntheory import isprime, primitive_root
import subprocess
import random
import sys
data = None
with open(sys.argv[1]) as json_file:
data = json.load(json_file)
curve_name = data["curve_name"]
modolus_p = data["modolus_p"]
bit_count_p = data["bit_count_p"]
limb_p = data["limb_p"]
ntt_size = data["ntt_size"]
modolus_q = data["modolus_q"]
bit_count_q = data["bit_count_q"]
limb_q = data["limb_q"]
weierstrass_b = data["weierstrass_b"]
gen_x = data["gen_x"]
gen_y = data["gen_y"]
def to_hex(val, length):
x = str(hex(val))[2:]
if len(x) % 8 != 0:
x = "0" * (8-len(x) % 8) + x
if len(x) != length:
x = "0" * (length-len(x)) + x
n = 8
chunks = [x[i:i+n] for i in range(0, len(x), n)][::-1]
s = ""
for c in chunks:
s += "0x" + c + ", "
return s
def get_root_of_unity(order: int) -> int:
assert (modolus_p - 1) % order == 0
return pow(5, (modolus_p - 1) // order, modolus_p)
def create_field_parameters_struct(modulus, modulus_bits_count,limbs,ntt,size,name):
s = " struct "+name+"{\n"
s += " static constexpr unsigned limbs_count = " + str(limbs)+";\n"
s += " static constexpr storage<limbs_count> modulus = {"+to_hex(modulus,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<limbs_count> modulus_2 = {"+to_hex(modulus*2,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<limbs_count> modulus_4 = {"+to_hex(modulus*4,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_wide = {"+to_hex(modulus,8*limbs*2)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared = {"+to_hex(modulus*modulus,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared_2 = {"+to_hex(modulus*modulus*2,8*limbs)[:-2]+"};\n"
s += " static constexpr storage<2*limbs_count> modulus_sqared_4 = {"+to_hex(modulus*modulus*2*2,8*limbs)[:-2]+"};\n"
s += " static constexpr unsigned modulus_bits_count = "+str(modulus_bits_count)+";\n"
m = int(math.floor(int(pow(2,2*modulus_bits_count) // modulus)))
s += " static constexpr storage<limbs_count> m = {"+ to_hex(m,8*limbs)[:-2] +"};\n"
s += " static constexpr storage<limbs_count> one = {"+ to_hex(1,8*limbs)[:-2] +"};\n"
s += " static constexpr storage<limbs_count> zero = {"+ to_hex(0,8*limbs)[:-2] +"};\n"
if ntt:
for k in range(size):
omega = get_root_of_unity(int(pow(2,k+1)))
s += " static constexpr storage<limbs_count> omega"+str(k+1)+"= {"+ to_hex(omega,8*limbs)[:-2]+"};\n"
for k in range(size):
omega = get_root_of_unity(int(pow(2,k+1)))
s += " static constexpr storage<limbs_count> omega_inv"+str(k+1)+"= {"+ to_hex(pow(omega, -1, modulus),8*limbs)[:-2]+"};\n"
for k in range(size):
s += " static constexpr storage<limbs_count> inv"+str(k+1)+"= {"+ to_hex(pow(int(pow(2,k+1)), -1, modulus),8*limbs)[:-2]+"};\n"
s+=" };\n"
return s
def create_gen():
s = " struct group_generator {\n"
s += " static constexpr storage<fq_config::limbs_count> generator_x = {"+to_hex(gen_x,8*limb_q)[:-2]+ "};\n"
s += " static constexpr storage<fq_config::limbs_count> generator_y = {"+to_hex(gen_y,8*limb_q)[:-2]+ "};\n"
s+=" };\n"
return s
def get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b):
file_content = ""
file_content += "#pragma once\n#include \"../../utils/storage.cuh\"\n"
file_content += "namespace PARAMS_"+curve_name.upper()+"{\n"
file_content += create_field_parameters_struct(modolus_p,bit_count_p,limb_p,True,ntt_size,"fp_config")
file_content += create_field_parameters_struct(modolus_q,bit_count_q,limb_q,False,0,"fq_config")
file_content += " static constexpr unsigned weierstrass_b = " + str(weierstrass_b)+ ";\n"
file_content += create_gen()
file_content+="}\n"
return file_content
# Create Cuda interface
newpath = "./icicle/curves/"+curve_name
if not os.path.exists(newpath):
os.makedirs(newpath)
fc = get_config_file_content(modolus_p, bit_count_p, limb_p, ntt_size, modolus_q, bit_count_q, limb_q, weierstrass_b)
text_file = open("./icicle/curves/"+curve_name+"/params.cuh", "w")
n = text_file.write(fc)
text_file.close()
with open("./icicle/curves/curve_template/lde.cu", "r") as lde_file:
content = lde_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle/curves/"+curve_name+"/lde.cu", "w")
n = text_file.write(content)
text_file.close()
with open("./icicle/curves/curve_template/msm.cu", "r") as msm_file:
content = msm_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle/curves/"+curve_name+"/msm.cu", "w")
n = text_file.write(content)
text_file.close()
with open("./icicle/curves/curve_template/ve_mod_mult.cu", "r") as ve_mod_mult_file:
content = ve_mod_mult_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./icicle/curves/"+curve_name+"/ve_mod_mult.cu", "w")
n = text_file.write(content)
text_file.close()
namespace = '#include "params.cuh"\n'+'''namespace CURVE_NAME_U {
typedef Field<PARAMS_CURVE_NAME_U::fp_config> scalar_field_t;\
typedef scalar_field_t scalar_t;\
typedef Field<PARAMS_CURVE_NAME_U::fq_config> point_field_t;
typedef Projective<point_field_t, scalar_field_t, PARAMS_CURVE_NAME_U::group_generator, PARAMS_CURVE_NAME_U::weierstrass_b> projective_t;
typedef Affine<point_field_t> affine_t;
}'''
with open('./icicle/curves/'+curve_name+'/curve_config.cuh', 'w') as f:
f.write(namespace.replace("CURVE_NAME_U",curve_name.upper()))
eq = '''
#include <cuda.h>\n
#include "curve_config.cuh"\n
#include "../../primitives/projective.cuh"\n
extern "C" bool eq_CURVE_NAME_L(CURVE_NAME_U::projective_t *point1, CURVE_NAME_U::projective_t *point2)
{
return (*point1 == *point2);
}'''
with open('./icicle/curves/'+curve_name+'/projective.cu', 'w') as f:
f.write(eq.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
supported_operations = '''
#include "projective.cu"
#include "lde.cu"
#include "msm.cu"
#include "ve_mod_mult.cu"
'''
with open('./icicle/curves/'+curve_name+'/supported_operations.cu', 'w') as f:
f.write(supported_operations.replace("CURVE_NAME_U",curve_name.upper()).replace("CURVE_NAME_L",curve_name.lower()))
with open('./icicle/curves/index.cu', 'a') as f:
f.write('\n#include "'+curve_name.lower()+'/supported_operations.cu"')
# Create Rust interface and tests
if limb_p == limb_q:
with open("./src/curve_templates/curve_same_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
text_file = open("./src/curves/"+curve_name+".rs", "w")
n = text_file.write(content)
text_file.close()
else:
with open("./src/curve_templates/curve_different_limbs.rs", "r") as curve_file:
content = curve_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
content = content.replace("_limbs_p",str(limb_p * 8 * 4))
content = content.replace("limbs_p",str(limb_p))
content = content.replace("_limbs_q",str(limb_q * 8 * 4))
content = content.replace("limbs_q",str(limb_q))
text_file = open("./src/curves/"+curve_name+".rs", "w")
n = text_file.write(content)
text_file.close()
with open("./src/curve_templates/test.rs", "r") as test_file:
content = test_file.read()
content = content.replace("CURVE_NAME_U",curve_name.upper())
content = content.replace("CURVE_NAME_L",curve_name.lower())
text_file = open("./src/test_"+curve_name+".rs", "w")
n = text_file.write(content)
text_file.close()
with open('./src/curves/mod.rs', 'a') as f:
f.write('\n pub mod ' + curve_name + ';')
with open('./src/lib.rs', 'a') as f:
f.write('\npub mod ' + curve_name + ';')

23
examples/ZKContainer.md Normal file

@@ -0,0 +1,23 @@
# ZKContainer
We recommend using [ZKContainer](https://ingonyama.com/blog/Immanuel-ZKDC), where we have already preinstalled all the required dependencies, to run Icicle examples.
To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`.
- File `Dockerfile` specifies how to build an image of a ZKContainer.
- File `devcontainer.json` enables running ZKContainer from Visual Studio Code.
## Running ZKContainer from shell
```sh
docker build -t icicle-example-poseidon -f .devcontainer/Dockerfile .
```
To run the example interactively, start the container
```sh
docker run -it --rm --gpus all -v .:/icicle-example icicle-example-poseidon
```
Inside the container, run the commands for building the library for whichever [build system](../README.md#build-systems) you choose to use.


@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /opt/icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,21 @@
{
"name": "Icicle Examples: msm",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python"
]
}
}
}


@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,52 @@
# Icicle example: Multi-Scalar Multiplication (MSM)
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides a CUDA C++ template function `MSM` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Concise Usage Explanation
1. Select the curve
2. Include an MSM template
3. Configure MSM
4. Call the template
```c++
#define CURVE_ID 1
#include "icicle/appUtils/msm/msm.cu"
...
msm::MSMConfig config = {...};
...
msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, size, config, &result);
```
In this example we use `BN254` curve (`CURVE_ID=1`). The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
**Parameters:**
The configuration is passed to the kernel as a structure of type `msm::MSMConfig`. Some of the most important fields are listed below:
- `are_scalars_on_device`, `are_points_on_device`, `are_results_on_device`: location of the data
- `is_async`: blocking vs. non-blocking kernel call
- `large_bucket_factor`: distinguishes between large and normal bucket sizes. If the scalar distribution is heavily skewed towards a few values, those buckets can be processed separately from the rest. The ideal value varies by circuit (it depends on the distribution of scalars); start with 10 and adjust to see if it improves performance (see the sketch below).
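As a sketch of how these fields are typically set (the member names follow the `msm::MSMConfig` initialization in `example.cu` below; the values are only illustrative), switching from on-host to on-device data is just a matter of flipping the location flags on an existing config:
```c++
// Sketch: reuse a configuration like the one built in example.cu and
// flip the data-location flags once the inputs have been copied to the GPU.
msm::MSMConfig config = { /* ... defaults as in example.cu ... */ };
config.large_bucket_factor = 10;       // starting point; tune per scalar distribution
config.batch_size = 1;                 // number of MSMs computed in one call
config.are_scalars_on_device = true;   // scalars were copied with cudaMemcpy
config.are_points_on_device = true;    // points were copied with cudaMemcpy
config.are_results_on_device = true;   // leave the result on the GPU
config.is_async = false;               // block until the kernel completes
```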
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the parameters of MSM
2. Generate random inputs on-host
3. Configure and execute MSM using on-host data
4. Copy inputs on-device
5. Configure and execute MSM using on-device data
6. Repeat the above steps for G2 points

9
examples/c++/msm/compile.sh Executable file

@@ -0,0 +1,9 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

180
examples/c++/msm/example.cu Normal file

@@ -0,0 +1,180 @@
#include <fstream>
#include <iostream>
#include <iomanip>
#define G2_DEFINED
#define CURVE_ID 1
// include MSM template
#include "appUtils/msm/msm.cu"
using namespace curve_config;
int main(int argc, char* argv[])
{
std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
int batch_size = 1;
std::cout << "Batch size: " << batch_size << std::endl;
unsigned msm_size = 1048576;
std::cout << "MSM size: " << msm_size << std::endl;
int N = batch_size * msm_size;
std::cout << "Part I: use G1 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
projective_t result;
scalar_t::RandHostMany(scalars, N);
projective_t::RandHostManyAffine(points, N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
// auto config = msm::DefaultMSMConfig();
device_context::DeviceContext ctx = device_context::get_default_device_context();
msm::MSMConfig config = {
ctx, // ctx
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
false, // are_results_on_device
false, // is_big_triangle
false, // is_async
};
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
// Create two events to time the MSM kernel
cudaStream_t stream = config.ctx.stream;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on the stream
cudaEventRecord(start, stream);
// Execute the MSM kernel
msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, &result);
// Record the stop event on the stream
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
// Calculate the elapsed time between the start and stop events
cudaEventElapsedTime(&time, start, stop);
// Destroy the events
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Print the elapsed time
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
// Print the result
std::cout << projective_t::to_affine(result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
scalar_t* scalars_d;
affine_t* points_d;
projective_t* result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&points_d, sizeof(affine_t) * N);
cudaMalloc(&result_d, sizeof(projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
// Create two events to time the MSM kernel
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on the stream
cudaEventRecord(start, stream);
// Execute the MSM kernel
msm::MSM<scalar_t, affine_t, projective_t>(scalars_d, points_d, msm_size, config, result_d);
// Record the stop event on the stream
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
// Calculate the elapsed time between the start and stop events
cudaEventElapsedTime(&time, start, stop);
// Destroy the events
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Print the elapsed time
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
// Copy the result back to the host
cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
// Print the result
std::cout << projective_t::to_affine(result) << std::endl;
// Free the device memory
cudaFree(scalars_d);
cudaFree(points_d);
cudaFree(result_d);
// Free the host memory, keep scalars for G2 example
delete[] points;
std::cout << "Part II: use G2 points" << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
g2_affine_t* g2_points = new g2_affine_t[N];
g2_projective_t::RandHostManyAffine(g2_points, N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_t g2_result;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, stream);
msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, g2_points, msm_size, config, &g2_result);
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
g2_affine_t* g2_points_d;
g2_projective_t* g2_result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);
std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
config.are_results_on_device = true;
config.are_scalars_on_device = true;
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, stream);
msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars_d, g2_points_d, msm_size, config, g2_result_d);
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
cudaFree(scalars_d);
cudaFree(g2_points_d);
cudaFree(g2_result_d);
delete[] g2_points;
delete[] scalars;
cudaStreamDestroy(stream);
return 0;
}

2
examples/c++/msm/run.sh Executable file

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example


@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,52 @@
# Icicle example: using multiple GPUs to hash a large dataset
## Best-Practices
This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running it first.
## Key-Takeaway
Use a `device_context::DeviceContext` variable to select the GPU to use.
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
## Concise Usage Explanation
1. Include c++ threads
```c++
#include <thread>
```
2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
```c++
void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
```
3. Initialize device contexts for different GPUs
```c++
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
```
4. Finally, spawn the threads and wait for their completion
```c++
std::thread thread0(threadPoseidon, ctx0, ...);
std::thread thread1(threadPoseidon, ctx1, ...);
thread0.join();
thread1.join();
```
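Putting the pieces together, the thread function from `example.cu` below binds its thread to the GPU carried by `ctx` before launching the hash; a condensed sketch (flag values mirror the example):
```c++
const int size_col = 11;  // arity of the column hash, as in the example
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition,
                    scalar_t* layers, scalar_t* column_hashes,
                    PoseidonConstants<scalar_t>* constants)
{
  cudaSetDevice(ctx.device_id);  // crucial: bind this host thread to its own GPU
  PoseidonConfig config = {ctx, false, false, false, false, false, false};  // host data, synchronous
  poseidon_hash<scalar_t, size_col + 1>(layers, column_hashes, (size_t)size_partition, *constants, config);
}
```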
## What's in the example
This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one for each column of an $11 \times 2^{30}$ matrix.
1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
2. Hash two partitions in parallel on two GPUs
3. Hash two partitions in series on one GPU
4. Compare execution times


@@ -0,0 +1,9 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


@@ -0,0 +1,148 @@
#include <iostream>
#include <thread>
#include <chrono>
#include <nvml.h>
// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"
using namespace poseidon;
using namespace curve_config;
void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
}
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
checkCudaError(err);
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;
nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;
for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}
const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}
scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return 1;
}
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;
nvmlShutdown();
return 0;
}


@@ -0,0 +1,2 @@
#!/bin/bash
./build/example


@@ -0,0 +1,23 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use NVIDIA base image
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
nsight-systems-12.2 \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,24 @@
{
"name": "Icicle Examples - Multiply",
"build": {
"dockerfile": "Dockerfile"
},
"workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-example,type=bind",
"workspaceFolder": "/icicle-example",
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"ms-vscode.cpptools-extension-pack"
]
}
}
}


@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,41 @@
# Icicle example: Multiplication
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
## Concise Usage Explanation
Define a `CURVE_ID` and include curve configuration header:
```c++
#define CURVE_ID 1
#include "curves/curve_config.cuh"
```
The values of `CURVE_ID` for different curves are listed in the header above. Multiplication is accelerated both for scalar field and point field elements.
```c++
using namespace curve_config;
scalar_t a;
point_field_t b;
```
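Element-wise multiplication of full vectors goes through the `vec_ops` API; a condensed sketch of what `example.cu` below does, assuming `device_in1`, `device_in2` and `device_out` are device pointers of `vector_size` elements already allocated and filled with `cudaMalloc`/`cudaMemcpy`:
```c++
#include "utils/vec_ops.cu"
// Sketch: multiply two device-resident vectors element-wise; the result stays on the GPU.
vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = vec_ops::Mul<scalar_t>(device_in1, device_in2, vector_size, config, device_out);
// check err against cudaSuccess, as done in example.cu
```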
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the parameters for the example such as vector size
2. Generate random vectors on-host
3. Copy them on-device
4. Execute element-wise vector multiplication on-device
5. Copy results on-host


@@ -0,0 +1,9 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


@@ -0,0 +1,163 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <nvml.h>
#define CURVE_ID 1
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "utils/vec_ops.cu"
using namespace curve_config;
typedef scalar_t T;
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elements, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = vec_ops::Mul<T>(vec_a, vec_b, n_elements, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
}
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 15;
const unsigned repetitions = 1 << 15;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector multiplications" << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
unsigned baseline_temperature;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// host data
T* host_in1 = (T*)malloc(vector_size * sizeof(T));
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with random data" << std::endl;
T::RandHostMany(host_in1, vector_size);
T::RandHostMany(host_in2, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
// copy from host to device
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
std::cout << "Starting benchmarking" << std::endl;
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
<< " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
}
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
<< " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
// Optional: validate multiplication
T* host_out = (T*)malloc(vector_size * sizeof(T));
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
// validate multiplication here...
// clean up and exit
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

2
examples/c++/multiply/run.sh Executable file

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example


@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,22 @@
{
"name": "Icicle Examples: ntt",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-vscode.cpptools"
]
}
}
}


@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,38 @@
# Icicle example: Number-Theoretical Transform (NTT)
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides a CUDA C++ template function `NTT` for the [Number Theoretical Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), a finite-field analogue of the Discrete Fourier Transform.
## Concise Usage Explanation
```c++
// Select the curve
#define CURVE_ID 1
// Include NTT template
#include "appUtils/ntt/ntt.cu"
using namespace curve_config;
using namespace ntt;
// Configure NTT
NTTConfig<S> config=DefaultNTTConfig<S>();
// Call NTT
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
```
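One step the snippet above leaves implicit is initializing the twiddle-factor domain, which `example.cu` below does once before the first NTT call; a sketch using the same calls:
```c++
// Sketch: build the NTT domain once, sized by the largest NTT you plan to run.
auto ctx = device_context::get_default_device_context();
const S basic_root = S::omega(log_ntt_size);  // log_ntt_size = log2 of the maximal NTT size
InitDomain(basic_root, ctx);                  // precomputes the twiddle factors on the GPU
// ... afterwards NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output) can be called
```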
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## What's in the example
1. Define the size of the example
2. Initialize input
3. Run Radix2 NTT (see the sketch after this list)
4. Run MixedRadix NTT
5. Validate the data output
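Steps 3 and 4 differ only in which algorithm is selected on the config; a sketch of the switch, following `example.cu` below (it reuses `config`, `input` and `output` from the snippet above):
```c++
// Sketch: run the same forward NTT with both algorithms and compare runtimes.
config.batch_size = nof_ntts;                 // two NTTs batched together in the example
config.ntt_algorithm = NttAlgorithm::Radix2;
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
config.ntt_algorithm = NttAlgorithm::MixedRadix;
NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
```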

11
examples/c++/ntt/compile.sh Executable file

@@ -0,0 +1,11 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

114
examples/c++/ntt/example.cu Normal file

@@ -0,0 +1,114 @@
#include <chrono>
#include <iostream>
// select the curve
#define CURVE_ID 1
// include NTT template
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
using namespace curve_config;
using namespace ntt;
// Operate on scalars
typedef scalar_t S;
typedef scalar_t E;
void print_elements(const unsigned n, E* elements)
{
for (unsigned i = 0; i < n; i++) {
std::cout << i << ": " << elements[i] << std::endl;
}
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
// Lowest Harmonics
for (unsigned i = 0; i < ntt_size; i = i + 1) {
elements[i] = E::one();
}
// print_elements(ntt_size, elements );
// Highest Harmonics
for (unsigned i = 1 * ntt_size; i < 2 * ntt_size; i = i + 2) {
elements[i] = E::one();
elements[i + 1] = E::neg(scalar_t::one());
}
// print_elements(ntt_size, &elements[1*ntt_size] );
}
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
{
int nof_errors = 0;
E amplitude = E::from((uint32_t)ntt_size);
// std::cout << "Amplitude: " << amplitude << std::endl;
// Lowest Harmonics
if (elements[0] != amplitude) {
++nof_errors;
std::cout << "Error in lowest harmonics 0! " << std::endl;
// print_elements(ntt_size, elements );
} else {
std::cout << "Validated lowest harmonics" << std::endl;
}
// Highest Harmonics
if (elements[1 * ntt_size + ntt_size / 2] != amplitude) {
++nof_errors;
std::cout << "Error in highest harmonics! " << std::endl;
// print_elements(ntt_size, &elements[1*ntt_size] );
} else {
std::cout << "Validated highest harmonics" << std::endl;
}
return nof_errors;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
std::cout << "Example parameters" << std::endl;
const unsigned log_ntt_size = 20;
std::cout << "Log2(NTT size): " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "NTT size: " << ntt_size << std::endl;
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;
std::cout << "Generating input data for lowest and highest harmonics" << std::endl;
E* input;
input = (E*)malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, input);
E* output;
output = (E*)malloc(sizeof(E) * batch_size);
std::cout << "Running NTT with on-host data" << std::endl;
// Create a device context
auto ctx = device_context::get_default_device_context();
const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
InitDomain(basic_root, ctx);
// Create an NTTConfig instance
NTTConfig<S> config = DefaultNTTConfig<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;
validate_output(ntt_size, nof_ntts, output);
std::cout << "Cleaning-up memory" << std::endl;
free(input);
free(output);
return 0;
}

2
examples/c++/ntt/run.sh Executable file

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example


@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,22 @@
{
"name": "Icicle Examples: polynomial multiplication",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-vscode.cpptools"
]
}
}
}


@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,11 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


@@ -0,0 +1,124 @@
#define CURVE_ID BLS12_381
#include <chrono>
#include <iostream>
#include <vector>
#include "curves/curve_config.cuh"
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
#include "utils/vec_ops.cu"
#include "utils/error_handler.cuh"
#include <memory>
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() * test_scalar::omega(4) : test_scalar::zero();
}
}
// calculating polynomial multiplication A*B via NTT, pointwise multiplication and INTT
// (1) allocate A,B on CPU. Randomize first half, zero second half
// (2) allocate NttAGpu, NttBGpu on GPU
// (3) calc NTT for A and for B from cpu to GPU
// (4) multiply MulGpu = NttAGpu * NttBGpu (pointwise)
// (5) INTT MulGpu inplace
int main(int argc, char** argv)
{
cudaEvent_t start, stop;
float measured_time;
int NTT_LOG_SIZE = 23;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
// init domain
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
const bool is_radix2_alg = (argc > 1) ? atoi(argv[1]) : false;
ntt_config.ntt_algorithm = is_radix2_alg ? ntt::NttAlgorithm::Radix2 : ntt::NttAlgorithm::MixedRadix;
const char* ntt_alg_str = is_radix2_alg ? "Radix-2" : "Mixed-Radix";
std::cout << "Polynomial multiplication with " << ntt_alg_str << " NTT: ";
CHK_IF_RETURN(cudaEventCreate(&start));
CHK_IF_RETURN(cudaEventCreate(&stop));
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx, true /*=fast_twiddles_mode*/);
// (1) cpu allocation
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
auto CpuB = std::make_unique<test_data[]>(NTT_SIZE);
random_samples(CpuA.get(), NTT_SIZE >> 1); // second half zeros
random_samples(CpuB.get(), NTT_SIZE >> 1); // second half zeros
test_data *GpuA, *GpuB, *MulGpu;
auto benchmark = [&](bool print, int iterations = 1) {
// start recording
CHK_IF_RETURN(cudaEventRecord(start, ntt_config.ctx.stream));
for (int iter = 0; iter < iterations; ++iter) {
// (2) gpu input allocation
CHK_IF_RETURN(cudaMallocAsync(&GpuA, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&GpuB, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
// (3) NTT for A,B from cpu to gpu
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kNM;
CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig<test_data> config {
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false, // is_montgomery
false // is_async
};
CHK_IF_RETURN(
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kMN;
CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(MulGpu, ntt_config.ctx.stream));
}
CHK_IF_RETURN(cudaEventRecord(stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&measured_time, start, stop));
if (print) { std::cout << measured_time / iterations << " ms" << std::endl; }
return CHK_LAST();
};
benchmark(false); // warmup
benchmark(true, 20);
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
return 0;
}

View File

@@ -0,0 +1,3 @@
#!/bin/bash
./build/example 1 # radix2
./build/example 0 # mixed-radix

View File

@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -0,0 +1,72 @@
# Icicle example: build a Merkle tree using Poseidon hash
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save time and avoid manual setup.
## Key-Takeaway
`Icicle` provides the CUDA C++ template `poseidon_hash` to accelerate the popular [Poseidon hash function](https://www.poseidon-hash.info/).
## Concise Usage Explanation
```c++
#include "appUtils/poseidon/poseidon.cu"
...
poseidon_hash<scalar_t, arity+1>(input, output, n, constants, config);
```
**Parameters:**
- **`scalar_t`:** the scalar field of the selected curve.
You can think of the field's elements as 32-byte integers modulo `p`, where `p` is a prime specific to this field.
- **arity:** number of elements in a hashed block.
- **n:** number of blocks we hash in parallel.
- **input, output:** `scalar_t` arrays of sizes $arity \cdot n$ and $n$, respectively.
- **constants:** initialized as shown below:
```c++
device_context::DeviceContext ctx= device_context::get_default_device_context();
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(ctx, &constants);
```
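Putting the pieces together, a minimal end-to-end call might look like the sketch below. It follows the signatures used in the accompanying `example.cu`; the arity, block count and buffer contents are placeholders, so treat it as an illustration rather than a drop-in program.
```c++
#define CURVE_ID 2
#include <vector>
#include "appUtils/poseidon/poseidon.cu"
using namespace poseidon;
using namespace curve_config;

int main()
{
  const int arity = 2; // number of field elements per hashed block (placeholder)
  const int n = 1024;  // number of blocks hashed in parallel (placeholder)
  // host buffers: arity*n inputs, n outputs
  std::vector<scalar_t> input(arity * n, scalar_t::zero());
  std::vector<scalar_t> output(n);
  // constants and configuration for the chosen arity
  device_context::DeviceContext ctx = device_context::get_default_device_context();
  PoseidonConstants<scalar_t> constants;
  init_optimized_poseidon_constants<scalar_t>(arity, ctx, &constants);
  PoseidonConfig config = default_poseidon_config<scalar_t>(arity + 1);
  // hash n blocks of `arity` elements each; output[i] is the hash of block i
  poseidon_hash<scalar_t, arity + 1>(input.data(), output.data(), n, constants, config);
  return 0;
}
```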
## What's in the example
1. Define the size of the example: the height of the full binary Merkle tree.
2. Hash blocks in parallel. The tree width determines the number of blocks to hash.
3. Build a Merkle tree from the hashes.
4. Use the tree to generate a membership proof for one of the computed hashes.
5. Validate the hash membership.
6. Tamper with the hash.
7. Invalidate the membership of the tampered hash.
## Details
### Merkle tree structure
Our Merkle tree is a **full binary tree** stored in a 1D array.
The tree nodes are stored following a level-first traversal of the binary tree.
Within a given level, we use an offset to number nodes from left to right, so the node at `level` with `offset` is stored at array index `2^level - 1 + offset`. The node numbers in the figure below correspond to their locations in the array.
```text
Tree Level
0 0
/ \
1 2 1
/ \ / \
3 4 5 6 2
1D array representation: {0, 1, 2, 3, 4, 5, 6}
```
### Membership proof structure
We use two arrays, consumed level by level when recomputing the root (see the sketch below):
- the position (left/right) of the node along the path toward the root
- the hash of the sibling node, i.e. the other node sharing the same parent
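A condensed sketch of how these two arrays are consumed, mirroring `validate_proof` in the accompanying `example.cu`. It assumes `proof_lr`, `proof_hash`, the leaf hash, `tree_height` and the Poseidon `constants`/`config` from that example are already in scope.
```c++
// walk from the leaf level up; proof_hash[0] holds the expected root
scalar_t level_hash = leaf_hash;
for (uint32_t level = tree_height - 1; level > 0; level--) {
  scalar_t in[2];
  if (proof_lr[level] == 0) {   // our node is the left child
    in[0] = level_hash;
    in[1] = proof_hash[level];  // sibling on the right
  } else {                      // our node is the right child
    in[0] = proof_hash[level];
    in[1] = level_hash;
  }
  // same 2-to-1 Poseidon hash used to build the tree
  poseidon_hash<scalar_t, 2 + 1>(in, &level_hash, 1, constants, config);
}
bool is_member = (level_hash == proof_hash[0]);
```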

View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -0,0 +1,152 @@
#include <chrono>
#include <fstream>
#include <iostream>
// select the curve
#define CURVE_ID 2
// include Poseidon template
#include "appUtils/poseidon/poseidon.cu"
using namespace poseidon;
using namespace curve_config;
device_context::DeviceContext ctx= device_context::get_default_device_context();
// location of a tree node in the array for a given level and offset
inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }
// We assume the tree has leaves already set, compute all other levels
void build_tree(
const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t> * constants, PoseidonConfig config)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
poseidon_hash<scalar_t, 2+1>(
&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, *constants, config);
}
}
// linear search leaves for a given hash, return offset
uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_height)
{
const uint32_t tree_width = (1 << (tree_height - 1));
for (uint32_t i = 0; i < tree_width; i++) {
const scalar_t leaf = tree[tree_index(tree_height - 1, i)];
if (leaf == query) {
return i; // found the hash
}
}
return tree_height; // hash not found
}
void generate_proof(
uint32_t position,
scalar_t* tree,
const uint32_t tree_height,
uint32_t* proof_lr,
scalar_t* proof_hash)
{
uint32_t level_index = position;
for (uint32_t level = tree_height - 1; level > 0; level--) {
uint32_t lr;
uint32_t neighbour_index;
lr = level_index % 2;
if (lr == 0) {
// left
neighbour_index = level_index + 1;
} else {
// right
neighbour_index = level_index - 1;
}
proof_lr[level] = lr;
proof_hash[level] = tree[tree_index(level, neighbour_index)];
level_index /= 2;
}
// the proof must match this:
proof_hash[0] = tree[tree_index(0, 0)];
}
uint32_t validate_proof(
const scalar_t hash,
const uint32_t tree_height,
const uint32_t* proof_lr,
const scalar_t* proof_hash,
PoseidonConstants<scalar_t> * constants,
PoseidonConfig config)
{
scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
for (uint32_t level = tree_height - 1; level > 0; level--) {
if (proof_lr[level] == 0) {
hashes_in[0] = level_hash;
hashes_in[1] = proof_hash[level];
} else {
hashes_in[0] = proof_hash[level];
hashes_in[1] = level_hash;
}
// next level hash
poseidon_hash<scalar_t, 2+1>(hashes_in, hash_out, 1, *constants, config);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
}
int main(int argc, char* argv[])
{
std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
const uint32_t tree_height = 21;
std::cout << "Tree height: " << tree_height << std::endl;
const uint32_t tree_arity = 2;
const uint32_t leaf_level = tree_height - 1;
const uint32_t tree_width = 1 << leaf_level;
std::cout << "Tree width: " << tree_width << std::endl;
const uint32_t tree_size = (1 << tree_height) - 1;
std::cout << "Tree size: " << tree_size << std::endl;
scalar_t* tree = static_cast<scalar_t*>(malloc(tree_size * sizeof(scalar_t)));
std::cout << "2. Hashing blocks in parallel" << std::endl;
const uint32_t data_arity = 4;
std::cout << "Block size (arity): " << data_arity << std::endl;
std::cout << "Initializing blocks..." << std::endl;
scalar_t d = scalar_t::zero();
scalar_t* data = static_cast<scalar_t*>(malloc(tree_width * data_arity * sizeof(scalar_t)));
for (uint32_t i = 0; i < tree_width * data_arity; i++) {
data[i] = d;
d = d + scalar_t::one();
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config<scalar_t>(data_arity+1);
poseidon_hash<curve_config::scalar_t, data_arity+1>(data, &tree[tree_index(leaf_level, 0)], tree_width, constants, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
init_optimized_poseidon_constants<scalar_t>(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config<scalar_t>(tree_arity+1);
build_tree(tree_height, tree, &tree_constants, tree_config);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width - 1;
std::cout << "Using the hash for block: " << position << std::endl;
scalar_t query = tree[tree_index(leaf_level, position)];
uint32_t query_position = query_membership(query, tree, tree_height);
// allocate arrays for the proof
uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
scalar_t* proof_hash = static_cast<scalar_t*>(malloc(tree_height * sizeof(scalar_t)));
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash);
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const scalar_t tampered_hash = hash + scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;
}

2
examples/c++/poseidon/run.sh Executable file
View File

@@ -0,0 +1,2 @@
#!/bin/bash
./build/example

View File

@@ -0,0 +1,27 @@
# Use the specified base image
#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
llvm \
clang \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory in the container
WORKDIR /icicle-example
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]

View File

@@ -0,0 +1,23 @@
{
"name": "Icicle Examples: rust msm",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb"
]
}
}
}

View File

@@ -0,0 +1,19 @@
[package]
name = "msm"
version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
ark-bn254 = { version = "0.4.0", optional = true}
ark-bls12-377 = { version = "0.4.0", optional = true}
ark-ec = { version = "0.4.0", optional = true}
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
profile = []
g2 = []

View File

@@ -0,0 +1,56 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Rust
`ICICLE` provides Rust bindings to the CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Best Practices
To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust
msm(
/* Scalars input vector */ &scalars,
/* Points input vector */ &points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ &mut msm_results.as_slice()
).unwrap();
```
In this example we use the `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` uses affine coordinates and `result` uses projective coordinates.
## What's in the example
1. Define the MSM size.
2. Generate random inputs on the host.
3. Configure the MSM.
4. Execute the MSM on the device.
5. Copy the results back to the host.
Running the example:
```sh
cargo run --release
```
You can add the `--features arkworks,profile` flag to measure the runtimes of both ICICLE and arkworks.
> [!NOTE]
> The default sizes are 2^19 - 2^23. You can change this by passing the `--lower_bound_log_size <size> --upper_bound_log_size <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> cargo run --release -- -l 21 -u 24
> ```
## Benchmarks
These benchmarks were run on a 16-core, 24-thread i9-12900K CPU and an RTX 3090 Ti GPU.
### Single BN254 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 10 ms | 11 ms | 21 ms | 39 ms | 77 ms |
| Arkworks | 284 ms | 540 ms | 1,152 ms | 2,320 ms | 4,491 ms |
### Single BLS12377 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 9 ms | 14 ms | 25 ms | 48 ms | 93 ms |
| Arkworks | 490 ms | 918 ms | 1,861 ms | 3,624 ms | 7,191 ms |

View File

@@ -0,0 +1,226 @@
use icicle_bn254::curve::{CurveCfg, G1Projective, G2CurveCfg, G2Projective, ScalarCfg};
use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
};
use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
#[cfg(feature = "arkworks")]
use icicle_core::traits::ArkConvertible;
#[cfg(feature = "arkworks")]
use ark_bls12_377::{Fr as Bls12377Fr, G1Affine as Bls12377G1Affine, G1Projective as Bls12377ArkG1Projective};
#[cfg(feature = "arkworks")]
use ark_bn254::{Fr as Bn254Fr, G1Affine as Bn254G1Affine, G1Projective as Bn254ArkG1Projective};
#[cfg(feature = "arkworks")]
use ark_ec::scalar_mul::variable_base::VariableBaseMSM;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Lower bound (inclusive) of MSM sizes to run for
#[arg(short, long, default_value_t = 19)]
lower_bound_log_size: u8,
/// Upper bound of MSM sizes to run for
#[arg(short, long, default_value_t = 22)]
upper_bound_log_size: u8,
}
fn main() {
let args = Args::parse();
let lower_bound = args.lower_bound_log_size;
let upper_bound = args.upper_bound_log_size;
println!("Running Icicle Examples: Rust MSM");
let upper_size = 1 << (upper_bound);
println!("Generating random inputs on host for bn254...");
let upper_points = CurveCfg::generate_random_affine_points(upper_size);
let g2_upper_points = G2CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars = ScalarCfg::generate_random(upper_size);
println!("Generating random inputs on host for bls12377...");
let upper_points_bls12377 = BLS12377CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars_bls12377 = BLS12377ScalarCfg::generate_random(upper_size);
for i in lower_bound..=upper_bound {
let log_size = i;
let size = 1 << log_size;
println!(
"---------------------- MSM size 2^{}={} ------------------------",
log_size, size
);
// Setting Bn254 points and scalars
let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
// Setting bls12377 points and scalars
// let points_bls12377 = &upper_points_bls12377[..size];
let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); // &upper_points_bls12377[..size];
let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
println!("Configuring bn254 MSM...");
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream = CudaStream::create().unwrap();
let g2_stream = CudaStream::create().unwrap();
let mut cfg = msm::MSMConfig::default();
let mut g2_cfg = msm::MSMConfig::default();
cfg.ctx
.stream = &stream;
g2_cfg
.ctx
.stream = &g2_stream;
cfg.is_async = true;
g2_cfg.is_async = true;
println!("Configuring bls12377 MSM...");
let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> =
HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = msm::MSMConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
println!("Executing bn254 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
println!("Executing bls12377 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(
&scalars_bls12377,
&points_bls12377,
&cfg_bls12377,
&mut msm_results_bls12377,
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
println!("Moving results to host..");
let mut msm_host_result = vec![G1Projective::zero(); 1];
let mut g2_msm_host_result = vec![G2Projective::zero(); 1];
let mut msm_host_result_bls12377 = vec![BLS12377G1Projective::zero(); 1];
stream
.synchronize()
.unwrap();
g2_stream
.synchronize()
.unwrap();
msm_results
.copy_to_host(&mut msm_host_result[..])
.unwrap();
g2_msm_results
.copy_to_host(&mut g2_msm_host_result[..])
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
println!("G2 bn254 result: {:#?}", g2_msm_host_result);
stream_bls12377
.synchronize()
.unwrap();
msm_results_bls12377
.copy_to_host(&mut msm_host_result_bls12377[..])
.unwrap();
println!("bls12377 result: {:#?}", msm_host_result_bls12377);
#[cfg(feature = "arkworks")]
{
println!("Checking against arkworks...");
let ark_points: Vec<Bn254G1Affine> = points
.as_slice()
.iter()
.map(|&point| point.to_ark())
.collect();
let ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
.as_slice()
.iter()
.map(|point| point.to_ark())
.collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
#[cfg(feature = "profile")]
let start = Instant::now();
let bn254_ark_msm_res = Bn254ArkG1Projective::msm(&ark_points, &ark_scalars).unwrap();
println!("Arkworks Bn254 result: {:#?}", bn254_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BN254 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
#[cfg(feature = "profile")]
let start = Instant::now();
let bls12377_ark_msm_res =
Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
println!("Arkworks Bls12377 result: {:#?}", bls12377_ark_msm_res);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 MSM on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
let bn254_icicle_msm_res_as_ark = msm_host_result[0].to_ark();
let bls12377_icicle_msm_res_as_ark = msm_host_result_bls12377[0].to_ark();
println!(
"Bn254 MSM is correct: {}",
bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark)
);
println!(
"Bls12377 MSM is correct: {}",
bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark)
);
}
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()
.unwrap();
println!("");
}
}

View File

@@ -0,0 +1,27 @@
# Use the specified base image
#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
llvm \
clang \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory in the container
WORKDIR /icicle-example
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]

View File

@@ -0,0 +1,23 @@
{
"name": "Icicle Examples: rust ntt",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb"
]
}
}
}

View File

@@ -0,0 +1,20 @@
[package]
name = "ntt"
version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core", features = ["arkworks"] }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["arkworks"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377", features = ["arkworks"] }
ark-ff = { version = "0.4.0" }
ark-poly = "0.4.0"
ark-std = "0.4.0"
ark-bn254 = { version = "0.4.0" }
ark-bls12-377 = { version = "0.4.0" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
profile = []

View File

@@ -0,0 +1,65 @@
# ICICLE example: Number Theoretic Transform (NTT) in Rust
## Key-Takeaway
`ICICLE` provides Rust bindings to the CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Best Practices
To save time and avoid setting up prerequisites manually, we recommend running this example in our [ZKContainer](../../ZKContainer.md).
## Usage
```rust
ntt::ntt(
/* input slice */ scalars.as_slice(),
/* NTT Direction */ ntt::NTTDir::kForward,
/* NTT Configuration */ &cfg,
/* output slice */ ntt_results.as_slice()
).unwrap();
```
In this example we use the `BN254` and `BLS12377` fields.
## What's in this example
1. Define the NTT size.
2. Generate random inputs on the host.
3. Set up the domain.
4. Configure the NTT.
5. Execute the NTT on the device.
6. Copy the results back to the host.
7. Compare the results with arkworks.
Running the example:
```sh
cargo run --release
```
You can add the `--features profile` flag to measure the runtimes of both ICICLE and arkworks.
> [!NOTE]
> The default size is 2^20. You can change this by passing the `--size <size>` option. To change the size to 2^23, run the example like this:
> ```sh
> cargo run --release -- -s 23
> ```
## Benchmarks
These benchmarks were run on a 16-core, 24-thread i9-12900K CPU and an RTX 3090 Ti GPU.
### Single BN254 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.263 ms | 2.986 ms | 4.651 ms | 9.308 ms | 18.618 ms |
| Arkworks | 138 ms | 290 ms | 611 ms | 1,295 ms | 2,715 ms |
### Single BLS12377 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.272 ms | 2.893 ms | 4.728 ms | 9.211 ms | 18.319 ms |
| Arkworks | 135 ms | 286 ms | 605 ms | 1,279 ms | 2,682 ms |

View File

@@ -0,0 +1,196 @@
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_core::{
ntt::{self, NTT},
traits::{FieldImpl, GenerateRandom},
};
use icicle_core::traits::ArkConvertible;
use ark_bls12_377::Fr as Bls12377Fr;
use ark_bn254::Fr as Bn254Fr;
use ark_ff::FftField;
use ark_poly::{EvaluationDomain, Radix2EvaluationDomain};
use ark_std::cmp::{Ord, Ordering};
use std::convert::TryInto;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Size of NTT to run (20 for 2^20)
#[arg(short, long, default_value_t = 20)]
size: u8,
}
fn main() {
let args = Args::parse();
println!("Running Icicle Examples: Rust NTT");
let log_size = args.size;
let size = 1 << log_size;
println!(
"---------------------- NTT size 2^{}={} ------------------------",
log_size, size
);
// Setting Bn254 points and scalars
println!("Generating random inputs on host for bn254...");
let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
// Setting bls12377 points and scalars
println!("Generating random inputs on host for bls12377...");
let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> =
HostOrDeviceSlice::cuda_malloc(size).unwrap();
println!("Setting up bn254 Domain...");
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
)
.unwrap();
let ctx = DeviceContext::default();
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bn254 NTT...");
let stream = CudaStream::create().unwrap();
let mut cfg = ntt::NTTConfig::default();
cfg.ctx
.stream = &stream;
cfg.is_async = true;
println!("Setting up bls12377 Domain...");
let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(
size.try_into()
.unwrap(),
)
.unwrap();
// reusing ctx from above
BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bls12377 NTT...");
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = ntt::NTTConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
println!("Executing bn254 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
start
.elapsed()
.as_micros()
);
println!("Executing bls12377 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
&scalars_bls12377,
ntt::NTTDir::kForward,
&cfg_bls12377,
&mut ntt_results_bls12377,
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs",
start
.elapsed()
.as_micros()
);
println!("Moving results to host..");
stream
.synchronize()
.unwrap();
let mut host_bn254_results = vec![ScalarField::zero(); size];
ntt_results
.copy_to_host(&mut host_bn254_results[..])
.unwrap();
stream_bls12377
.synchronize()
.unwrap();
let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
ntt_results_bls12377
.copy_to_host(&mut host_bls12377_results[..])
.unwrap();
println!("Checking against arkworks...");
let mut ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
#[cfg(feature = "profile")]
let start = Instant::now();
bn254_domain.fft_in_place(&mut ark_scalars);
#[cfg(feature = "profile")]
println!(
"Ark BN254 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
#[cfg(feature = "profile")]
let start = Instant::now();
bls12_377_domain.fft_in_place(&mut ark_scalars_bls12377);
#[cfg(feature = "profile")]
println!(
"Ark BLS12377 NTT on size 2^{log_size} took: {} ms",
start
.elapsed()
.as_millis()
);
host_bn254_results
.iter()
.zip(ark_scalars.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bn254 NTT is correct");
host_bls12377_results
.iter()
.zip(ark_scalars_bls12377.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bls12377 NTT is correct");
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()
.unwrap();
println!("");
}

View File

@@ -0,0 +1,27 @@
# Use the specified base image
#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
llvm \
clang \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory in the container
WORKDIR /icicle-example
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]

View File

@@ -0,0 +1,23 @@
{
"name": "Icicle Examples: rust poseidon",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb"
]
}
}
}

View File

@@ -0,0 +1,14 @@
[package]
name = "poseidon"
version = "1.4.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bls12-381 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-381" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
profile = []

View File

@@ -0,0 +1,53 @@
use icicle_bls12_381::curve::ScalarField as F;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Size of Poseidon input to run (20 for 2^20)
#[arg(short, long, default_value_t = 20)]
size: u8,
}
fn main() {
let args = Args::parse();
let size = args.size;
let test_size = 1 << size;
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
let ctx = DeviceContext::default();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
println!("---------------------- Input size 2^{}={} ------------------------", size, test_size);
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
println!("Executing BLS12-381 Poseidon Hash on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
poseidon_hash_many::<F>(
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
&config,
)
.unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
}

21
go.mod Normal file
View File

@@ -0,0 +1,21 @@
module github.com/ingonyama-zk/icicle
go 1.20
require (
github.com/consensys/gnark-crypto v0.12.1
github.com/stretchr/testify v1.8.2
)
require (
github.com/bits-and-blooms/bitset v1.7.0 // indirect
github.com/consensys/bavard v0.1.13 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mmcloughlin/addchain v0.4.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
golang.org/x/sys v0.9.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
rsc.io/tmplfunc v0.0.3 // indirect
)

38
go.sum Normal file
View File

@@ -0,0 +1,38 @@
github.com/bits-and-blooms/bitset v1.7.0 h1:YjAGVd3XmtK9ktAbX8Zg2g2PwLIMjGREZJHlV4j7NEo=
github.com/bits-and-blooms/bitset v1.7.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ=
github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI=
github.com/consensys/gnark-crypto v0.12.1 h1:lHH39WuuFgVHONRl3J0LRBtuYdQTumFSDtJF7HpyG8M=
github.com/consensys/gnark-crypto v0.12.1/go.mod h1:v2Gy7L/4ZRosZ7Ivs+9SfUDr0f5UlG+EM5t7MPHiLuY=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
github.com/mmcloughlin/addchain v0.4.0 h1:SobOdjm2xLj1KkXN5/n0xTIWyZA2+s99UCY1iPfkHRY=
github.com/mmcloughlin/addchain v0.4.0/go.mod h1:A86O+tHqZLMNO4w6ZZ4FlVQEadcoqkyU72HC5wJ4RlU=
github.com/mmcloughlin/profile v0.1.1/go.mod h1:IhHD7q1ooxgwTgjxQYkACGA77oFTDdFVejUS1/tS/qU=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU=
rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA=

View File

@@ -1,42 +1,152 @@
cmake_minimum_required(VERSION 3.16)
cmake_minimum_required(VERSION 3.18)
# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
message(WARNING "Note that PIC (position-independent code) is disabled.")
else()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
find_program(_nvidia_smi "nvidia-smi")
if(_nvidia_smi)
set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
# execute nvidia-smi -L to get a short list of GPUs available
exec_program(${_nvidia_smi_path} ARGS -L
OUTPUT_VARIABLE _nvidia_smi_out
RETURN_VALUE _nvidia_smi_ret)
# process the stdout of nvidia-smi
if(_nvidia_smi_ret EQUAL 0)
# convert string with newlines to list of strings
string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
foreach(_line ${_nvidia_smi_out})
if(_line MATCHES "^GPU [0-9]+:")
math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
# the UUID is not very useful for the user, remove it
string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
if(NOT _gpu_info STREQUAL "")
list(APPEND DETECT_GPU_INFO "${_gpu_info}")
endif()
endif()
endforeach()
check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
endif()
endif()
# ##
if(DETECT_GPU_COUNT GREATER 0)
set(CMAKE_CUDA_ARCHITECTURES native) # do native
else()
# no GPUs found, like on Github CI runners
set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
endif()
endif()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
include_directories("${CMAKE_SOURCE_DIR}")
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
enable_testing()
# when adding a new curve/field, append its name to the end of this list
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
SET(SUPPORTED_CURVES_WITHOUT_NTT grumpkin)
add_executable(
primitives_test
primitives/test.cu
)
target_link_libraries(
primitives_test
GTest::gtest_main
)
set(IS_CURVE_SUPPORTED FALSE)
set(I 0)
foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
math(EXPR I "${I} + 1")
if (CURVE STREQUAL SUPPORTED_CURVE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I}")
set(IS_CURVE_SUPPORTED TRUE)
endif ()
endforeach()
include(GoogleTest)
set_target_properties(primitives_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if (NOT IS_CURVE_SUPPORTED)
message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
endif ()
gtest_discover_tests(primitives_test)
if (G2_DEFINED STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
endif ()
option(BUILD_TESTS "Build tests" OFF)
if (NOT BUILD_TESTS)
message(STATUS "Building without tests.")
if (CURVE IN_LIST SUPPORTED_CURVES_WITH_POSEIDON)
list(APPEND ICICLE_SOURCES appUtils/poseidon/poseidon.cu)
list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
endif()
if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
endif()
add_library(
icicle
utils/vec_ops.cu
utils/mont.cu
primitives/field.cu
primitives/projective.cu
appUtils/msm/msm.cu
${ICICLE_SOURCES}
)
set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
target_compile_definitions(icicle PRIVATE CURVE=${CURVE})
else()
message(STATUS "Building tests.")
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
enable_testing()
add_executable(
runner
tests/runner.cu
)
target_link_libraries(
runner
GTest::gtest_main
)
include(GoogleTest)
set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
gtest_discover_tests(runner)
endif ()

2585
icicle/Doxyfile Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,15 @@
# Tests
# ICICLE CUDA
## Running tests
```sh
mkdir -p build; cmake -S . -B build; cmake --build build; cd build && ctest; cd ..
mkdir -p build;
cmake -DBUILD_TESTS=ON -DCURVE=<supported_curve> -S . -B build;
cmake --build build;
./build/runner --gtest_brief=1
```
## Prerequisites on Ubuntu
Before proceeding, make sure the following software is installed:
1. CMake at least version 3.16, which can be downloaded from [cmake.org](https://cmake.org/files/)
It is recommended to have the latest version installed.
2. [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu) version 12.0 or newer.
3. GCC - version 9 or newer recommended.
The commands above build ICICLE Core and run the tests.
## Troubleshooting
@@ -72,10 +70,38 @@ If the `update-alternatives` settings are broken, you can try to fix them with t
If you encounter the error, check if the `$CUDA_HOME/bin/crt/link.stub` file is available.
Othrewise create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:
Otherwise create a symlink. For example, if the CUDA toolkit is installed with apt-get to the default path, you can create a symlink with the following command:
`ln -sf /usr/local/cuda-12.1/bin/crt/link.stub /usr/lib/nvidia-cuda-toolkit/bin/crt/link.stub`
Alternatively, you can replace the old CUDA root with a symlink to the new CUDA installation with the following command:
`ln -sf /usr/local/cuda-12.1/ /usr/lib/nvidia-cuda-toolkit/`
### 8 - Error while loading shared libraries
`cmake: error while loading shared libraries: libssl.so.10: cannot open shared object file: No such file or directory`
Make sure `libssl` is installed.
```sh
sudo apt-get update
sudo apt-get install libssl1.0.0 libssl-dev
```
### 9 - PIC and Linking against shared libraries
Note that ICICLE is currently a static library with [PIC](https://en.wikipedia.org/wiki/Position-independent_code) enabled by default. You can disable it either by setting the `ICICLE_PIC` environment variable to `OFF` or by passing `-DICICLE_PIC=OFF` to CMake.
## Running with Nix
If you have Nix or NixOS installed on your machine, you can create a development shell that loads all build dependencies and sets the required environment variables.
From the ```/icicle/icicle``` directory run the following command.
```sh
nix-shell --pure cuda-shell.nix
```
This will install everything you need to build and run ICICLE Core.

View File

@@ -1,4 +1,4 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -I. tests/msm_test.cu
nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu
work/test_msm

File diff suppressed because it is too large Load Diff

View File

@@ -1,22 +1,113 @@
#pragma once
#ifndef MSM_H
#define MSM_H
#pragma once
template <typename S, typename P, typename A>
void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device);
#include <cuda_runtime.h>
template <typename S, typename P, typename A>
void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device);
#include "../../curves/curve_config.cuh"
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/device_context.cuh"
#include "../../utils/error_handler.cuh"
template <typename S, typename P, typename A>
void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device);
/**
* @namespace msm
* Multi-scalar-multiplication, or MSM, is the sum of products of the form:
* \f[
* MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i
* \f]
* where \f$ \{P_i\} \f$ are elements of a certain group, \f$ \{s_i\} \f$ are scalars and \f$ N \f$ is the number of
* terms. In cryptographic applications, prime-order subgroups of elliptic curve groups are typically used, so we refer
* to group elements \f$ \{P_i\} \f$ as "points".
*
* To solve an MSM problem, we use an algorithm called the "bucket method". For a theoretical background on this
* algorithm, see [this](https://www.youtube.com/watch?v=Bl5mQA7UL2I) great talk by Gus Gutoski.
*
* This codebase is based on and evolved from Matter Labs'
* [Zprize
* submission](https://github.com/matter-labs/z-prize-msm-gpu/blob/main/bellman-cuda-rust/bellman-cuda-sys/native/msm.cu).
*/
namespace msm {
template <typename S, typename P, typename A>
void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device);
/**
* @struct MSMConfig
* Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
* is to create it using the [DefaultMSMConfig](@ref DefaultMSMConfig) function and then you'll hopefully only need to
* change a small number of default values for each of your MSMs.
*/
struct MSMConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
int points_size; /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be
* a number of different points. So, if each MSM re-uses the same set of points, this
* variable is set equal to the MSM size. And if every MSM uses a distinct set of
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the
* amount of computation and the on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
* method" that we use to solve the MSM problem. As a rule of thumb, larger value
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$
* is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
* variable. Default value: 0 (set to the bitsize of scalar field). */
int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur
* very frequently. Useful for efficient treatment of non-uniform distributions of
* scalars and "top windows" with few bits. Can be set to 0 to disable separate
* treatment of large buckets altogether. Default value: 10. */
int batch_size; /**< The number of MSMs to compute. Default value: 1. */
bool are_scalars_on_device; /**< True if scalars are on device and false if they're on host. Default value:
* false. */
bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
* true. */
bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
* Default value: true. */
bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
* to false, `is_async` won't take effect because a synchronization is needed to
* transfer results to the host. Default value: false. */
bool is_big_triangle; /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
* but also greatly decreases parallelism, so only suitable for large batches of MSMs.
* Default value: false. */
bool is_async; /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
* function will block the current CPU thread. */
};
template <typename S, typename P, typename A>
void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, bool on_device);
/**
* A function that returns the default value of [MSMConfig](@ref MSMConfig) for the [MSM](@ref MSM) function.
* @return Default value of [MSMConfig](@ref MSMConfig).
*/
template <typename A>
MSMConfig DefaultMSMConfig();
/**
* A function that computes MSM: \f$ MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i \f$.
* @param scalars Scalars \f$ s_i \f$. In case of batch MSM, the scalars from all MSMs are concatenated.
* @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* So, if for example all MSMs share the same base points, they can be repeated only once.
* @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
* the size of 1 MSM.
* @param config [MSMConfig](@ref MSMConfig) used in this MSM.
* @param results Buffer for the result (or results in the case of batch MSM).
* @tparam S Scalar field type.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @tparam P Output type, which is typically a [projective
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
* **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
* effect: `precompute_factor` (always equals 1) and `ctx.device_id` (device 0 is always used).
* Also, it's currently better to use `batch_size=1` in most cases (except when dealing with very many MSMs).
*/
template <typename S, typename A, typename P>
cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results);
} // namespace msm
template <typename A, typename S, typename P>
void reference_msm(S* scalars, A* a_points, unsigned size);
#endif
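To make the configuration flow described in the comments above concrete, a minimal host-side sketch is shown below. It uses only the declarations in this header plus the `curve_config` typedefs used elsewhere in this repository; the placement flags and helper name are illustrative assumptions, not a prescribed API usage.
```c++
#include "curves/curve_config.cuh"
// plus the MSM header above (its include path depends on your project layout)

using scalar_t = curve_config::scalar_t;
using affine_t = curve_config::affine_t;
using projective_t = curve_config::projective_t;

cudaError_t run_msm(scalar_t* scalars, affine_t* points, int msm_size, projective_t* result)
{
  // Start from the defaults and override only what differs for this MSM.
  msm::MSMConfig config = msm::DefaultMSMConfig<affine_t>();
  config.are_scalars_on_device = false; // inputs live on the host
  config.are_points_on_device = false;
  config.are_results_on_device = false; // result is copied back to the host, so is_async has no effect
  return msm::MSM(scalars, points, msm_size, config, result);
}
```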

View File

@@ -1,185 +1,120 @@
#include <iostream>
#include <chrono>
#include <vector>
#define CURVE_ID 1
#include "msm.cu"
#include "../../utils/cuda_utils.cuh"
#include "../../primitives/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "../../curves/curve_config.cuh"
#include "../../primitives/field.cuh"
#include "../../curves/bls12_381/curve_config.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/device_context.cuh"
using namespace BLS12_381;
struct fake_point
class Dummy_Scalar
{
unsigned val = 0;
public:
static constexpr unsigned NBITS = 32;
__host__ __device__ inline fake_point operator+(fake_point fp) {
return {val+fp.val};
}
unsigned x;
unsigned p = 10;
// unsigned p = 1<<30;
__host__ __device__ fake_point zero() {
fake_point p;
return p;
}
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
{
os << scalar.x;
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
{
return {(p1.x + p2.x) % p1.p};
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
static HOST_INLINE Dummy_Scalar rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
}
};
std::ostream& operator<<(std::ostream &strm, const fake_point &a) {
return strm <<a.val;
}
struct fake_scalar
class Dummy_Projective
{
unsigned val = 0;
unsigned bitsize = 32;
public:
Dummy_Scalar x;
// __host__ __device__ unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width){
static HOST_DEVICE_INLINE Dummy_Projective zero() { return {0}; }
// return (val>>(digit_num*digit_width))&((1<<digit_width)-1);
static HOST_DEVICE_INLINE Dummy_Projective one() { return {1}; }
static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective& point) { return {point.x}; }
static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective& point) { return {Dummy_Scalar::neg(point.x)}; }
friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2)
{
return {p1.x + p2.x};
}
// friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// return p1 + neg(p2);
// }
__host__ __device__ int get_scalar_digit(int digit_num, int digit_width){
return (val>>(digit_num*digit_width))&((1<<digit_width)-1);
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point)
{
os << point.x;
return os;
}
__host__ __device__ inline fake_point operator*(fake_point fp) {
fake_point p1;
fake_point p2;
unsigned x = val;
if (x == 0) return fake_point().zero();
unsigned i = 1;
unsigned c_bit = (x & (1<<(bitsize-1)))>>(bitsize-1);
while (c_bit==0 && i<bitsize){
i++;
c_bit = (x & (1<<(bitsize-i)))>>(bitsize-i);
}
p1 = fp;
p2 = p1+p1;
while (i<bitsize){
i++;
c_bit = (x & (1<<(bitsize-i)))>>(bitsize-i);
if (c_bit){
p1 = p1 + p2;
p2 = p2 + p2;
}
else {
p2 = p1 + p2;
p1 = p1 + p1;
}
}
return p1;
friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point)
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
#pragma unroll
#endif
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
if (i > 0) { res = res + res; }
if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) { res = res + point; }
}
return res;
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2)
{
return (p1.x == p2.x);
}
static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective& point) { return point.x == 0; }
static HOST_INLINE Dummy_Projective rand_host()
{
return {(unsigned)rand() % 10};
// return {(unsigned)rand()};
}
};
class Dummy_Scalar {
public:
static constexpr unsigned NBITS = 32;
// switch between dummy and real:
unsigned x;
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) {
os << scalar.x;
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) {
return (x>>(digit_num*digit_width))&((1<<digit_width)-1);
}
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) {
return {p1.x+p2.x};
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) {
return (p1.x == p2.x);
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) {
return (p1.x == p2);
}
// static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) {
// return {Dummy_Scalar::neg(point.x)};
// }
static HOST_INLINE Dummy_Scalar rand_host() {
return {(unsigned)rand()};
}
};
class Dummy_Projective {
public:
Dummy_Scalar x;
static HOST_DEVICE_INLINE Dummy_Projective zero() {
return {0};
}
static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) {
return {point.x};
}
static HOST_DEVICE_INLINE Dummy_Projective from_affine(const Dummy_Projective &point) {
return {point.x};
}
// static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) {
// return {Dummy_Scalar::neg(point.x)};
// }
friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) {
return {p1.x+p2.x};
}
// friend HOST_DEVICE_INLINE Dummy_Projective operator-(Dummy_Projective p1, const Dummy_Projective& p2) {
// return p1 + neg(p2);
// }
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Projective& point) {
os << point.x;
return os;
}
friend HOST_DEVICE_INLINE Dummy_Projective operator*(Dummy_Scalar scalar, const Dummy_Projective& point) {
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
#pragma unroll
#endif
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
if (i > 0) {
res = res + res;
}
if (scalar.get_scalar_digit(Dummy_Scalar::NBITS - i - 1, 1)) {
res = res + point;
}
}
return res;
}
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Projective& p1, const Dummy_Projective& p2) {
return (p1.x == p2.x);
}
static HOST_DEVICE_INLINE bool is_zero(const Dummy_Projective &point) {
return point.x == 0;
}
static HOST_INLINE Dummy_Projective rand_host() {
return {(unsigned)rand()};
}
};
//switch between dummy and real:
typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;
// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
@@ -187,63 +122,117 @@ typedef affine_t test_affine;
int main()
{
unsigned batch_size = 4;
unsigned msm_size = 1<<15;
unsigned N = batch_size*msm_size;
int batch_size = 1;
// unsigned msm_size = 1<<21;
int msm_size = 12180757;
int N = batch_size * msm_size;
test_scalar *scalars = new test_scalar[N];
test_affine *points = new test_affine[N];
for (unsigned i=0;i<N;i++){
scalars[i] = (i%msm_size < 10)? test_scalar::rand_host() : scalars[i-10];
points[i] = (i%msm_size < 10)? test_projective::to_affine(test_projective::rand_host()): points[i-10];
// scalars[i] = test_scalar::rand_host();
// points[i] = test_projective::to_affine(test_projective::rand_host());
}
std::cout<<"finished generating"<<std::endl;
test_scalar* scalars = new test_scalar[N];
test_affine* points = new test_affine[N];
test_scalar::RandHostMany(scalars, N);
test_projective::RandHostManyAffine(points, N);
std::cout << "finished generating" << std::endl;
// projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
// test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
test_projective large_res[batch_size];
test_projective batched_large_res[batch_size];
// test_projective batched_large_res[batch_size];
// fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
// fake_point batched_large_res[256];
// short_msm<scalar_t, projective_t, affine_t>(scalars, points, N, short_res);
for (unsigned i=0;i<batch_size;i++){
large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false);
// std::cout<<"final result large"<<std::endl;
// std::cout<<test_projective::to_affine(*large_res)<<std::endl;
}
// for (unsigned i=0;i<batch_size;i++){
// large_msm<test_scalar, test_projective, test_affine>(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i,
// false); std::cout<<"final result large"<<std::endl; std::cout<<test_projective::to_affine(*large_res)<<std::endl;
// }
test_scalar* scalars_d;
test_affine* points_d;
test_projective* large_res_d;
cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
cudaMalloc(&large_res_d, sizeof(test_projective));
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
std::cout << "finished copying" << std::endl;
// batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size,
// batched_large_res, false);
cudaStream_t stream;
cudaStreamCreate(&stream);
device_context::DeviceContext ctx = {
stream, // stream
0, // device_id
0, // mempool
};
msm::MSMConfig config = {
ctx, // DeviceContext
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
};
auto begin1 = std::chrono::high_resolution_clock::now();
msm::MSM<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
cudaEvent_t msm_end_event;
cudaEventCreate(&msm_end_event);
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
config.is_big_triangle = true;
config.are_results_on_device = false;
std::cout << test_projective::to_affine(large_res[0]) << std::endl;
auto begin = std::chrono::high_resolution_clock::now();
batched_large_msm<test_scalar, test_projective, test_affine>(scalars, points, batch_size, msm_size, batched_large_res, false);
// large_msm<test_scalar, test_projective, test_affine>(scalars, points, msm_size, large_res, false);
msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
// test_reduce_triangle(scalars);
// test_reduce_rectangle(scalars);
// test_reduce_single(scalars);
// test_reduce_var(scalars);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
std::cout<<test_projective::to_affine(large_res[0])<<std::endl;
printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
std::cout << test_projective::to_affine(large_res[0]) << std::endl;
cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
// std::cout<<"final results batched large"<<std::endl;
// bool success = true;
// for (unsigned i = 0; i < batch_size; i++)
// {
// std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
// if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
// std::cout<<"good"<<std::endl;
// }
// else{
// std::cout<<"miss"<<std::endl;
// std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
// success = false;
// }
// }
// if (success){
// std::cout<<"success!"<<std::endl;
// }
std::cout<<"final results batched large"<<std::endl;
bool success = true;
for (unsigned i = 0; i < batch_size; i++)
{
std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
std::cout<<"good"<<std::endl;
}
else{
std::cout<<"miss"<<std::endl;
std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
success = false;
}
}
if (success){
std::cout<<"success!"<<std::endl;
}
// std::cout<<batched_large_res[0]<<std::endl;
// std::cout<<batched_large_res[1]<<std::endl;
// std::cout<<projective_t::to_affine(batched_large_res[0])<<std::endl;
@@ -253,4 +242,4 @@ int main()
// std::cout<<pr<<std::endl;
return 0;
}
}


@@ -0,0 +1,6 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17
test_verification: build_verification
work/test_verification
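Assuming this Makefile sits alongside the `tests/` directory that the include paths reference, the verification binary can be built and executed with `make build_verification` followed by `make test_verification` (the second target depends on the first).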


@@ -0,0 +1,933 @@
#include "appUtils/ntt/thread_ntt.cu"
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "appUtils/ntt/ntt.cuh" // for Ordering
namespace ntt {
static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
uint32_t rev_num = 0, temp, dig_len;
if (dit) {
for (int i = 4; i >= 0; i--) {
dig_len = fast_tw ? STAGE_SIZES_DEVICE_FT[log_size][i] : STAGE_SIZES_DEVICE[log_size][i];
temp = num & ((1 << dig_len) - 1);
num = num >> dig_len;
rev_num = rev_num << dig_len;
rev_num = rev_num | temp;
}
} else {
for (int i = 0; i < 5; i++) {
dig_len = fast_tw ? STAGE_SIZES_DEVICE_FT[log_size][i] : STAGE_SIZES_DEVICE[log_size][i];
temp = num & ((1 << dig_len) - 1);
num = num >> dig_len;
rev_num = rev_num << dig_len;
rev_num = rev_num | temp;
}
}
return rev_num;
}
static inline __device__ uint32_t bit_rev(uint32_t num, uint32_t log_size) { return __brev(num) >> (32 - log_size); }
enum eRevType { None, RevToMixedRev, MixedRevToRev, NaturalToMixedRev, NaturalToRev, MixedRevToNatural };
static __device__ uint32_t generalized_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type)
{
switch (rev_type) {
case eRevType::RevToMixedRev:
// R -> N -> MR
return dig_rev(bit_rev(num, log_size), log_size, dit, fast_tw);
case eRevType::MixedRevToRev:
// MR -> N -> R
return bit_rev(dig_rev(num, log_size, dit, fast_tw), log_size);
case eRevType::NaturalToMixedRev:
case eRevType::MixedRevToNatural:
return dig_rev(num, log_size, dit, fast_tw);
case eRevType::NaturalToRev:
return bit_rev(num, log_size);
default:
return num;
}
return num;
}
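A host-side sketch of the plain bit reversal that `generalized_rev` composes with the digit reversal. The device path uses `__brev`; `host_bit_rev` below is an illustration-only helper, and the digit widths consumed by `dig_rev` come from the STAGE_SIZES tables defined in thread_ntt.cu (not shown here).
#include <cstdint>
static uint32_t host_bit_rev(uint32_t num, uint32_t log_size)
{
  uint32_t rev = 0;
  for (uint32_t i = 0; i < log_size; i++)
    rev = (rev << 1) | ((num >> i) & 1); // reflect the low log_size bits
  return rev;
}
// host_bit_rev(0b00110, 5) == 0b01100
// RevToMixedRev applies the bit reversal first (R -> N) and the digit reversal
// second (N -> MR), matching the case comments above.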
// Note: the following reorder kernels are fused with normalization for INTT
template <typename E, typename S, uint32_t MAX_GROUP_SIZE = 80>
static __global__ void reorder_digits_inplace_and_normalize_kernel(
E* arr, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type, bool is_normalize, S inverse_N)
{
// launch N threads (per batch element)
// each thread starts from one index and calculates the corresponding group
// if its index is the smallest number in the group -> do the memory transformation
// else --> do nothing
const uint32_t size = 1 << log_size;
const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
const uint32_t idx = tid % size;
const uint32_t batch_idx = tid / size;
uint32_t next_element = idx;
uint32_t group[MAX_GROUP_SIZE];
group[0] = next_element + size * batch_idx;
uint32_t i = 1;
for (; i < MAX_GROUP_SIZE;) {
next_element = generalized_rev(next_element, log_size, dit, fast_tw, rev_type);
if (next_element < idx) return; // not handling this group
if (next_element == idx) break; // calculated whole group
group[i++] = next_element + size * batch_idx;
}
--i;
// reaching here means I am handling this group
const E last_element_in_group = arr[group[i]];
for (; i > 0; --i) {
arr[group[i]] = is_normalize ? (arr[group[i - 1]] * inverse_N) : arr[group[i - 1]];
}
arr[group[0]] = is_normalize ? (last_element_in_group * inverse_N) : last_element_in_group;
}
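A minimal host-side sketch of the same cycle-following rule, using plain bit reversal in place of the digit reversal (illustration only; the kernel above additionally fuses the INTT normalization into the writes):
#include <cstdint>
static uint32_t rev_bits(uint32_t x, uint32_t log_n)
{
  uint32_t r = 0;
  for (uint32_t i = 0; i < log_n; i++) r = (r << 1) | ((x >> i) & 1);
  return r;
}
static void reorder_inplace_host(uint32_t* a, uint32_t log_n)
{
  const uint32_t n = 1u << log_n;
  for (uint32_t start = 0; start < n; start++) {
    // walk the cycle; only its smallest member performs the rotation,
    // so every element is read and written exactly once
    uint32_t next = rev_bits(start, log_n);
    while (next > start) next = rev_bits(next, log_n);
    if (next != start) continue; // a smaller index owns this cycle
    uint32_t carry = a[start];
    for (uint32_t idx = rev_bits(start, log_n); idx != start; idx = rev_bits(idx, log_n)) {
      uint32_t tmp = a[idx]; a[idx] = carry; carry = tmp; // a[rev(x)] = old a[x]
    }
    a[start] = carry;
  }
}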
template <typename E, typename S>
__launch_bounds__(64) __global__ void reorder_digits_and_normalize_kernel(
E* arr,
E* arr_reordered,
uint32_t log_size,
bool dit,
bool fast_tw,
eRevType rev_type,
bool is_normalize,
S inverse_N)
{
uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
uint32_t rd = tid;
uint32_t wr =
((tid >> log_size) << log_size) + generalized_rev(tid & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
arr_reordered[wr] = is_normalize ? arr[rd] * inverse_N : arr[rd];
}
template <typename E, typename S>
static __global__ void batch_elementwise_mul_with_reorder(
E* in_vec,
int n_elements,
int batch_size,
S* scalar_vec,
int step,
int n_scalars,
int logn,
eRevType rev_type,
bool dit,
E* out_vec)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= n_elements * batch_size) return;
int64_t scalar_id = tid % n_elements;
if (rev_type != eRevType::None) scalar_id = generalized_rev(tid, logn, dit, false, rev_type);
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt64(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
uint32_t twiddle_stride,
bool strided,
uint32_t stage_num,
bool inv,
bool dit,
bool fast_tw)
{
NTTEngine<E, S> engine;
stage_metadata s_meta;
SharedMemory<E> smem;
E* shmem = smem.getPointer();
s_meta.th_stride = 8;
s_meta.ntt_block_size = 64;
s_meta.ntt_block_id = (blockIdx.x << 3) + (strided ? (threadIdx.x & 0x7) : (threadIdx.x >> 3));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 3) : (threadIdx.x & 0x7);
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride && dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (fast_tw)
engine.loadInternalTwiddles64(internal_twiddles, strided);
else
engine.loadInternalTwiddlesGeneric64(internal_twiddles, strided, inv);
#pragma unroll 1
for (uint32_t phase = 0; phase < 2; phase++) {
engine.ntt8win();
if (phase == 0) {
engine.SharedData64Columns8(shmem, true, false, strided); // store
__syncthreads();
engine.SharedData64Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
}
}
if (twiddle_stride && !dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
uint32_t twiddle_stride,
bool strided,
uint32_t stage_num,
bool inv,
bool dit,
bool fast_tw)
{
NTTEngine<E, S> engine;
stage_metadata s_meta;
SharedMemory<E> smem;
E* shmem = smem.getPointer();
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles32(internal_twiddles, strided);
else
engine.loadInternalTwiddlesGeneric32(internal_twiddles, strided, inv);
engine.ntt8win();
engine.twiddlesInternal();
engine.SharedData32Columns8(shmem, true, false, strided); // store
__syncthreads();
engine.SharedData32Rows4_2(shmem, false, false, strided); // load
engine.ntt4_2();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
engine.storeGlobalData32(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32dit(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
uint32_t twiddle_stride,
bool strided,
uint32_t stage_num,
bool inv,
bool dit,
bool fast_tw)
{
NTTEngine<E, S> engine;
stage_metadata s_meta;
SharedMemory<E> smem;
E* shmem = smem.getPointer();
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
engine.loadGlobalData32(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (fast_tw)
engine.loadInternalTwiddles32(internal_twiddles, strided);
else
engine.loadInternalTwiddlesGeneric32(internal_twiddles, strided, inv);
engine.ntt4_2();
engine.SharedData32Columns4_2(shmem, true, false, strided); // store
__syncthreads();
engine.SharedData32Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
uint32_t twiddle_stride,
bool strided,
uint32_t stage_num,
bool inv,
bool dit,
bool fast_tw)
{
NTTEngine<E, S> engine;
stage_metadata s_meta;
SharedMemory<E> smem;
E* shmem = smem.getPointer();
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles16(internal_twiddles, strided);
else
engine.loadInternalTwiddlesGeneric16(internal_twiddles, strided, inv);
engine.ntt8win();
engine.twiddlesInternal();
engine.SharedData16Columns8(shmem, true, false, strided); // store
__syncthreads();
engine.SharedData16Rows2_4(shmem, false, false, strided); // load
engine.ntt2_4();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
engine.storeGlobalData16(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16dit(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
uint32_t twiddle_stride,
bool strided,
uint32_t stage_num,
bool inv,
bool dit,
bool fast_tw)
{
NTTEngine<E, S> engine;
stage_metadata s_meta;
SharedMemory<E> smem;
E* shmem = smem.getPointer();
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
engine.loadGlobalData16(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (fast_tw)
engine.loadInternalTwiddles16(internal_twiddles, strided);
else
engine.loadInternalTwiddlesGeneric16(internal_twiddles, strided, inv);
engine.ntt2_4();
engine.SharedData16Columns2_4(shmem, true, false, strided); // store
__syncthreads();
engine.SharedData16Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__global__ void normalize_kernel(E* data, S norm_factor)
{
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
data[tid] = data[tid] * norm_factor;
}
template <typename S>
__global__ void generate_base_table(S basic_root, S* base_table, uint32_t skip)
{
S w = basic_root;
S t = S::one();
for (int i = 0; i < 64; i += skip) {
base_table[i] = t;
t = t * w;
}
}
// Generic twiddles: 1N twiddles for forward and inverse NTT
template <typename S>
__global__ void generate_basic_twiddles_generic(S basic_root, S* w6_table, S* basic_twiddles)
{
S w0 = basic_root * basic_root;
S w1 = (basic_root + w0 * basic_root) * S::inv_log_size(1);
S w2 = (basic_root - w0 * basic_root) * S::inv_log_size(1);
basic_twiddles[0] = w0;
basic_twiddles[1] = w1;
basic_twiddles[2] = w2;
S basic_inv = w6_table[64 - 8];
w0 = basic_inv * basic_inv;
w1 = (basic_inv + w0 * basic_inv) * S::inv_log_size(1);
w2 = (basic_inv - w0 * basic_inv) * S::inv_log_size(1);
basic_twiddles[3] = w0;
basic_twiddles[4] = w1;
basic_twiddles[5] = w2;
}
template <typename S>
__global__ void generate_twiddle_combinations_generic(
S* w6_table, S* w12_table, S* w18_table, S* w24_table, S* w30_table, S* external_twiddles, uint32_t log_size)
{
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t exp = tid << (30 - log_size);
S w6, w12, w18, w24, w30;
w6 = w6_table[exp >> 24];
w12 = w12_table[((exp >> 18) & 0x3f)];
w18 = w18_table[((exp >> 12) & 0x3f)];
w24 = w24_table[((exp >> 6) & 0x3f)];
w30 = w30_table[(exp & 0x3f)];
S t = w6 * w12 * w18 * w24 * w30;
external_twiddles[tid] = t;
}
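The kernel above assembles each external twiddle by splitting an exponent of at most 30 bits into five 6-bit digits and multiplying one table lookup per digit, so only five 64-entry base tables need to be generated. A host-style restatement of the same indexing, with `S` standing for the scalar type (sketch only):
#include <cstdint>
template <typename S>
S twiddle_from_digit_tables(
  const S* w6, const S* w12, const S* w18, const S* w24, const S* w30, uint32_t exp)
{
  // identical digit extraction to generate_twiddle_combinations_generic
  return w6[exp >> 24] * w12[(exp >> 18) & 0x3f] * w18[(exp >> 12) & 0x3f] *
         w24[(exp >> 6) & 0x3f] * w30[exp & 0x3f];
}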
template <typename S>
__global__ void set_value(S* arr, int idx, S val)
{
arr[idx] = val;
}
template <typename S>
cudaError_t generate_external_twiddles_generic(
const S& basic_root,
S* external_twiddles,
S*& internal_twiddles,
S*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream)
{
CHK_INIT_IF_RETURN();
const int n = pow(2, log_size);
CHK_IF_RETURN(cudaMallocAsync(&basic_twiddles, 6 * sizeof(S), stream));
S* w6_table;
S* w12_table;
S* w18_table;
S* w24_table;
S* w30_table;
CHK_IF_RETURN(cudaMallocAsync(&w6_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w12_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w18_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w24_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w30_table, sizeof(S) * 64, stream));
// Note: for compatibility with radix-2 INTT, need ONE in last element (in addition to first element)
set_value<<<1, 1, 0, stream>>>(external_twiddles, n /*last element idx*/, S::one());
cudaStreamSynchronize(stream);
S temp_root = basic_root;
generate_base_table<<<1, 1, 0, stream>>>(basic_root, w30_table, 1 << (30 - log_size));
if (log_size > 24)
for (int i = 0; i < 6 - (30 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w24_table, 1 << (log_size > 24 ? 0 : 24 - log_size));
if (log_size > 18)
for (int i = 0; i < 6 - (log_size > 24 ? 0 : 24 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w18_table, 1 << (log_size > 18 ? 0 : 18 - log_size));
if (log_size > 12)
for (int i = 0; i < 6 - (log_size > 18 ? 0 : 18 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w12_table, 1 << (log_size > 12 ? 0 : 12 - log_size));
if (log_size > 6)
for (int i = 0; i < 6 - (log_size > 12 ? 0 : 12 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w6_table, 1 << (log_size > 6 ? 0 : 6 - log_size));
if (log_size > 2)
for (int i = 0; i < 3 - (log_size > 6 ? 0 : 6 - log_size); i++)
temp_root = temp_root * temp_root;
generate_basic_twiddles_generic<<<1, 1, 0, stream>>>(temp_root, w6_table, basic_twiddles);
const int NOF_BLOCKS = (log_size >= 8) ? (1 << (log_size - 8)) : 1;
const int NOF_THREADS = (log_size >= 8) ? 256 : (1 << log_size);
generate_twiddle_combinations_generic<<<NOF_BLOCKS, NOF_THREADS, 0, stream>>>(
w6_table, w12_table, w18_table, w24_table, w30_table, external_twiddles, log_size);
internal_twiddles = w6_table;
CHK_IF_RETURN(cudaFreeAsync(w12_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w18_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w24_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w30_table, stream));
return CHK_LAST();
}
// Fast-twiddles: 2N twiddles for forward, 2N for inverse
template <typename S>
__global__ void generate_basic_twiddles_fast_twiddles_mode(S basic_root, S* basic_twiddles)
{
S w0 = basic_root * basic_root;
S w1 = (basic_root + w0 * basic_root) * S::inv_log_size(1);
S w2 = (basic_root - w0 * basic_root) * S::inv_log_size(1);
basic_twiddles[0] = w0;
basic_twiddles[1] = w1;
basic_twiddles[2] = w2;
}
template <typename S>
__global__ void generate_twiddle_combinations_fast_twiddles_mode(
S* w6_table,
S* w12_table,
S* w18_table,
S* w24_table,
S* w30_table,
S* external_twiddles,
uint32_t log_size,
uint32_t prev_log_size)
{
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t exp = ((tid & ((1 << prev_log_size) - 1)) * (tid >> prev_log_size)) << (30 - log_size);
S w6, w12, w18, w24, w30;
w6 = w6_table[exp >> 24];
w12 = w12_table[((exp >> 18) & 0x3f)];
w18 = w18_table[((exp >> 12) & 0x3f)];
w24 = w24_table[((exp >> 6) & 0x3f)];
w30 = w30_table[(exp & 0x3f)];
S t = w6 * w12 * w18 * w24 * w30;
external_twiddles[tid + (1 << log_size) - 1] = t;
}
template <typename S>
cudaError_t generate_external_twiddles_fast_twiddles_mode(
const S& basic_root,
S* external_twiddles,
S*& internal_twiddles,
S*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream)
{
CHK_INIT_IF_RETURN();
S* w6_table;
S* w12_table;
S* w18_table;
S* w24_table;
S* w30_table;
CHK_IF_RETURN(cudaMallocAsync(&w6_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w12_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w18_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w24_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&w30_table, sizeof(S) * 64, stream));
CHK_IF_RETURN(cudaMallocAsync(&basic_twiddles, 3 * sizeof(S), stream));
S temp_root = basic_root;
generate_base_table<<<1, 1, 0, stream>>>(basic_root, w30_table, 1 << (30 - log_size));
if (log_size > 24)
for (int i = 0; i < 6 - (30 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w24_table, 1 << (log_size > 24 ? 0 : 24 - log_size));
if (log_size > 18)
for (int i = 0; i < 6 - (log_size > 24 ? 0 : 24 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w18_table, 1 << (log_size > 18 ? 0 : 18 - log_size));
if (log_size > 12)
for (int i = 0; i < 6 - (log_size > 18 ? 0 : 18 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w12_table, 1 << (log_size > 12 ? 0 : 12 - log_size));
if (log_size > 6)
for (int i = 0; i < 6 - (log_size > 12 ? 0 : 12 - log_size); i++)
temp_root = temp_root * temp_root;
generate_base_table<<<1, 1, 0, stream>>>(temp_root, w6_table, 1 << (log_size > 6 ? 0 : 6 - log_size));
for (int i = 0; i < 3 - (log_size > 6 ? 0 : 6 - log_size); i++)
temp_root = temp_root * temp_root;
generate_basic_twiddles_fast_twiddles_mode<<<1, 1, 0, stream>>>(temp_root, basic_twiddles);
for (int i = 8; i < log_size + 1; i++) {
generate_twiddle_combinations_fast_twiddles_mode<<<1 << (i - 8), 256, 0, stream>>>(
w6_table, w12_table, w18_table, w24_table, w30_table, external_twiddles, i, STAGE_PREV_SIZES[i]);
}
internal_twiddles = w6_table;
CHK_IF_RETURN(cudaFreeAsync(w12_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w18_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w24_table, stream));
CHK_IF_RETURN(cudaFreeAsync(w30_table, stream));
return CHK_LAST();
}
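Per the "1N" and "2N forward + 2N inverse" comments above, fast-twiddles mode trades roughly four times the twiddle memory for cheaper twiddle loads. A rough sizing sketch, assuming a 32-byte scalar (the real figure is the sizeof of the scalar type; the five 64-entry base tables and the basic twiddles are ignored):
#include <cstddef>
#include <cstdint>
constexpr size_t approx_twiddle_bytes(uint32_t log_size, size_t scalar_bytes, bool fast_tw)
{
  const size_t n = size_t(1) << log_size;
  return (fast_tw ? 4 * n : n) * scalar_bytes;
}
// approx_twiddle_bytes(24, 32, false) -> 512 MiB; approx_twiddle_bytes(24, 32, true) -> 2 GiB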
template <typename E, typename S>
cudaError_t large_ntt(
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t batch_size,
bool inv,
bool normalize,
bool dit,
bool fast_tw,
cudaStream_t cuda_stream)
{
CHK_INIT_IF_RETURN();
if (log_size == 1 || log_size == 2 || log_size == 3 || log_size == 7) {
throw IcicleError(IcicleError_t::InvalidArgument, "size not implemented for mixed-radix-NTT");
}
if (log_size == 4) {
const int NOF_THREADS = min(64, 2 * batch_size);
const int NOF_BLOCKS = (2 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize) normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4));
return CHK_LAST();
}
if (log_size == 5) {
const int NOF_THREADS = min(64, 4 * batch_size);
const int NOF_BLOCKS = (4 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt32dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt32<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize) normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5));
return CHK_LAST();
}
if (log_size == 6) {
const int NOF_THREADS = min(64, 8 * batch_size);
const int NOF_BLOCKS = (8 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
ntt64<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
if (normalize) normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6));
return CHK_LAST();
}
if (log_size == 8) {
const int NOF_THREADS = 64;
const int NOF_BLOCKS = (32 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
}
if (normalize) normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8));
return CHK_LAST();
}
// general case:
uint32_t nof_blocks = (1 << (log_size - 9)) * batch_size;
if (dit) {
for (int i = 0; i < 5; i++) {
uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
uint32_t stride_log = 0;
for (int j = 0; j < i; j++)
stride_log += fast_tw ? STAGE_SIZES_HOST_FT[log_size][j] : STAGE_SIZES_HOST[log_size][j];
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
}
} else { // dif
bool first_run = false, prev_stage = false;
for (int i = 4; i >= 0; i--) {
uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
uint32_t stride_log = 0;
for (int j = 0; j < i; j++)
stride_log += fast_tw ? STAGE_SIZES_HOST_FT[log_size][j] : STAGE_SIZES_HOST[log_size][j];
first_run = stage_size && !prev_stage;
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
prev_stage = stage_size;
}
}
if (normalize)
normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(log_size));
return CHK_LAST();
}
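In the general case, each pass of the loop above handles one digit of the transform size, reading the digit widths from STAGE_SIZES_HOST (or STAGE_SIZES_HOST_FT). As a purely hypothetical illustration of how the strides fall out of that loop: if a 2^16-point NTT were decomposed into digits {4, 6, 6}, the DIT passes would be
  stage 0: radix-16 (ntt16dit), data_stride = 1, stride_log = 0, no external twiddles
  stage 1: radix-64 (ntt64), data_stride = 1 << 4, stride_log = 4, twiddle_stride = 1 << 4
  stage 2: radix-64 (ntt64), data_stride = 1 << 10, stride_log = 10, twiddle_stride = 1 << 10
The real digit tables live in thread_ntt.cu and may differ from this example.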
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
int ntt_size,
int max_logn,
int batch_size,
bool is_inverse,
bool fast_tw,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
{
CHK_INIT_IF_RETURN();
const int logn = int(log2(ntt_size));
const int NOF_BLOCKS = ((1 << logn) * batch_size + 64 - 1) / 64;
const int NOF_THREADS = min(64, (1 << logn) * batch_size);
bool is_normalize = is_inverse;
const bool is_on_coset = (coset_gen_index != 0) || arbitrary_coset;
const int n_twiddles = 1 << max_logn;
// Note: for evaluation on coset, need to reorder the coset too to match the data for element-wise multiplication
eRevType reverse_input = None, reverse_output = None, reverse_coset = None;
bool dit = false;
switch (ordering) {
case Ordering::kNN:
reverse_input = eRevType::NaturalToMixedRev;
dit = true;
break;
case Ordering::kRN:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_coset = is_inverse ? eRevType::None : eRevType::NaturalToRev;
break;
case Ordering::kNR:
reverse_output = eRevType::MixedRevToRev;
reverse_coset = is_inverse ? eRevType::NaturalToRev : eRevType::None;
break;
case Ordering::kRR:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_output = eRevType::NaturalToRev;
reverse_coset = eRevType::NaturalToRev;
break;
case Ordering::kMN:
dit = true;
reverse_coset = is_inverse ? None : eRevType::NaturalToMixedRev;
break;
case Ordering::kNM:
reverse_coset = is_inverse ? eRevType::NaturalToMixedRev : eRevType::None;
break;
}
if (is_on_coset && !is_inverse) {
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles,
arbitrary_coset ? 1 : coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
d_input = d_output;
}
if (reverse_input != eRevType::None) {
const bool is_reverse_in_place = (d_input == d_output);
if (is_reverse_in_place) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
} else {
reorder_digits_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
}
is_normalize = false;
d_input = d_output;
}
// inplace ntt
CHK_IF_RETURN(large_ntt(
d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size, is_inverse,
(is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));
if (reverse_output != eRevType::None) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
}
if (is_on_coset && is_inverse) {
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles + n_twiddles,
arbitrary_coset ? 1 : -coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
}
return CHK_LAST();
}
// Explicit instantiation for scalar type
template cudaError_t generate_external_twiddles_generic(
const curve_config::scalar_t& basic_root,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t*& internal_twiddles,
curve_config::scalar_t*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template cudaError_t generate_external_twiddles_fast_twiddles_mode(
const curve_config::scalar_t& basic_root,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t*& internal_twiddles,
curve_config::scalar_t*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template cudaError_t mixed_radix_ntt<curve_config::scalar_t, curve_config::scalar_t>(
curve_config::scalar_t* d_input,
curve_config::scalar_t* d_output,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t* internal_twiddles,
curve_config::scalar_t* basic_twiddles,
int ntt_size,
int max_logn,
int batch_size,
bool is_inverse,
bool fast_tw,
Ordering ordering,
curve_config::scalar_t* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
} // namespace ntt


@@ -1,182 +0,0 @@
#ifndef LDE
#define LDE
#include <cuda.h>
#include "ntt.cuh"
#include "lde.cuh"
#include "../vector_manipulation/ve_mod_mult.cuh"
/**
* Interpolate a batch of polynomials from their evaluations on the same subgroup.
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_out The variable to write coefficients of the resulting polynomials into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
* @param d_evaluations Input array of evaluations of all polynomials of type E (elements).
* @param d_domain Domain on which the polynomials are evaluated. Must be a subgroup.
* @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial.
* @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`.
*/
template <typename E, typename S> int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size) {
uint32_t logn = uint32_t(log(n) / log(2));
cudaMemcpy(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice);
int NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
int NUM_BLOCKS = batch_size * max(int((n / 2) / NUM_THREADS), 1);
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
{
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, n, d_domain, n, NUM_BLOCKS, s, false);
}
NUM_BLOCKS = (n * batch_size + NUM_THREADS - 1) / NUM_THREADS;
template_normalize_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>> (d_out, n * batch_size, S::inv_log_size(logn));
return 0;
}
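A minimal host-side usage sketch for the batched interpolation above, assuming `scalar_t` for both element and scalar types and that `d_evals` and `d_domain` were already populated on the device by the caller (all names here are illustrative, not part of this file):
scalar_t* d_coeffs; // d_evals, d_domain: device pointers assumed to exist (illustrative)
cudaMalloc(&d_coeffs, sizeof(scalar_t) * n * batch_size);
interpolate_batch<scalar_t, scalar_t>(d_coeffs, d_evals, d_domain, n, batch_size);
// d_coeffs now holds each polynomial's coefficients, bit-reversed if the
// evaluations were supplied in natural order (see the note above).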
/**
* Interpolate a polynomial from its evaluations on a subgroup.
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_out The variable to write coefficients of the resulting polynomial into (the coefficients are in bit-reversed order if the evaluations weren't bit-reversed and vice-versa).
* @param d_evaluations Input array of evaluations that have type E (elements).
* @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup.
* @param n Length of the `d_evaluations` and `d_domain` arrays (they should have equal length).
*/
template <typename E, typename S> int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n) {
return interpolate_batch <E, S> (d_out, d_evaluations, d_domain, n, 1);
}
template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) {
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < n) {
arr[tid] = val;
}
}
/**
* Evaluate a batch of polynomials on the same coset.
* @param d_out The evaluations of the polynomials on coset `u` * `d_domain`.
* @param d_coefficients Input array of coefficients of all polynomials of type E (elements) to be evaluated in-place on a coset.
* @param d_domain Domain on which the polynomials are evaluated (see `coset` flag). Must be a subgroup.
* @param domain_size Length of `d_domain` array, on which the polynomial is computed.
* @param n The number of coefficients, which might be different from `domain_size`.
* @param batch_size The size of the batch; the length of `d_coefficients` is `n` * `batch_size`.
* @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
*/
template <typename E, typename S>
int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, unsigned batch_size, bool coset, S * coset_powers) {
uint32_t logn = uint32_t(log(domain_size) / log(2));
if (domain_size > n) {
// allocate and initialize an array of stream handles to parallelize data copying across batches
cudaStream_t *memcpy_streams = (cudaStream_t *) malloc(batch_size * sizeof(cudaStream_t));
for (unsigned i = 0; i < batch_size; i++)
{
cudaStreamCreate(&(memcpy_streams[i]));
cudaMemcpyAsync(&d_out[i * domain_size], &d_coefficients[i * n], n * sizeof(E), cudaMemcpyDeviceToDevice, memcpy_streams[i]);
uint32_t NUM_THREADS = MAX_THREADS_BATCH;
uint32_t NUM_BLOCKS = (domain_size - n + NUM_THREADS - 1) / NUM_THREADS;
fill_array <E> <<<NUM_BLOCKS, NUM_THREADS, 0, memcpy_streams[i]>>> (&d_out[i * domain_size + n], E::zero(), domain_size - n);
cudaStreamSynchronize(memcpy_streams[i]);
cudaStreamDestroy(memcpy_streams[i]);
}
} else
cudaMemcpy(d_out, d_coefficients, sizeof(E) * domain_size * batch_size, cudaMemcpyDeviceToDevice);
if (coset)
batch_vector_mult(coset_powers, d_out, domain_size, batch_size);
int NUM_THREADS = min(domain_size / 2, MAX_THREADS_BATCH);
int chunks = max(int((domain_size / 2) / NUM_THREADS), 1);
int NUM_BLOCKS = batch_size * chunks;
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
{
ntt_template_kernel <E, S> <<<NUM_BLOCKS, NUM_THREADS>>>(d_out, domain_size, d_domain, domain_size, batch_size * chunks, logn - s - 1, true);
}
return 0;
}
/**
* Evaluate a polynomial on a coset.
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs, so the order of outputs is bit-reversed.
* @param d_out The evaluations of the polynomial on coset `u` * `d_domain`.
* @param d_coefficients Input array of coefficients of a polynomial of type E (elements).
* @param d_domain Domain on which the polynomial is evaluated (see `coset` flag). Must be a subgroup.
* @param domain_size Length of `d_domain` array, on which the polynomial is computed.
* @param n The number of coefficients, which might be different from `domain_size`.
* @param coset The flag that indicates whether to evaluate on a coset. If false, evaluate on a subgroup `d_domain`.
* @param coset_powers If `coset` is true, a list of powers `[1, u, u^2, ..., u^{n-1}]` where `u` is the generator of the coset.
*/
template <typename E, typename S>
int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, unsigned n, bool coset, S * coset_powers) {
return evaluate_batch <E, S> (d_out, d_coefficients, d_domain, domain_size, n, 1, coset, coset_powers);
}
template <typename S>
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n) {
return interpolate(d_out, d_evaluations, d_domain, n);
}
template <typename S>
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
}
template <typename E, typename S>
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n) {
return interpolate(d_out, d_evaluations, d_domain, n);
}
template <typename E, typename S>
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size) {
return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size);
}
template <typename S>
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
S* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
}
template <typename S>
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size) {
S* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
}
template <typename E, typename S>
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n) {
S* _null = nullptr;
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, false, _null);
}
template <typename E, typename S>
int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, unsigned batch_size) {
S* _null = nullptr;
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null);
}
template <typename S>
int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
}
template <typename E, typename S>
int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
}
template <typename E, typename S>
int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers) {
return evaluate(d_out, d_coefficients, d_domain, domain_size, n, true, coset_powers);
}
template <typename E, typename S>
int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers) {
return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, true, coset_powers);
}
#endif
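A sketch of coset evaluation with these helpers for the common case `domain_size == n`, showing how the `coset_powers` list described above could be assembled on the host. The names `u`, `d_out`, `d_coeffs`, `d_domain`, and the use of `scalar_t::one()` are assumptions for illustration only:
#include <vector>
// u, d_out, d_coeffs, d_domain are assumed to be provided by the caller (illustrative)
std::vector<scalar_t> h_powers(n);
h_powers[0] = scalar_t::one();
for (unsigned i = 1; i < n; i++) h_powers[i] = h_powers[i - 1] * u; // [1, u, u^2, ..., u^{n-1}]
scalar_t* d_powers;
cudaMalloc(&d_powers, sizeof(scalar_t) * n);
cudaMemcpy(d_powers, h_powers.data(), sizeof(scalar_t) * n, cudaMemcpyHostToDevice);
evaluate_scalars_on_coset(d_out, d_coeffs, d_domain, n, n, d_powers);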


@@ -1,46 +0,0 @@
#ifndef LDE_H
#define LDE_H
#pragma once
template <typename S>
int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n);
template <typename S>
int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
template <typename E, typename S>
int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n);
template <typename E, typename S>
int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size);
template <typename S>
int evaluate_scalars(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
template <typename S>
int evaluate_scalars_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, unsigned batch_size);
template <typename E, typename S>
int evaluate_points(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size, unsigned n);
template <typename E, typename S>
int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, unsigned batch_size);
template <typename S>
int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers);
template <typename S>
int evaluate_scalars_on_coset_batch(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers);
template <typename E, typename S>
int evaluate_points_on_coset(E* d_out, E* d_coefficients, S* d_domain,
unsigned domain_size, unsigned n, S* coset_powers);
template <typename E, typename S>
int evaluate_points_on_coset_batch(E* d_out, E* d_coefficients, S* d_domain, unsigned domain_size,
unsigned n, unsigned batch_size, S* coset_powers);
#endif

icicle/appUtils/ntt/ntt.cu (new file, 745 lines)

@@ -0,0 +1,745 @@
#include "ntt.cuh"
#include <unordered_map>
#include <vector>
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
#include "appUtils/ntt/ntt_impl.cuh"
#include <mutex>
namespace ntt {
namespace {
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512; // TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;
template <typename E>
__global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (threadId < n * batch_size) {
int idx = threadId % n;
int batch_idx = threadId / n;
int idx_reversed = __brev(idx) >> (32 - logn);
E val = arr[batch_idx * n + idx];
if (arr == arr_reversed) { __syncthreads(); } // for in-place (when pointers arr==arr_reversed)
arr_reversed[batch_idx * n + idx_reversed] = val;
}
}
/**
* Bit-reverses a batch of input arrays out-of-place inside GPU.
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in batch of arrays of some object of type T. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param batch_size the size of the batch.
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
int number_of_threads = MAX_THREADS_BATCH;
int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
reverse_order_kernel<<<number_of_blocks, number_of_threads, 0, stream>>>(arr_in, arr_out, n, logn, batch_size);
}
/**
* Bit-reverses an input array out-of-place inside GPU.
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[2],a[1],a[3]] (elements at indices 1 and 2 switch places).
* @param arr_in array of some object of type T of size which is a power of 2. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order(E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared_rev(
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int ss,
int logn,
E* __restrict__ arr_out)
{
SharedMemory<E> smem;
E* arr = smem.getPointer();
uint32_t task = blockIdx.x;
uint32_t loop_limit = blockDim.x;
uint32_t chunks = n / (loop_limit * 2);
uint32_t offset = (task / chunks) * n;
if (task < max_task) {
// flattened loop allows parallel processing
uint32_t l = threadIdx.x;
if (l < loop_limit) {
#pragma unroll
for (; ss < logn; ss++) {
int s = logn - ss - 1;
bool is_beginning = ss == 0;
bool is_end = ss == (logn - 1);
uint32_t ntw_i = task % chunks;
uint32_t n_twiddles_div = n_twiddles >> (s + 1);
uint32_t shift_s = 1 << s;
uint32_t shift2_s = 1 << (s + 1);
l = ntw_i * loop_limit + l; // to l from chunks to full
uint32_t j = l & (shift_s - 1); // Equivalent to: l % (1 << s)
uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
uint32_t oij = i + j;
uint32_t k = oij + shift_s;
S tw = *(r_twiddles + (int)(j * n_twiddles_div));
E u = is_beginning ? arr_in[offset + oij] : arr[oij];
E v = is_beginning ? arr_in[offset + k] : arr[k];
if (is_end) {
arr_out[offset + oij] = u + v;
arr_out[offset + k] = tw * (u - v);
} else {
arr[oij] = u + v;
arr[k] = tw * (u - v);
}
__syncthreads();
}
}
}
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of d_arr.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared(
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
int max_task,
int s,
int logn,
E* __restrict__ arr_out)
{
SharedMemory<E> smem;
E* arr = smem.getPointer();
uint32_t task = blockIdx.x;
uint32_t loop_limit = blockDim.x;
uint32_t chunks = n / (loop_limit * 2);
uint32_t offset = (task / chunks) * n;
if (task < max_task) {
// flattened loop allows parallel processing
uint32_t l = threadIdx.x;
if (l < loop_limit) {
#pragma unroll
for (; s < logn; s++) // TODO: this loop also can be unrolled
{
uint32_t ntw_i = task % chunks;
uint32_t n_twiddles_div = n_twiddles >> (s + 1);
uint32_t shift_s = 1 << s;
uint32_t shift2_s = 1 << (s + 1);
l = ntw_i * loop_limit + l; // to l from chunks to full
uint32_t j = l & (shift_s - 1); // Equivalent to: l % (1 << s)
uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
uint32_t oij = i + j;
uint32_t k = oij + shift_s;
S tw = *(r_twiddles + (int)(j * n_twiddles_div));
E u = s == 0 ? arr_in[offset + oij] : arr[oij];
E v = s == 0 ? arr_in[offset + k] : arr[k];
v = tw * v;
if (s == (logn - 1)) {
arr_out[offset + oij] = u + v;
arr_out[offset + k] = u - v;
} else {
arr[oij] = u + v;
arr[k] = u - v;
}
__syncthreads();
}
}
}
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr_in input array of type E (elements).
* @param n length of `arr_in`.
* @param twiddles twiddle factors of type S (scalars) allocated on the device memory (length must be a power of 2).
* @param n_twiddles length of twiddles, should be negative for intt.
* @param max_task max count of parallel tasks.
* @param s current stage index of the log2(n) stage loop.
* @param rev selects the butterfly variant: false multiplies the input by the twiddle (DIT), true multiplies the difference (DIF).
* @param arr_out buffer for the output.
*/
template <typename E, typename S>
__global__ void
ntt_template_kernel(E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
int task = blockIdx.x;
int chunks = n / (blockDim.x * 2);
if (task < max_task) {
// flattened loop allows parallel processing
uint32_t l = threadIdx.x;
uint32_t loop_limit = blockDim.x;
if (l < loop_limit) {
uint32_t ntw_i = task % chunks;
uint32_t shift_s = 1 << s;
uint32_t shift2_s = 1 << (s + 1);
uint32_t n_twiddles_div = n_twiddles >> (s + 1);
l = ntw_i * blockDim.x + l; // to l from chunks to full
uint32_t j = l & (shift_s - 1); // Equivalent to: l % (1 << s)
uint32_t i = ((l >> s) * shift2_s) & (n - 1); // (..) % n (assuming n is power of 2)
uint32_t k = i + j + shift_s;
S tw = *(twiddles + (int)(j * n_twiddles_div));
uint32_t offset = (task / chunks) * n;
E u = arr_in[offset + i + j];
E v = arr_in[offset + k];
if (!rev) v = tw * v;
arr_out[offset + i + j] = u + v;
v = u - v;
arr_out[offset + k] = rev ? tw * v : v;
}
}
}
/**
* NTT/INTT in-place batch
* Note: this function does not perform any bit-reverse permutations on its inputs or outputs.
* @param d_input Input array
* @param n Size of one NTT in the batch
* @param d_twiddles Twiddles
* @param n_twiddles Size of `d_twiddles`
* @param batch_size The size of the batch; the length of `d_input` is `n` * `batch_size`.
* @param logn log2(n)
* @param inverse true for iNTT
* @param dit true to run the DIT (decimation-in-time) stages, false for DIF
* @param arbitrary_coset array of length n holding the coset powers, or nullptr if the NTT is not computed on an arbitrary coset
* @param coset_gen_index index of the coset generator inside the twiddles array (0 if no coset)
* @param stream CUDA stream
* @param d_output Output array
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
int batch_size,
int logn,
bool inverse,
bool dit,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t stream,
E* d_output)
{
CHK_INIT_IF_RETURN();
bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE;
const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn;
int num_threads = max(min(min(n / 2, MAX_THREADS_BATCH), 1 << (log2_shmem_elems - 1)), 1);
const int chunks = max(int((n / 2) / num_threads), 1);
const int total_tasks = batch_size * chunks;
int num_blocks = total_tasks;
const int shared_mem = 2 * num_threads * sizeof(E); // TODO: calculator; a shared mem size smaller than the maximum
// may be more efficient since it allows more concurrent blocks per SM
const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
: 0; // TODO: shared memory support only for types <= 32 bytes
int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1);
int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset;
if (inverse) {
d_twiddles = d_twiddles + n_twiddles;
n_twiddles = -n_twiddles;
}
bool is_on_coset = (coset_gen_index != 0) || arbitrary_coset;
bool direct_coset = (!inverse && is_on_coset);
if (direct_coset)
utils_internal::BatchMulKernel<E, S><<<num_blocks_coset, num_threads_coset, 0, stream>>>(
d_input, n, batch_size, arbitrary_coset ? arbitrary_coset : d_twiddles, arbitrary_coset ? 1 : coset_gen_index,
n_twiddles, logn, dit, d_output);
if (dit) {
if (is_shared_mem_enabled)
ntt_template_kernel_shared<<<num_blocks, num_threads, shared_mem, stream>>>(
direct_coset ? d_output : d_input, 1 << logn_shmem, d_twiddles, n_twiddles, total_tasks, 0, logn_shmem,
d_output);
for (int s = logn_shmem; s < logn; s++) // TODO: this loop also can be unrolled
{
ntt_template_kernel<E, S><<<num_blocks, num_threads, 0, stream>>>(
(direct_coset || (s > 0)) ? d_output : d_input, n, d_twiddles, n_twiddles, total_tasks, s, false, d_output);
}
} else {
for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled
{
ntt_template_kernel<<<num_blocks, num_threads, 0, stream>>>(
(direct_coset || (s < logn - 1)) ? d_output : d_input, n, d_twiddles, n_twiddles, total_tasks, s, true,
d_output);
}
if (is_shared_mem_enabled)
ntt_template_kernel_shared_rev<<<num_blocks, num_threads, shared_mem, stream>>>(
(direct_coset || (logn > logn_shmem)) ? d_output : d_input, 1 << logn_shmem, d_twiddles, n_twiddles,
total_tasks, 0, logn_shmem, d_output);
}
if (inverse) {
if (is_on_coset)
utils_internal::BatchMulKernel<E, S><<<num_blocks_coset, num_threads_coset, 0, stream>>>(
d_output, n, batch_size, arbitrary_coset ? arbitrary_coset : d_twiddles,
arbitrary_coset ? 1 : -coset_gen_index, -n_twiddles, logn, !dit, d_output);
utils_internal::NormalizeKernel<E, S>
<<<num_blocks_coset, num_threads_coset, 0, stream>>>(d_output, S::inv_log_size(logn), n * batch_size);
}
return CHK_LAST();
}
} // namespace
/**
* @struct Domain
* Struct containing information about the domain on which (i)NTT is evaluated i.e. twiddle factors.
* Twiddle factors are private, static and can only be set using [InitDomain](@ref InitDomain) function.
* The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
* @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
*/
template <typename S>
class Domain
{
// Mutex for protecting access to the domain/device container array
static inline std::mutex device_domain_mutex;
// The domain-per-device container - assumption is InitDomain is called once per device per program.
int max_size = 0;
int max_log_size = 0;
S* twiddles = nullptr;
bool initialized = false; // protection for multi-threaded case
std::unordered_map<S, int> coset_index = {};
S* internal_twiddles = nullptr; // required by mixed-radix NTT
S* basic_twiddles = nullptr; // required by mixed-radix NTT
// mixed-radix NTT supports a fast-twiddle option at the cost of additional 4N memory (where N is max NTT size)
S* fast_external_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_external_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_internal_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
S* fast_basic_twiddles_inv = nullptr; // required by mixed-radix NTT (fast-twiddles mode)
public:
template <typename U>
friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
template <typename U, typename E>
friend cudaError_t NTT<U, E>(E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};
template <typename S>
static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};
template <typename S>
cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
CHK_INIT_IF_RETURN();
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
// only generate twiddles if they haven't been generated yet
// please note that this offers just basic thread-safety,
// it's assumed a singleton (non-enforced) that is supposed
// to be initialized once per device per program lifetime
if (!domain.initialized) {
// Mutex is automatically released when lock goes out of scope, even in case of exceptions
std::lock_guard<std::mutex> lock(Domain<S>::device_domain_mutex);
// double-checked locking
if (domain.initialized) return CHK_LAST(); // another thread has already initialized the domain
bool found_logn = false;
S omega = primitive_root;
unsigned omegas_count = S::get_omegas_count();
for (int i = 0; i < omegas_count; i++) {
omega = S::sqr(omega);
if (!found_logn) {
++domain.max_log_size;
found_logn = omega == S::one();
if (found_logn) break;
}
}
domain.max_size = (int)pow(2, domain.max_log_size);
if (omega != S::one()) {
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "Primitive root provided to the InitDomain function is not in the subgroup");
}
// allocate and calculate twiddles on GPU
// Note: radix-2 INTT needs ONE in the last element (in addition to the first element), therefore we allocate n+1 elements
// Managed allocation allows host to read the elements (logn) without copying all (n) TFs back to host
CHK_IF_RETURN(cudaMallocManaged(&domain.twiddles, (domain.max_size + 1) * sizeof(S)));
CHK_IF_RETURN(generate_external_twiddles_generic(
primitive_root, domain.twiddles, domain.internal_twiddles, domain.basic_twiddles, domain.max_log_size,
ctx.stream));
if (fast_twiddles_mode) {
// generating fast-twiddles (note that this costs 4N additional memory)
CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles, domain.max_size * sizeof(S) * 2, ctx.stream));
CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles_inv, domain.max_size * sizeof(S) * 2, ctx.stream));
// fast-twiddles forward NTT
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root, domain.fast_external_twiddles, domain.fast_internal_twiddles, domain.fast_basic_twiddles,
domain.max_log_size, ctx.stream));
// fast-twiddles inverse NTT
S primitive_root_inv;
CHK_IF_RETURN(cudaMemcpyAsync(
&primitive_root_inv, &domain.twiddles[domain.max_size - 1], sizeof(S), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root_inv, domain.fast_external_twiddles_inv, domain.fast_internal_twiddles_inv,
domain.fast_basic_twiddles_inv, domain.max_log_size, ctx.stream));
}
CHK_IF_RETURN(cudaStreamSynchronize(ctx.stream));
const bool is_map_only_powers_of_primitive_root = true;
if (is_map_only_powers_of_primitive_root) {
// populate the coset_index map. Note that only powers of the primitive-root are stored (1, PR, PR^2, PR^4, PR^8
// etc.)
domain.coset_index[S::one()] = 0;
for (int i = 0; i < domain.max_log_size; ++i) {
const int index = (int)pow(2, i);
domain.coset_index[domain.twiddles[index]] = index;
}
} else {
// populate all values
for (int i = 0; i < domain.max_size; ++i) {
domain.coset_index[domain.twiddles[i]] = i;
}
}
domain.initialized = true;
}
return CHK_LAST();
}
template <typename S>
cudaError_t Domain<S>::ReleaseDomain(device_context::DeviceContext& ctx)
{
CHK_INIT_IF_RETURN();
max_size = 0;
max_log_size = 0;
cudaFreeAsync(twiddles, ctx.stream);
twiddles = nullptr;
cudaFreeAsync(internal_twiddles, ctx.stream);
internal_twiddles = nullptr;
cudaFreeAsync(basic_twiddles, ctx.stream);
basic_twiddles = nullptr;
coset_index.clear();
cudaFreeAsync(fast_external_twiddles, ctx.stream);
fast_external_twiddles = nullptr;
cudaFreeAsync(fast_internal_twiddles, ctx.stream);
fast_internal_twiddles = nullptr;
cudaFreeAsync(fast_basic_twiddles, ctx.stream);
fast_basic_twiddles = nullptr;
cudaFreeAsync(fast_external_twiddles_inv, ctx.stream);
fast_external_twiddles_inv = nullptr;
cudaFreeAsync(fast_internal_twiddles_inv, ctx.stream);
fast_internal_twiddles_inv = nullptr;
cudaFreeAsync(fast_basic_twiddles_inv, ctx.stream);
fast_basic_twiddles_inv = nullptr;
return CHK_LAST();
}
template <typename S>
static bool is_choose_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;
const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
// the specific GPU.
// the following heuristic is a simplification based on measurements. Users can try both and select the algorithm
// based on the specific case via the 'NTTConfig.ntt_algorithm' field
if (logn >= 16) return false; // mixed-radix is typically faster in those cases
if (logn <= 11) return true; // radix-2 is typically faster for batch<=256 in those cases
const int log_batch = (int)log2(batch_size);
return (logn + log_batch <= 18); // almost the cutoff point where both are equal
}
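// Worked instance of the heuristic above (derived from the branches, for illustration only):
// logn=20 -> mixed-radix regardless of batch (logn >= 16); logn=10 -> radix-2 (logn <= 11);
// logn=13 with batch_size=16 -> log_batch=4 and 13+4=17 <= 18 -> radix-2, while batch_size=64
// gives 13+6=19 > 18 -> mixed-radix. Workloads that don't match the heuristic can force a
// specific algorithm through NTTConfig::ntt_algorithm.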
template <typename S, typename E>
cudaError_t radix2_ntt(
E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
int max_size,
int batch_size,
bool is_inverse,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
{
CHK_INIT_IF_RETURN();
const int logn = int(log2(ntt_size));
bool dit = true;
bool reverse_input = false;
switch (ordering) {
case Ordering::kNN:
reverse_input = true;
break;
case Ordering::kNR:
case Ordering::kNM:
dit = false;
break;
case Ordering::kRR:
reverse_input = true;
dit = false;
break;
case Ordering::kRN:
case Ordering::kMN:
dit = true;
reverse_input = false;
}
if (reverse_input) reverse_order_batch(d_input, ntt_size, logn, batch_size, cuda_stream, d_output);
CHK_IF_RETURN(ntt_inplace_batch_template(
reverse_input ? d_output : d_input, ntt_size, twiddles, max_size, batch_size, logn, is_inverse, dit,
arbitrary_coset, coset_gen_index, cuda_stream, d_output));
return CHK_LAST();
}
template <typename S, typename E>
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
CHK_INIT_IF_RETURN();
Domain<S>& domain = domains_for_devices<S>[config.ctx.device_id];
if (size > domain.max_size) {
std::ostringstream oss;
oss << "NTT size=" << size
<< " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
int logn = int(log2(size));
const bool is_size_power_of_two = size == (1 << logn);
if (!is_size_power_of_two) {
std::ostringstream oss;
oss << "NTT size=" << size << " is not supported since it is not a power of two.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
cudaStream_t& stream = config.ctx.stream;
size_t batch_size = config.batch_size;
size_t input_size_bytes = (size_t)size * batch_size * sizeof(E);
bool are_inputs_on_device = config.are_inputs_on_device;
bool are_outputs_on_device = config.are_outputs_on_device;
E* d_input;
if (are_inputs_on_device) {
d_input = input;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_input, input_size_bytes, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream));
}
E* d_output;
if (are_outputs_on_device) {
d_output = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_output, input_size_bytes, stream));
}
S* coset = nullptr;
int coset_index = 0;
try {
coset_index = domain.coset_index.at(config.coset_gen);
} catch (...) {
// if coset index is not found in the subgroup, compute coset powers on CPU and move them to device
std::vector<S> h_coset;
h_coset.push_back(S::one());
S coset_gen = (dir == NTTDir::kInverse) ? S::inverse(config.coset_gen) : config.coset_gen;
for (int i = 1; i < size; i++) {
h_coset.push_back(h_coset.at(i - 1) * coset_gen);
}
CHK_IF_RETURN(cudaMallocAsync(&coset, size * sizeof(S), stream));
CHK_IF_RETURN(cudaMemcpyAsync(coset, &h_coset.front(), size * sizeof(S), cudaMemcpyHostToDevice, stream));
h_coset.clear();
}
const bool is_radix2_algorithm = is_choose_radix2_algorithm(logn, batch_size, config);
const bool is_inverse = dir == NTTDir::kInverse;
if (is_radix2_algorithm) {
CHK_IF_RETURN(ntt::radix2_ntt(
d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
coset_index, stream));
} else {
const bool is_on_coset = (coset_index != 0) || coset;
const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr) && !is_on_coset;
S* twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
: domain.twiddles;
S* internal_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
: domain.internal_twiddles;
S* basic_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
: domain.basic_twiddles;
CHK_IF_RETURN(ntt::mixed_radix_ntt(
d_input, d_output, twiddles, internal_twiddles, basic_twiddles, size, domain.max_log_size, batch_size,
is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index, stream));
}
if (!are_outputs_on_device)
CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, input_size_bytes, cudaMemcpyDeviceToHost, stream));
if (coset) CHK_IF_RETURN(cudaFreeAsync(coset, stream));
if (!are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_input, stream));
if (!are_outputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_output, stream));
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
template <typename S>
NTTConfig<S> DefaultNTTConfig()
{
device_context::DeviceContext ctx = device_context::get_default_device_context();
NTTConfig<S> config = {
ctx, // ctx
S::one(), // coset_gen
1, // batch_size
Ordering::kNN, // ordering
false, // are_inputs_on_device
false, // are_outputs_on_device
false, // is_async
NttAlgorithm::Auto, // ntt_algorithm
};
return config;
}
/**
* Extern "C" version of [InitDomain](@ref InitDomain) function with the following
* value of template parameter (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitializeDomain)(
curve_config::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return InitDomain(*primitive_root, ctx, fast_twiddles_mode);
}
/**
* Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
curve_config::scalar_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,
curve_config::scalar_t* output)
{
return NTT<curve_config::scalar_t, curve_config::scalar_t>(input, size, dir, config, output);
}
#if defined(ECNTT_DEFINED)
/**
* Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
* - `E` is the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
curve_config::projective_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,
curve_config::projective_t* output)
{
return NTT<curve_config::scalar_t, curve_config::projective_t>(input, size, dir, config, output);
}
#endif
} // namespace ntt

@@ -1,378 +1,139 @@
#ifndef NTT
#define NTT
#pragma once
#ifndef NTT_H
#define NTT_H
const uint32_t MAX_NUM_THREADS = 1024;
const uint32_t MAX_THREADS_BATCH = 256;
#include <cuda_runtime.h>
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
/**
* Computes the twiddle factors.
* Outputs: d_twiddles[i] = omega^i.
* @param d_twiddles input empty array.
* @param n_twiddles number of twiddle factors.
* @param omega multiplying factor.
* @namespace ntt
* Number Theoretic Transform, or NTT is a version of [fast Fourier
* transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) where instead of real or complex numbers, inputs and
* outputs belong to certain finite groups or fields. NTT computes the values of a polynomial \f$ p(x) = p_0 + p_1 \cdot
* x + \dots + p_{n-1} \cdot x^{n-1} \f$ on a special set of field elements called "roots of unity", or "twiddle factors" (optionally
* shifted by an additional element called "coset generator"): \f[ NTT(p) = \{ p(\omega^0), p(\omega^1), \dots,
* p(\omega^{n-1}) \} \f] Inverse NTT, or iNTT solves the inverse problem of computing coefficients of \f$ p(x) \f$
* given evaluations \f$ \{ p(\omega^0), p(\omega^1), \dots, p(\omega^{n-1}) \} \f$. If not specified otherwise,
* \f$ n \f$ is a power of 2.
*/
template < typename S > __global__ void twiddle_factors_kernel(S * d_twiddles, uint32_t n_twiddles, S omega) {
for (uint32_t i = 0; i < n_twiddles; i++) {
d_twiddles[i] = S::zero();
}
d_twiddles[0] = S::one();
for (uint32_t i = 0; i < n_twiddles - 1; i++) {
d_twiddles[i + 1] = omega * d_twiddles[i];
}
}
namespace ntt {
/**
* Allocates a twiddle factors array on the device and fills it with d_twiddles[i] = omega^i,
* returning a pointer to the device array.
* @param n_twiddles number of twiddle factors.
* @param omega multiplying factor.
*/
template < typename S > S * fill_twiddle_factors_array(uint32_t n_twiddles, S omega) {
size_t size_twiddles = n_twiddles * sizeof(S);
S * d_twiddles;
cudaMalloc( & d_twiddles, size_twiddles);
twiddle_factors_kernel<S> <<< 1, 1 >>> (d_twiddles, n_twiddles, omega);
return d_twiddles;
}
/**
* Generate a domain that supports all NTTs of sizes under a certain threshold. Note that this function might
* be expensive, so if possible it should be called before all time-critical operations.
* It's assumed that during program execution only the coset generator might change, but twiddles stay fixed, so
* they are initialized at the first call of this function and don't change afterwards.
* @param primitive_root Primitive root in field `S` of order \f$ 2^s \f$. This should be the smallest power-of-2
* order that's large enough to support any NTT you might want to perform.
* @param ctx Details related to the device such as its id and stream id.
* @param fast_twiddles_mode A mode where more memory is allocated for twiddle factors in exchange for faster compute.
* This mode requires an additional 4N memory, where N is the largest NTT size to be supported (which is derived
* from the primitive_root).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename S>
cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);
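// Illustrative usage (a minimal caller-side sketch, not part of this header): initialize the
// domain once per device before any NTT calls, as the example benchmark further below does.
// The log-size 24 and the use of the curve's scalar field are assumptions for the example only.
//
//   device_context::DeviceContext ctx = device_context::get_default_device_context();
//   ntt::InitDomain(curve_config::scalar_t::omega(24), ctx, /*fast_twiddles_mode=*/true);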
/**
* Returns the bit reversed order of a number.
* for example: on inputs num = 6 (110 in binary) and logn = 3
* the function should return 3 (011 in binary.)
* @param num some number with bit representation of size logn.
* @param logn length of bit representation of `num`.
* @return bit reversed order of `num`.
*/
__device__ __host__ uint32_t reverseBits(uint32_t num, uint32_t logn) {
unsigned int reverse_num = 0;
for (uint32_t i = 0; i < logn; i++) {
if ((num & (1 << i))) reverse_num |= 1 << ((logn - 1) - i);
}
return reverse_num;
}
/**
* @enum NTTDir
* Whether to perform normal forward NTT, or inverse NTT (iNTT). Mathematically, forward NTT computes polynomial
* evaluations from coefficients while inverse NTT computes coefficients from evaluations.
*/
enum class NTTDir { kForward, kInverse };
/**
* Returns the bit reversal ordering of the input array.
* for example: on input ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
* @param arr array of objects of type T whose size is a power of 2.
* @param n length of `arr`.
* @param logn log(n).
* @return A new array which is the bit reversed version of input array.
*/
template < typename T > T * template_reverse_order(T * arr, uint32_t n, uint32_t logn) {
T * arrReversed = new T[n];
for (uint32_t i = 0; i < n; i++) {
uint32_t reversed = reverseBits(i, logn);
arrReversed[i] = arr[reversed];
}
return arrReversed;
}
/**
* @enum Ordering
* How to order inputs and outputs of the NTT. If needed, use this field to specify decimation: decimation in time
* (DIT) corresponds to `Ordering::kRN` while decimation in frequency (DIF) to `Ordering::kNR`. Also, to specify
* butterfly to be used, select `Ordering::kRN` for Cooley-Tukey and `Ordering::kNR` for Gentleman-Sande. There's
* no implication that a certain decimation or butterfly will actually be used under the hood, this is just for
* compatibility with codebases that use "decimation" and "butterfly" to denote ordering of inputs and outputs.
*
* Ordering options are:
* - kNN: inputs and outputs are natural-order (example of natural ordering: \f$ \{a_0, a_1, a_2, a_3, a_4, a_5, a_6,
* a_7\} \f$).
* - kNR: inputs are natural-order and outputs are bit-reversed-order (example of bit-reversed ordering: \f$ \{a_0,
* a_4, a_2, a_6, a_1, a_5, a_3, a_7\} \f$).
* - kRN: inputs are bit-reversed-order and outputs are natural-order.
* - kRR: inputs and outputs are bit-reversed-order.
*
* Mixed-Radix NTT: digit-reversal is a generalization of bit-reversal where the latter is a special case with 1b
* digits. Mixed-radix NTTs of different sizes would generate different reordering of inputs/outputs. Having said
* that, for a given size N it is guaranteed that every two mixed-radix NTTs of size N would have the same
* digit-reversal pattern. The following orderings kNM and kMN are conceptually like kNR and kRN but for
* mixed-digit-reordering. Note that for the cases '(1) NTT, (2) elementwise ops and (3) INTT' kNM and kMN are most
* efficient.
* Note: kNR, kRN, kRR refer to the radix-2 NTT reversal pattern. Those cases are supported by mixed-radix NTT with
* reduced efficiency compared to kNM and kMN.
* - kNM: inputs are natural-order and outputs are digit-reversed-order (=mixed).
* - kMN: inputs are digit-reversed-order (=mixed) and outputs are natural-order.
*/
enum class Ordering { kNN, kNR, kRN, kRR, kNM, kMN };
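// Illustrative caller-side sketch of the NTT -> elementwise ops -> INTT pattern mentioned
// above: kNR for the forward transform and kRN for the inverse keep the intermediate data in
// bit-reversed order and avoid explicit reordering passes (kNM/kMN play the same role for the
// mixed-radix algorithm). The buffer names d_coeffs/d_evals and the pre-built `config` are
// assumptions for the example only.
//
//   config.ordering = ntt::Ordering::kNR;
//   ntt::NTT(d_coeffs, size, ntt::NTTDir::kForward, config, d_evals);  // evaluations, bit-reversed
//   // ... elementwise (order-independent) operations on d_evals ...
//   config.ordering = ntt::Ordering::kRN;
//   ntt::NTT(d_evals, size, ntt::NTTDir::kInverse, config, d_coeffs);  // coefficients, natural order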
template < typename T > __global__ void reverse_order_kernel(T* arr, T* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size) {
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (threadId < n * batch_size) {
int idx = threadId % n;
int batch_idx = threadId / n;
int idx_reversed = __brev(idx) >> (32 - logn);
arr_reversed[batch_idx * n + idx_reversed] = arr[batch_idx * n + idx];
}
}
/**
* @enum NttAlgorithm
* Which NTT algorithm to use. options are:
* - Auto: implementation selects automatically based on heuristic. This value is a good default for most cases.
* - Radix2: explicitly select radix-2 NTT algorithm
* - MixedRadix: explicitly select mixed-radix NTT algorithm
*/
enum class NttAlgorithm { Auto, Radix2, MixedRadix };
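// Illustrative sketch: overriding the Auto heuristic to force a specific algorithm, e.g. when
// benchmarking both paths for a given size (the comparison example further below does exactly
// this). `config` is assumed to be an NTTConfig obtained from DefaultNTTConfig().
//
//   config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;  // or ntt::NttAlgorithm::Radix2
//   ntt::NTT(d_in, size, ntt::NTTDir::kForward, config, d_out);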
/**
* Bit-reverses a batch of input arrays in-place inside GPU.
* for example: on input array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
* @param arr batch of arrays of some object of type T. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
* @param batch_size the size of the batch.
*/
template < typename T > void reverse_order_batch(T* arr, uint32_t n, uint32_t logn, uint32_t batch_size) {
T* arr_reversed;
cudaMalloc(&arr_reversed, n * batch_size * sizeof(T));
int number_of_threads = MAX_THREADS_BATCH;
int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
reverse_order_kernel <<<number_of_blocks, number_of_threads>>> (arr, arr_reversed, n, logn, batch_size);
cudaMemcpy(arr, arr_reversed, n * batch_size * sizeof(T), cudaMemcpyDeviceToDevice);
cudaFree(arr_reversed);
}
/**
* @struct NTTConfig
* Struct that encodes NTT parameters to be passed into the [NTT](@ref NTT) function.
*/
template <typename S>
struct NTTConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
S coset_gen; /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()`
* (corresponding to no coset being used). */
int batch_size; /**< The number of NTTs to compute. Default value: 1. */
Ordering ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value:
* `Ordering::kNN`. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
bool is_async; /**< Whether to run the NTT asynchronously. If set to `true`, the NTT function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the NTT
* function will block the current CPU thread. */
NttAlgorithm ntt_algorithm; /**< Explicitly select the NTT algorithm. Default value: Auto (the implementation
selects radix-2 or mixed-radix algorithm based on heuristics). */
};
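// Illustrative sketch: start from the defaults and override only what differs, as the examples
// in this release do. The concrete field values are assumptions for the example only.
//
//   auto config = ntt::DefaultNTTConfig<curve_config::scalar_t>();
//   config.batch_size = 16;                // sixteen same-size NTTs in one call
//   config.are_inputs_on_device = true;    // inputs already reside in GPU memory
//   config.are_outputs_on_device = true;   // leave the results on the GPU
//   config.is_async = true;                // caller synchronizes the stream explicitly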
/**
* Bit-reverses an input array in-place inside GPU.
* for example: on array ([a[0],a[1],a[2],a[3]], 4, 2) it returns
* [a[0],a[3],a[2],a[1]] (elements at indices 3 and 1 switch places).
* @param arr array of objects of type T whose size is a power of 2. Should be on GPU.
* @param n length of `arr`.
* @param logn log(n).
*/
template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn) {
reverse_order_batch(arr, n, logn, 1);
}
/**
* A function that returns the default value of [NTTConfig](@ref NTTConfig) for the [NTT](@ref NTT) function.
* @return Default value of [NTTConfig](@ref NTTConfig).
*/
template <typename S>
NTTConfig<S> DefaultNTTConfig();
/**
* Cooley-Tukey butterfly kernel.
* @param arr array of objects of type E (elements).
* @param twiddles array of twiddle factors of type S (scalars).
* @param n size of arr.
* @param n_twiddles size of twiddles.
* @param m "pair distance" - indicates the distance between butterfly inputs.
* @param i Cooley-Tukey FFT stage number.
* @param max_thread_num maximal number of threads in stage.
*/
template < typename E, typename S > __global__ void template_butterfly_kernel(E * arr, S * twiddles, uint32_t n, uint32_t n_twiddles, uint32_t m, uint32_t i, uint32_t max_thread_num) {
int j = (blockIdx.x * blockDim.x) + threadIdx.x;
if (j < max_thread_num) {
uint32_t g = j * (n / m);
uint32_t k = i + j + (m >> 1);
E u = arr[i + j];
E v = twiddles[g * n_twiddles / n] * arr[k];
arr[i + j] = u + v;
arr[k] = u - v;
}
}
/**
* A function that computes NTT or iNTT in-place. It's necessary to call [InitDomain](@ref InitDomain) with an
* appropriate primitive root before calling this function (only one call to `InitDomain` should suffice for all
* NTTs).
* @param input Input of the NTT. Length of this array needs to be \f$ size \cdot config.batch\_size \f$. Note
* that if inputs are in Montgomery form, the outputs will be as well and vice-versa: non-Montgomery inputs produce
* non-Montgomery outputs.
* @param size NTT size. If a batch of NTTs (which all need to have the same size) is computed, this is the size
* of 1 NTT, so it must equal the size of `input` divided by `config.batch_size`.
* @param dir Whether to compute forward or inverse NTT.
* @param config [NTTConfig](@ref NTTConfig) used in this NTT.
* @param output Buffer for the output of the NTT. Should be of the same size as `input`.
* @tparam E The type of inputs and outputs (i.e. coefficients \f$ \{p_i\} \f$ and values \f$ p(x) \f$). Must be a
* group.
* @tparam S The type of "twiddle factors" \f$ \{ \omega^i \} \f$. Must be a field. Often (but not always) `S=E`.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename S, typename E>
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
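// Illustrative end-to-end sketch (caller-side, host-resident data, error handling omitted).
// The buffer names and the log_size variable are assumptions for the example only.
//
//   using scalar_t = curve_config::scalar_t;
//   auto ctx = device_context::get_default_device_context();
//   ntt::InitDomain(scalar_t::omega(log_size), ctx);                 // once per device
//   auto config = ntt::DefaultNTTConfig<scalar_t>();                 // host in/out, batch 1, kNN
//   ntt::NTT(host_coeffs, 1 << log_size, ntt::NTTDir::kForward, config, host_evals);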
/**
* Multiply the elements of an input array by a scalar in-place.
* @param arr input array.
* @param n size of arr.
* @param scalar scalar of type S to multiply by (e.g. n^(-1) for iNTT normalization).
*/
template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) {
int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid < n) {
arr[tid] = scalar * arr[tid];
}
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_arr and d_twiddles are located in the device memory.
* @param d_arr input array of type E (elements) allocated on the device memory.
* @param n length of d_arr.
* @param logn log(n).
* @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of d_twiddles.
*/
template < typename E, typename S > void template_ntt_on_device_memory(E * d_arr, uint32_t n, uint32_t logn, S * d_twiddles, uint32_t n_twiddles) {
uint32_t m = 2;
for (uint32_t s = 0; s < logn; s++) {
for (uint32_t i = 0; i < n; i += m) {
uint32_t shifted_m = m >> 1;
uint32_t number_of_threads = MAX_NUM_THREADS ^ ((shifted_m ^ MAX_NUM_THREADS) & -(shifted_m < MAX_NUM_THREADS));
uint32_t number_of_blocks = shifted_m / MAX_NUM_THREADS + 1;
template_butterfly_kernel < E, S > <<< number_of_threads, number_of_blocks >>> (d_arr, d_twiddles, n, n_twiddles, m, i, m >> 1);
}
m <<= 1;
}
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of `arr`.
* @param d_twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of d_twiddles.
* @param inverse indicate if the result array should be normalized by n^(-1).
*/
template < typename E, typename S > E * ntt_template(E * arr, uint32_t n, S * d_twiddles, uint32_t n_twiddles, bool inverse) {
uint32_t logn = uint32_t(log(n) / log(2));
size_t size_E = n * sizeof(E);
E * arrReversed = template_reverse_order < E > (arr, n, logn);
E * d_arrReversed;
cudaMalloc( & d_arrReversed, size_E);
cudaMemcpy(d_arrReversed, arrReversed, size_E, cudaMemcpyHostToDevice);
template_ntt_on_device_memory < E, S > (d_arrReversed, n, logn, d_twiddles, n_twiddles);
if (inverse) {
int NUM_THREADS = MAX_NUM_THREADS;
int NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS;
template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arrReversed, n, S::inv_log_size(logn));
}
cudaMemcpy(arrReversed, d_arrReversed, size_E, cudaMemcpyDeviceToHost);
cudaFree(d_arrReversed);
return arrReversed;
}
/**
* Cooley-Tukey (scalar) NTT.
* @param arr input array of type E (element).
* @param n length of `arr`.
* @param inverse indicate if the result array should be normalized by n^(-1).
*/
template<typename E,typename S> uint32_t ntt_end2end_template(E * arr, uint32_t n, bool inverse) {
uint32_t logn = uint32_t(log(n) / log(2));
uint32_t n_twiddles = n;
S * twiddles = new S[n_twiddles];
S * d_twiddles;
if (inverse){
d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn));
} else{
d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn));
}
E * result = ntt_template < E, S > (arr, n, d_twiddles, n_twiddles, inverse);
for(int i = 0; i < n; i++){
arr[i] = result[i];
}
cudaFree(d_twiddles);
return 0; // TODO add
}
/**
* Returns the bit reversal ordering of the input array according to the batches *in place*.
* The assumption is that arr is divided into N tasks of size n;
* `task` indicates the index of the task (out of N).
* @param arr input array of type T.
* @param n length of arr.
* @param logn log(n).
* @param task index of the task (out of N).
*/
template < typename T > __device__ __host__ void reverseOrder_batch(T * arr, uint32_t n, uint32_t logn, uint32_t task) {
for (uint32_t i = 0; i < n; i++) {
uint32_t reversed = reverseBits(i, logn);
if (reversed > i) {
T tmp = arr[task * n + i];
arr[task * n + i] = arr[task * n + reversed];
arr[task * n + reversed] = tmp;
}
}
}
/**
* Cooley-Tukey butterfly kernel.
* @param arr array of objects of type E (elements).
* @param twiddles array of twiddle factors of type S (scalars).
* @param n size of arr.
* @param n_omegas size of omegas.
* @param m "pair distance" - indicates the distance between butterfly inputs.
* @param i Cooley-Tukey FFT stage number.
* @param offset offset corresponding to the specific task (in the batch).
*/
template < typename E, typename S > __device__ __host__ void butterfly(E * arrReversed, S * omegas, uint32_t n, uint32_t n_omegas, uint32_t m, uint32_t i, uint32_t j, uint32_t offset) {
uint32_t g = j * (n / m);
uint32_t k = i + j + (m >> 1);
E u = arrReversed[offset + i + j];
E v = omegas[g * n_omegas / n] * arrReversed[offset + k];
arrReversed[offset + i + j] = u + v;
arrReversed[offset + k] = u - v;
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of `arr`.
* @param twiddles twiddle factors of type S (scalars) array allocated on the device memory (must be a power of 2).
* @param n_twiddles length of twiddles.
* @param max_task max count of parallel tasks.
* @param s log2(n) loop index.
*/
template <typename E, typename S>
__global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_twiddles, uint32_t max_task, uint32_t s, bool rev)
{
int task = blockIdx.x;
int chunks = n / (blockDim.x * 2);
if (task < max_task)
{
// flattened loop allows parallel processing
uint32_t l = threadIdx.x;
uint32_t loop_limit = blockDim.x;
if (l < loop_limit)
{
uint32_t ntw_i = task % chunks;
uint32_t shift_s = 1 << s;
uint32_t shift2_s = 1 << (s + 1);
uint32_t n_twiddles_div = n_twiddles >> (s + 1);
l = ntw_i * blockDim.x + l; //to l from chunks to full
uint32_t j = l & (shift_s - 1); // Equivalent to: l % (1 << s)
uint32_t i = ((l / shift_s) * shift2_s) % n;
uint32_t k = i + j + shift_s;
uint32_t offset = (task / chunks) * n;
E u = arr[offset + i + j];
E v = rev ? arr[offset + k] : twiddles[j * n_twiddles_div] * arr[offset + k];
arr[offset + i + j] = u + v;
arr[offset + k] = u - v;
if (rev)
arr[offset + k] = twiddles[j * n_twiddles_div] * arr[offset + k];
}
}
}
/**
* Cooley-Tukey NTT.
* NOTE! this function assumes that d_twiddles are located in the device memory.
* @param arr input array of type E (elements).
* @param n length of arr.
* @param logn log2(n).
* @param max_task max count of parallel tasks.
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_rev_ord(E *arr, uint32_t n, uint32_t logn, uint32_t max_task)
{
int task = (blockIdx.x * blockDim.x) + threadIdx.x;
if (task < max_task)
{
reverseOrder_batch<E>(arr, n, logn, task);
}
}
/**
* Cooley-Tukey (scalar) NTT.
* This is a batched version - meaning it assumes that the input array
* consists of N arrays of size n. The function performs an n-size NTT on each small array.
* @param arr input array of type E.
* @param arr_size number of total elements = n * N.
* @param n size of each NTT in the batch.
* @param inverse indicate if the result array should be normalized by n^(-1).
*/
template <typename E, typename S> uint32_t ntt_end2end_batch_template(E * arr, uint32_t arr_size, uint32_t n, bool inverse) {
int batches = int(arr_size / n);
uint32_t logn = uint32_t(log(n) / log(2));
uint32_t n_twiddles = n; // one twiddle factor per element of a single NTT
size_t size_E = arr_size * sizeof(E);
S * d_twiddles;
if (inverse){
d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega_inv(logn));
} else{
d_twiddles = fill_twiddle_factors_array(n_twiddles, S::omega(logn));
}
E * d_arr;
cudaMalloc( & d_arr, size_E);
cudaMemcpy(d_arr, arr, size_E, cudaMemcpyHostToDevice);
int NUM_THREADS = MAX_THREADS_BATCH;
int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS;
ntt_template_kernel_rev_ord<E, S><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, logn, batches);
NUM_THREADS = min(n / 2, MAX_THREADS_BATCH);
int chunks = max(int((n / 2) / NUM_THREADS), 1);
int total_tasks = batches * chunks;
NUM_BLOCKS = total_tasks;
for (uint32_t s = 0; s < logn; s++) //TODO: this loop also can be unrolled
{
ntt_template_kernel<E, S><<<NUM_BLOCKS, NUM_THREADS>>>(d_arr, n, d_twiddles, n_twiddles, total_tasks, s, false);
}
if (inverse == true)
{
NUM_THREADS = MAX_NUM_THREADS;
NUM_BLOCKS = (arr_size + NUM_THREADS - 1) / NUM_THREADS;
template_normalize_kernel < E, S > <<< NUM_THREADS, NUM_BLOCKS >>> (d_arr, arr_size, S::inv_log_size(logn));
}
cudaMemcpy(arr, d_arr, size_E, cudaMemcpyDeviceToHost);
cudaFree(d_arr);
cudaFree(d_twiddles);
return 0;
}
} // namespace ntt
#endif

@@ -0,0 +1,46 @@
#pragma once
#ifndef _NTT_IMPL_H
#define _NTT_IMPL_H
#include <stdint.h>
#include "appUtils/ntt/ntt.cuh" // for enum Ordering
namespace ntt {
template <typename S>
cudaError_t generate_external_twiddles_generic(
const S& basic_root,
S* external_twiddles,
S*& internal_twiddles,
S*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template <typename S>
cudaError_t generate_external_twiddles_fast_twiddles_mode(
const S& basic_root,
S* external_twiddles,
S*& internal_twiddles,
S*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
int ntt_size,
int max_logn,
int batch_size,
bool is_inverse,
bool fast_tw,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
} // namespace ntt
#endif //_NTT_IMPL_H

@@ -0,0 +1,173 @@
#define CURVE_ID BLS12_381
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "curves/curve_config.cuh"
#include "ntt/ntt.cu"
#include "ntt/ntt_impl.cuh"
#include <memory>
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
#include "kernel_ntt.cu"
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
res[i] = i < 1000 ? test_data::rand_host() : res[i - 1000];
}
void incremental_values(test_scalar* res, uint32_t count)
{
for (int i = 0; i < count; i++) {
res[i] = i ? res[i - 1] + test_scalar::one() : test_scalar::zero();
}
}
int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
float icicle_time, new_time;
int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : true;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 1;
int COSET_IDX = (argc > 5) ? atoi(argv[5]) : 0;
const ntt::Ordering ordering = (argc > 6) ? ntt::Ordering(atoi(argv[6])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 7) ? atoi(argv[7]) : true;
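// Example invocation (binary name is illustrative; positional arguments follow the parsing above):
//   ./ntt_benchmark 22 0 1 4 0 0 1
// i.e. size 2^22, out-of-place, inverse NTT, batch of 4, no coset, kNN ordering, fast twiddles on.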
// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
: ordering == ntt::Ordering::kNR ? "NR"
: ordering == ntt::Ordering::kRN ? "RN"
: ordering == ntt::Ordering::kRR ? "RR"
: ordering == ntt::Ordering::kNM ? "NM"
: "MN";
printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, coset-idx=%d, ordering=%s, fast_tw=%d\n", NTT_LOG_SIZE,
INPLACE, INV, BATCH_SIZE, COSET_IDX, ordering_str, FAST_TW);
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
// init domain
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
ntt_config.ordering = ordering;
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
CHK_IF_RETURN(cudaEventCreate(&new_start));
CHK_IF_RETURN(cudaEventCreate(&new_stop));
auto start = std::chrono::high_resolution_clock::now();
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx, FAST_TW);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;
// cpu allocation
auto CpuScalars = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputOld = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
// gpu allocation
test_data *GpuScalars, *GpuOutputOld, *GpuOutputNew;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
// init inputs
// incremental_values(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
random_samples(CpuScalars.get(), NTT_SIZE * BATCH_SIZE);
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
// inplace
if (INPLACE) {
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
ntt_config.coset_gen = ntt_config.coset_gen * basic_root;
}
auto benchmark = [&](bool is_print, int iterations) -> cudaError_t {
// NEW
CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::NTT(
INPLACE ? GpuOutputNew : GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config,
GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(
ntt::NTT(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
}
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
printf("New Runtime=%0.3f MS\n", new_time / iterations);
}
return CHK_LAST();
};
CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));
// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
CHK_IF_RETURN(
cudaMemcpy(CpuOutputOld.get(), GpuOutputOld, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
}
}
const char* success_str = success ? "SUCCESS!" : "FAIL!";
printf("%s\n", success_str);
CHK_IF_RETURN(cudaFree(GpuScalars));
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));
return CHK_LAST();
}

@@ -0,0 +1,627 @@
#ifndef T_NTT
#define T_NTT
#pragma once
#include <stdio.h>
#include <stdint.h>
#include "curves/curve_config.cuh"
struct stage_metadata {
uint32_t th_stride;
uint32_t ntt_block_size;
uint32_t ntt_block_id;
uint32_t ntt_inp_id;
};
#define STAGE_SIZES_DATA \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA;
__device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA;
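// Reading the table above (an interpretation for orientation, not normative): row `logn` lists
// the log-sizes (4, 5 or 6, i.e. radix-16/32/64 sub-NTTs) of the stages chained to build an NTT
// of size 2^logn, zero-padded to 5 entries; each row sums to its index, e.g. row 20 = {5,5,5,5,0}
// is four radix-32 stages and row 19 = {6,5,4,4,0} gives 6+5+4+4 = 19. Rows 0-3 and 7 are all
// zeros, i.e. not covered by this decomposition, which matches the dispatcher falling back to
// radix-2 for logn <= 3 and logn == 7.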
// construction for fast-twiddles
uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 6, 6, 9, 9, 10,
11, 11, 12, 15, 15, 16, 16, 18, 18, 20, 21, 21, 22, 23, 24};
#define STAGE_SIZES_DATA_FAST_TW \
{ \
{0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \
{6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \
{6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \
{6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \
{6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \
{6, 6, 6, 6, 6}, \
}
uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
__device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;
template <typename E, typename S>
class NTTEngine
{
public:
E X[8];
S WB[3];
S WI[7];
S WE[8];
__device__ __forceinline__ void loadBasicTwiddles(S* basic_twiddles)
{
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}
__device__ __forceinline__ void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}
__device__ __forceinline__ void loadInternalTwiddles64(S* data, bool stride)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}
__device__ __forceinline__ void loadInternalTwiddles32(S* data, bool stride)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}
__device__ __forceinline__ void loadInternalTwiddles16(S* data, bool stride)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}
__device__ __forceinline__ void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}
__device__ __forceinline__ void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
__device__ __forceinline__ void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
__device__ __forceinline__ void
loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}
__device__ __forceinline__ void
loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}
__device__ __forceinline__ void
loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}
__device__ __forceinline__ void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
WE[i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
__device__ __forceinline__ void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
WE[4 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
__device__ __forceinline__ void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
WE[2 * j + i] = data[(inv && exp) ? ((1 << tw_log_size) - exp) : exp];
}
}
}
__device__ __forceinline__ void loadGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}
__device__ __forceinline__ void storeGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}
__device__ __forceinline__ void loadGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
__device__ __forceinline__ void storeGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}
__device__ __forceinline__ void loadGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
__device__ __forceinline__ void storeGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
} else {
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}
__device__ __forceinline__ void ntt4_2()
{
#pragma unroll
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}
__device__ __forceinline__ void ntt2_4()
{
#pragma unroll
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}
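// Radix-2 butterfly: (X0, X1) -> (X0 + X1, X0 - X1).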
__device__ __forceinline__ void ntt2(E& X0, E& X1)
{
E T;
T = X0 + X1;
X1 = X0 - X1;
X0 = T;
}
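// 4-point DIF butterfly producing natural-order output, assuming WB[0] holds
// the primitive 4th root of unity w here: the results are
//   X0+X1+X2+X3, (X0-X2) + w*(X1-X3), (X0+X2) - (X1+X3), (X0-X2) - w*(X1-X3).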
__device__ __forceinline__ void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 + X2;
X2 = X0 - X2;
X0 = X1 + X3;
X1 = X1 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X1 = X1 * WB[0];
X3 = X2 - X1;
X1 = X2 + X1;
X2 = T - X0;
X0 = T + X0;
}
// rbo version
__device__ __forceinline__ void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
{
E T;
T = X0 - X1;
X0 = X0 + X1;
X1 = X2 + X3;
X3 = X2 - X3; // T has X0, X0 has X1, X2 has X2, X1 has X3
X3 = X3 * WB[0];
X2 = X0 - X1;
X0 = X0 + X1;
X1 = T + X3;
X3 = T - X3;
}
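// 8-point butterfly built from three radix-2 layers; the fixed per-size
// twiddle factors come from WB[].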
__device__ __forceinline__ void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
E T;
// out of 56,623,104 possible mappings, we have:
T = X3 - X7;
X7 = X3 + X7;
X3 = X1 - X5;
X5 = X1 + X5;
X1 = X2 + X6;
X2 = X2 - X6;
X6 = X0 + X4;
X0 = X0 - X4;
T = T * WB[1];
X2 = X2 * WB[1];
X4 = X6 + X1;
X6 = X6 - X1;
X1 = X3 + T;
X3 = X3 - T;
T = X5 + X7;
X5 = X5 - X7;
X7 = X0 + X2;
X0 = X0 - X2;
X1 = X1 * WB[0];
X5 = X5 * WB[1];
X3 = X3 * WB[2];
X2 = X6 + X5;
X6 = X6 - X5;
X5 = X7 - X1;
X1 = X7 + X1;
X7 = X0 - X3;
X3 = X0 + X3;
X0 = X4 + T;
X4 = X4 - T;
}
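// Variant of the 8-point butterfly that works directly on the X[] registers
// and uses four multiplications instead of the five in ntt8.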
__device__ __forceinline__ void ntt8win()
{
E T;
T = X[3] - X[7];
X[7] = X[3] + X[7];
X[3] = X[1] - X[5];
X[5] = X[1] + X[5];
X[1] = X[2] + X[6];
X[2] = X[2] - X[6];
X[6] = X[0] + X[4];
X[0] = X[0] - X[4];
X[2] = X[2] * WB[0];
X[4] = X[6] + X[1];
X[6] = X[6] - X[1];
X[1] = X[3] + T;
X[3] = X[3] - T;
T = X[5] + X[7];
X[5] = X[5] - X[7];
X[7] = X[0] + X[2];
X[0] = X[0] - X[2];
X[1] = X[1] * WB[1];
X[5] = X[5] * WB[0];
X[3] = X[3] * WB[2];
X[2] = X[6] + X[5];
X[6] = X[6] - X[5];
X[5] = X[1] + X[3];
X[3] = X[1] - X[3];
X[1] = X[7] + X[5];
X[5] = X[7] - X[5];
X[7] = X[0] - X[3];
X[3] = X[0] + X[3];
X[0] = X[4] + T;
X[4] = X[4] - T;
}
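// Shared-memory exchange: the SharedData<size>{Columns,Rows}* helpers store
// (store == true) or load the 8 per-thread registers into a <size>-element
// tile, switching between column-major and row-major placement so the next
// stage reads the data transposed.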
__device__ __forceinline__ void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + i * 8 + column_id];
}
}
}
__device__ __forceinline__ void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 64 + row_id * 8 + i];
}
}
}
__device__ __forceinline__ void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + i * 4 + column_id];
}
}
}
__device__ __forceinline__ void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 32 + row_id * 8 + i];
}
}
}
__device__ __forceinline__ void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + i * 8 + column_id + j];
}
}
}
}
__device__ __forceinline__ void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
} else {
X[4 * j + i] = shmem[ntt_id * 32 + row_id * 4 + 4 * j + i];
}
}
}
}
__device__ __forceinline__ void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + i * 2 + column_id];
}
}
}
__device__ __forceinline__ void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
} else {
X[i] = shmem[ntt_id * 16 + row_id * 8 + i];
}
}
}
__device__ __forceinline__ void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + i * 8 + column_id + j];
}
}
}
}
__device__ __forceinline__ void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
} else {
X[2 * j + i] = shmem[ntt_id * 16 + row_id * 2 + 2 * j + i];
}
}
}
}
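// Twiddle application: internal twiddles skip X[0] (its factor is one);
// external twiddles multiply all 8 registers.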
__device__ __forceinline__ void twiddlesInternal()
{
#pragma unroll
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}
__device__ __forceinline__ void twiddlesExternal()
{
#pragma unroll
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}
}
};
#endif
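
As a sanity check on the butterfly algebra above, the following standalone host-side sketch (not part of this diff) compares the ntt4 operation sequence against the naive 4-point NTT over the toy field Z_17, with W = 4 playing the role that WB[0] must play for ntt4 to compute the transform (4^2 = -1 mod 17); the field, the inputs and the helper names are illustrative only.

// sanity_check_ntt4.cpp -- illustrative only, not part of the diff
#include <cstdint>
#include <cstdio>

static const uint32_t P = 17, W = 4; // toy modulus and 4th root of unity (W*W == P - 1)

static uint32_t add(uint32_t a, uint32_t b) { return (a + b) % P; }
static uint32_t sub(uint32_t a, uint32_t b) { return (a + P - b) % P; }
static uint32_t mul(uint32_t a, uint32_t b) { return (a * b) % P; }

int main()
{
  uint32_t x[4] = {3, 7, 11, 5};

  // Naive definition: naive[k] = sum_j x[j] * W^(j*k) mod P.
  uint32_t naive[4];
  for (uint32_t k = 0; k < 4; k++) {
    uint32_t acc = 0;
    for (uint32_t j = 0; j < 4; j++) {
      uint32_t wjk = 1;
      for (uint32_t e = 0; e < j * k; e++) wjk = mul(wjk, W);
      acc = add(acc, mul(x[j], wjk));
    }
    naive[k] = acc;
  }

  // Same operation sequence as ntt4 above, with +, -, * replaced by the modular helpers.
  uint32_t X0 = x[0], X1 = x[1], X2 = x[2], X3 = x[3], T;
  T = add(X0, X2);
  X2 = sub(X0, X2);
  X0 = add(X1, X3);
  X1 = sub(X1, X3);
  X1 = mul(X1, W); // W stands in for WB[0]
  X3 = sub(X2, X1);
  X1 = add(X2, X1);
  X2 = sub(T, X0);
  X0 = add(T, X0);

  uint32_t butterfly[4] = {X0, X1, X2, X3};
  for (int k = 0; k < 4; k++)
    printf("k=%d naive=%u butterfly=%u\n", k, naive[k], butterfly[k]); // should match line by line
  return 0;
}

For this input both columns print 9, 0, 2, 1, confirming that ntt4 emits its results in natural order.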

icicle/appUtils/poseidon/.gitignore

@@ -0,0 +1 @@
test_poseidon


@@ -0,0 +1,3 @@
test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
nvcc -o test_poseidon -I. -I../.. test.cu
./test_poseidon


@@ -0,0 +1,118 @@
#include "poseidon.cuh"
/// These are pre-calculated constants for different curves
#if CURVE_ID == BN254
#include "appUtils/poseidon/constants/bn254_poseidon.h"
using namespace poseidon_constants_bn254;
#elif CURVE_ID == BLS12_381
#include "appUtils/poseidon/constants/bls12_381_poseidon.h"
using namespace poseidon_constants_bls12_381;
#elif CURVE_ID == BLS12_377
#include "appUtils/poseidon/constants/bls12_377_poseidon.h"
using namespace poseidon_constants_bls12_377;
#elif CURVE_ID == BW6_761
#include "appUtils/poseidon/constants/bw6_761_poseidon.h"
using namespace poseidon_constants_bw6_761;
#elif CURVE_ID == GRUMPKIN
#include "appUtils/poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#endif
namespace poseidon {
template <typename S>
cudaError_t create_optimized_poseidon_constants(
int arity,
int full_rounds_half,
int partial_rounds,
const S* constants,
device_context::DeviceContext& ctx,
PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
int width = arity + 1;
int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
int mds_matrix_len = width * width;
int sparse_matrices_len = (width * 2 - 1) * partial_rounds;
int constants_len = round_constants_len + mds_matrix_len * 2 + sparse_matrices_len;
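// The single device allocation below is laid out in this order:
// [round constants | MDS matrix | non-sparse matrix | sparse matrices],
// which is why constants_len counts the MDS matrix length twice.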
// Malloc memory for copying constants
S* d_constants;
CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
// Copy constants
CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
S* round_constants = d_constants;
S* mds_matrix = round_constants + round_constants_len;
S* non_sparse_matrix = mds_matrix + mds_matrix_len;
S* sparse_matrices = non_sparse_matrix + mds_matrix_len;
// Pick the domain_tag accordingly
// For now, we only support Merkle tree mode
uint32_t tree_domain_tag_value = 1;
tree_domain_tag_value = (tree_domain_tag_value << (width - 1)) - tree_domain_tag_value;
S domain_tag = S::from(tree_domain_tag_value);
// Make sure all the constants have been copied
CHK_IF_RETURN(cudaStreamSynchronize(stream));
*poseidon_constants = {arity, partial_rounds, full_rounds_half, round_constants,
mds_matrix, non_sparse_matrix, sparse_matrices, domain_tag};
return CHK_LAST();
}
template <typename S>
cudaError_t init_optimized_poseidon_constants(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
int full_rounds_half = FULL_ROUNDS_DEFAULT;
int partial_rounds;
unsigned char* constants;
switch (arity) {
case 2:
constants = poseidon_constants_2;
partial_rounds = partial_rounds_2;
break;
case 4:
constants = poseidon_constants_4;
partial_rounds = partial_rounds_4;
break;
case 8:
constants = poseidon_constants_8;
partial_rounds = partial_rounds_8;
break;
case 11:
constants = poseidon_constants_11;
partial_rounds = partial_rounds_11;
break;
default:
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "init_optimized_poseidon_constants: #arity must be one of [2, 4, 8, 11]");
}
S* h_constants = reinterpret_cast<S*>(constants);
create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);
return CHK_LAST();
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, CreateOptimizedPoseidonConstants)(
int arity,
int full_rounds_half,
int partial_rounds,
const curve_config::scalar_t* constants,
device_context::DeviceContext& ctx,
PoseidonConstants<curve_config::scalar_t>* poseidon_constants)
{
return create_optimized_poseidon_constants<curve_config::scalar_t>(
arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
}
extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitOptimizedPoseidonConstants)(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<curve_config::scalar_t>* constants)
{
return init_optimized_poseidon_constants<curve_config::scalar_t>(arity, ctx, constants);
}
} // namespace poseidon
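
To make the size arithmetic above concrete, here is a minimal standalone sketch (not part of this diff); the round counts are placeholders, since the real values come from the generated per-curve constants headers (e.g. partial_rounds_2), which are not shown here.

// poseidon_constants_layout.cpp -- illustrative only, placeholder round counts
#include <cstdio>

int main()
{
  int arity = 2, full_rounds_half = 4, partial_rounds = 56; // placeholder values
  int width = arity + 1;
  int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
  int mds_matrix_len = width * width;
  int sparse_matrices_len = (width * 2 - 1) * partial_rounds;
  int constants_len = round_constants_len + mds_matrix_len * 2 + sparse_matrices_len;
  printf("round constants: %d, MDS: %d (stored twice), sparse: %d, total: %d field elements\n",
         round_constants_len, mds_matrix_len, sparse_matrices_len, constants_len);
  return 0;
}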

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff