Compare commits


5 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| ImmanuelSegol | b5364c24dd | refactor | 2024-02-28 11:41:09 -04:00 |
| ImmanuelSegol | c2b73aee8d | refactor | 2024-02-28 11:37:25 -04:00 |
| ImmanuelSegol | 49663d89d3 | refactor | 2024-02-28 11:29:29 -04:00 |
| ImmanuelSegol | dd509f095b | refactor | 2024-02-28 11:27:33 -04:00 |
| ImmanuelSegol | 9449ffd7cb | refactor | 2024-02-28 11:19:59 -04:00 |
635 changed files with 9550 additions and 41908 deletions

View File

@@ -3,4 +3,3 @@ crate
lmit
mut
uint
dout

View File

@@ -1,10 +1,10 @@
golang:
- wrappers/golang/**/*.go
- wrappers/golang/**/*.h
- wrappers/golang/**/*.tmpl
- wrappers/golang/**/*.go'
- wrappers/golang/**/*.h'
- wrappers/golang/**/*.tmpl'
- go.mod
rust:
- wrappers/rust/**/*
- wrappers/rust
cpp:
- icicle/**/*.cu
- icicle/**/*.cuh

View File

@@ -1,39 +0,0 @@
name: Check Changed Files
on:
workflow_call:
outputs:
golang:
description: "Flag for if GoLang files changed"
value: ${{ jobs.check-changed-files.outputs.golang }}
rust:
description: "Flag for if Rust files changed"
value: ${{ jobs.check-changed-files.outputs.rust }}
cpp_cuda:
description: "Flag for if C++/CUDA files changed"
value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"

View File

@@ -4,14 +4,14 @@ on:
pull_request:
branches:
- main
- V2
- dev
jobs:
spelling-checker:
name: Check Spelling
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- uses: codespell-project/actions-codespell@v2
with:
# https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip

View File

@@ -1,74 +0,0 @@
name: C++/CUDA
on:
pull_request:
branches:
- main
- V2
push:
branches:
- main
- V2
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check clang-format
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
test-linux-curve:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Build curve
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve }} -DG2=ON -S . -B build
cmake --build build -j
- name: Run C++ curve Tests
working-directory: ./icicle/build/tests
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest
test-linux-field:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
field: [babybear]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Build field
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build && rm -rf build/*
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DFIELD=${{ matrix.field }} -DEXT_FIELD=ON -S . -B build
cmake --build build -j
- name: Run C++ field Tests
working-directory: ./icicle/build/tests
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest

View File

@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- 'docs/**'
- 'docs/*'
permissions:
contents: write

View File

@@ -11,29 +11,24 @@ on:
pull_request:
branches:
- main
- V2
- dev
push:
branches:
- main
- V2
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
run-examples:
jobs:
test-examples:
runs-on: [self-hosted, Linux, X64, icicle, examples]
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v2
- name: c++ examples
working-directory: ./examples/c++
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +42,6 @@ jobs:
done
- name: Rust examples
working-directory: ./examples/rust
if: needs.check-changed-files.outputs.rust == 'true'
run: |
# loop over all directories in the current directory
for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do

View File

@@ -1,189 +0,0 @@
name: GoLang
on:
pull_request:
branches:
- main
- V2
push:
branches:
- main
- V2
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Check gofmt
if: needs.check-changed-files.outputs.golang == 'true'
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
build-curves-linux:
name: Build curves on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
curve:
- name: bn254
build_args: -g2 -ecntt
- name: bls12_381
build_args: -g2 -ecntt
- name: bls12_377
build_args: -g2 -ecntt
- name: bw6_761
build_args: -g2 -ecntt
- name: grumpkin
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
retention-days: 1
build-fields-linux:
name: Build fields on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
field:
- name: babybear
build_args: -field-ext
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
retention-days: 1
build-hashes-linux:
name: Build hashes on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
matrix:
hash:
- name: keccak
build_args:
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }} # builds a single hash algorithm
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.hash.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_hash.a
retention-days: 1
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-curves-linux, build-fields-linux, build-hashes-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Download ICICLE lib artifacts
uses: actions/download-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
path: ./icicle/build/lib
merge-multiple: true
- name: Run Tests
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# -count ensures the test results are not cached
# -p controls the number of programs that can be run in parallel
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test ./... -count=1 -failfast -p 2 -timeout 60m
# TODO: bw6 on windows requires more memory than the standard runner has
# Add a large runner and then enable this job
# build-windows:
# name: Build on Windows
# runs-on: windows-2022
# needs: [check-changed-files, check-format]
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v4
# - name: Setup go
# uses: actions/setup-go@v5
# with:
# go-version: '1.20.0'
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# id: cuda-toolkit
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
# - name: Build libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# working-directory: ./wrappers/golang
# env:
# CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# shell: pwsh
# run: ./build.ps1 ${{ matrix.curve }} ON # builds a single curve with G2 enabled

.github/workflows/main-build.yml (vendored, new file, 119 lines)

@@ -0,0 +1,119 @@
name: Build
on:
pull_request:
branches:
- main
- dev
push:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
build-rust-linux:
name: Build Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build Rust
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
build-rust-windows:
name: Build Rust on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Download and Install Cuda
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build Rust Targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
build-golang-linux:
name: Build Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/golang
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh ${{ matrix.curve }} ON
# TODO: Add once Golang make file supports building for Windows
# build-golang-windows:
# name: Build Golang on Windows
# runs-on: windows-2022
# needs: check-changed-files
# steps:
# - name: Checkout Repo
# uses: actions/checkout@v3
# - name: Download and Install Cuda
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# uses: Jimver/cuda-toolkit@v0.2.11
# with:
# cuda: '12.0.0'
# method: 'network'
# # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
# sub-packages: '["cudart", "nvcc", "thrust"]'
# - name: Build cpp libs
# if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# run: make all
# working-directory: ./goicicle

.github/workflows/main-format.yml (vendored, new file, 47 lines)

@@ -0,0 +1,47 @@
name: Format
on:
pull_request:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
formatting-rust:
name: Check Rust Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check rustfmt
working-directory: ./wrappers/rust
# "-name tagret -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
# before building the project.
# e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
# causing rustfmt to fail.
run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
# - name: Check clippy
# run: cargo clippy --no-deps --all-features --all-targets
formatting-golang:
name: Check Golang Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check gofmt
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
formatting-cpp-cuda:
name: Check C++/CUDA Code Formatting
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Check clang-format
run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi

.github/workflows/main-test.yml (vendored, new file, 99 lines)

@@ -0,0 +1,99 @@
name: Test
on:
pull_request:
branches:
- main
- dev
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
ARCH_TYPE: native
jobs:
check-changed-files:
name: Check Changed Files
runs-on: ubuntu-22.04
outputs:
golang: ${{ steps.changed_files.outputs.golang }}
rust: ${{ steps.changed_files.outputs.rust }}
cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Get all changed files
id: changed-files-yaml
uses: tj-actions/changed-files@v39
# https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
with:
files_yaml_from_source_file: .github/changed-files.yml
- name: Run Changed Files script
id: changed_files
# https://github.com/tj-actions/changed-files#outputs-
run: |
echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
test-rust-linux:
name: Test Rust on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Run Rust Tests
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Running tests from the root workspace will run all workspace members' tests by default
# We need to limit the number of threads to avoid running out of memory on weaker machines
run: cargo test --release --verbose --features=g2 -- --test-threads=2
test-cpp-linux:
name: Test C++ on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
strategy:
matrix:
curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build C++
working-directory: ./icicle
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
mkdir -p build
cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
cmake --build build
- name: Run C++ Tests
working-directory: ./icicle/build
if: needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ctest
test-golang-linux:
name: Test Golang on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: check-changed-files
# strategy:
# matrix:
# curve: [bn254, bls12_381, bls12_377, bw6_761]
steps:
- name: Checkout Repo
uses: actions/checkout@v3
- name: Build CUDA libs
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# builds all curves with g2 ON
run: |
export CPATH=$CPATH:/usr/local/cuda/include
./build.sh all ON
- name: Run Golang Tests
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
export CPATH=$CPATH:/usr/local/cuda/include
go test --tags=g2 ./... -count=1 -timeout 60m

View File

@@ -1,50 +0,0 @@
name: Release
on:
workflow_dispatch:
inputs:
releaseType:
description: 'Release type'
required: true
default: 'minor'
type: choice
options:
- patch
- minor
- major
jobs:
release:
name: Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
ssh-key: ${{ secrets.DEPLOY_KEY }}
- name: Setup Cache
id: cache
uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
key: ${{ runner.os }}-cargo-${{ hashFiles('~/.cargo/bin/cargo-workspaces') }}
- name: Install cargo-workspaces
if: steps.cache.outputs.cache-hit != 'true'
run: cargo install cargo-workspaces
- name: Bump rust crate versions, commit, and tag
working-directory: wrappers/rust
# https://github.com/pksunkara/cargo-workspaces?tab=readme-ov-file#version
run: |
git config user.name release-bot
git config user.email release-bot@ingonyama.com
cargo workspaces version ${{ inputs.releaseType }} -y --no-individual-tags -m "Bump rust crates' version"
- name: Create draft release
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LATEST_TAG=$(git describe --tags --abbrev=0)
gh release create $LATEST_TAG --generate-notes -d --verify-tag -t "Release $LATEST_TAG"

View File

@@ -1,104 +0,0 @@
name: Rust
on:
pull_request:
branches:
- main
- V2
push:
branches:
- main
- V2
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changed-files:
uses: ./.github/workflows/check-changed-files.yml
check-format:
name: Check Code Format
runs-on: ubuntu-22.04
needs: check-changed-files
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Check rustfmt
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
working-directory: ./wrappers/rust
# "-name target -prune" removes searching in any directory named "target"
# Formatting by single file is necessary due to generated files not being present
# before building the project.
# e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
# causing rustfmt to fail.
run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
build-linux:
name: Build on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Build
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Run tests
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# Running tests from the root workspace will run all workspace members' tests by default
# We need to limit the number of threads to avoid running out of memory on weaker machines
# ignored tests are polynomial tests. Since they conflict with NTT tests, they are executed separately
run: |
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2 --ignored
cargo test --workspace --exclude icicle-babybear --exclude icicle-stark252 --release --verbose --features=g2 -- --test-threads=2
- name: Run baby bear tests
working-directory: ./wrappers/rust/icicle-fields/icicle-babybear
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
cargo test --release --verbose -- --ignored
cargo test --release --verbose
- name: Run stark252 tests
working-directory: ./wrappers/rust/icicle-fields/icicle-stark252
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: |
cargo test --release --verbose -- --ignored
cargo test --release --verbose
build-windows:
name: Build on Windows
runs-on: windows-2022
needs: check-changed-files
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Download and Install Cuda
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.0.0'
method: 'network'
# https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
- name: Build targets
working-directory: ./wrappers/rust
if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
env:
CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
# Building from the root workspace will build all members of the workspace by default
run: cargo build --release --verbose

View File

@@ -9,7 +9,7 @@ on:
jobs:
test-deploy:
name: Test deployment of docs website
name: Test deployment of docs webiste
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

.gitignore (vendored, 2 lines changed)

@@ -16,6 +16,6 @@
**/Cargo.lock
**/icicle/build/
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build*
**/build
**/icicle/appUtils/large_ntt/work
icicle/appUtils/large_ntt/work/test_ntt

View File

@@ -15,7 +15,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
# Install Golang
ENV GOLANG_VERSION 1.21.1
RUN curl -L https://go.dev/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
ENV PATH="/usr/local/go/bin:${PATH}"
# Set the working directory in the container

View File

@@ -1,6 +1,6 @@
# ICICLE
<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>
**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**
<p align="center">
<img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
@@ -11,12 +11,10 @@
</a>
<a href="https://twitter.com/intent/follow?screen_name=Ingo_zk">
<img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
<a href="https://github.com/ingonyama-zk/icicle/releases">
<img src="https://img.shields.io/github/v/release/ingonyama-zk/icicle" alt="GitHub Release">
</a>
<img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
</p>
## Background
Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
@@ -115,10 +113,8 @@ This will ensure our custom hooks are run and will make it easier to follow our
- [Robik](https://github.com/robik75), for his ongoing support and mentorship
- [liuxiao](https://github.com/liuxiaobleach), for being a top notch bug smasher
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab
- [gkigiermo](https://github.com/gkigiermo), for making it intuitive to use ICICLE in Google Colab.
- [nonam3e](https://github.com/nonam3e), for adding Grumpkin curve support into ICICLE
- [alxiong](https://github.com/alxiong), for adding warmup for CudaStream
- [cyl19970726](https://github.com/cyl19970726), for updating go install source in Dockerfile
## Help & Support

View File

@@ -1,181 +0,0 @@
# ICICLE Core
ICICLE Core is a library written in C++/CUDA. All the ICICLE primitives are implemented within ICICLE Core.
The Core is split into logical modules that can be compiled into static libraries using different [strategies](#compilation-strategies). You can then [link](#linking) these libraries with your C++ project or write your own [bindings](#writing-new-bindings-for-icicle) for other programming languages. If you want to use ICICLE with existing bindings please refer to [Rust](/icicle/rust-bindings) / [Golang](/icicle/golang-bindings).
## Compilation strategies
Most of the codebase is curve/field agnostic, which means it can be compiled for different curves and fields. When you build ICICLE Core you choose a single curve or field. If you need multiple curves or fields, you just compile ICICLE into multiple static libraries. It's that simple. Currently, the following choices are supported:
- [Field mode](#compiling-for-a-field) - used for STARK fields like BabyBear / Mersenne / Goldilocks. Includes field arithmetic, NTT, Poseidon, Extension fields and other primitives.
- [Curve mode](#compiling-for-a-curve) - used for SNARK curves like BN254 / BLS curves / Grumpkin / etc. Curve mode is built upon field mode, so it includes everything field mode does. It also includes curve operations / MSM / ECNTT / G2 and other curve-related primitives.
:::info
If you only want to use a curve's scalar/base field, you still need to build in curve mode. You can disable MSM with [options](#compilation-options)
:::
### Compiling for a field
ICICLE supports the following STARK fields:
- [BabyBear](https://eprint.iacr.org/2023/824.pdf)
Field mode includes:
- [Field arithmetic](https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/fields/field.cuh) - field multiplication, addition, subtraction
- [NTT](icicle/primitives/ntt) - FFT / iFFT
- [Poseidon Hash](icicle/primitives/poseidon)
- [Vector operations](https://github.com/ingonyama-zk/icicle/blob/main/icicle/include/vec_ops/vec_ops.cuh)
- [Polynomial](#) - structs and methods to work with polynomials
You can compile ICICLE for a STARK field using this command:
```sh
cd icicle
mkdir -p build
cmake -DFIELD=<FIELD> -S . -B build
cmake --build build -j
```
ICICLE supports the following values for `<FIELD>`:
- `babybear`
This command will output `libingo_field_<FIELD>.a` into `build/lib`.
### Compiling for a curve
ICICLE supports the following SNARK curves:
- [BN254](https://neuromancer.sk/std/bn/bn254)
- [BLS12-377](https://neuromancer.sk/std/bls/BLS12-377)
- [BLS12-381](https://neuromancer.sk/std/bls/BLS12-381)
- [BW6-761](https://eprint.iacr.org/2020/351)
- Grumpkin
Curve mode includes everything you can find in field mode with addition of:
- [MSM](icicle/primitives/msm) - MSM / Batched MSM
- [ECNTT](#)
:::note
Field related primitives will be compiled for the scalar field of the curve
:::
You can compile ICICLE for a SNARK curve using this command:
```sh
cd icicle
mkdir -p build
cmake -DCURVE=<CURVE> -S . -B build
cmake --build build -j
```
Where `<CURVE>` can be one of `bn254`/`bls12_377`/`bls12_381`/`bw6_761`/`grumpkin`.
This command will output both `libingo_curve_<CURVE>.a` and `libingo_field_<CURVE>.a` into `build/lib`.
### Compilation options
There exist multiple options that allow you to customize your build or enable additional functionality.
#### EXT_FIELD
Used only in [field mode](#compiling-for-a-field) to add the extension field to a build. Also adds NTT for the extension field.
Default: `OFF`
Usage: `-DEXT_FIELD=ON`
#### G2
Used only in [curve mode](#compiling-for-a-curve) to add G2 definitions to a build. Also adds G2 MSM.
Default: `OFF`
Usage: `-DG2=ON`
#### ECNTT
Used only in [curve mode](#compiling-for-a-curve) to add the ECNTT function to a build.
Default: `OFF`
Usage: `-DECNTT=ON`
#### MSM
Used only in [curve mode](#compiling-for-a-curve) to add the MSM function to a build. As MSM takes a long time to build, you can disable it with this option to reduce compilation time.
Default: `ON`
Usage: `-DMSM=OFF`
#### BUILD_HASH
Can be used in any mode to build a hash library. Currently it only includes the Keccak hash function, but more are coming.
Default: `OFF`
Usage: `-DBUILD_HASH=ON`
#### BUILD_TESTS
Can be used in any mode to include the tests runner binary.
Default: `OFF`
Usage: `-DBUILD_TESTS=ON`
#### BUILD_BENCHMARKS
Can be used in any mode to include the benchmarks runner binary.
Default: `OFF`
Usage: `-DBUILD_BENCHMARKS=ON`
#### DEVMODE
Can be used in any mode to include debug symbols in the build.
Default: `OFF`
Usage: `-DDEVMODE=ON`
## Linking
To link ICICLE with your project you first need to compile ICICLE with options of your choice. After that you can use CMake `target_link_libraries` to link with the generated static libraries and `target_include_directories` to include ICICLE headers (located in `icicle/include`).
Refer to our [c++ examples](https://github.com/ingonyama-zk/icicle/tree/main/examples/c%2B%2B) for more info. Take a look at this [CMakeLists.txt](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/CMakeLists.txt#L22)
## Writing new bindings for ICICLE
Since ICICLE Core is written in CUDA / C++, it's really simple to generate static libraries. These static libraries can be installed on any system and called by higher-level languages such as Golang.
Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules, so your static library need only include the specific features you want to use.
Let's review the [Golang bindings](golang-bindings.md), since they are a fairly verbose example of using static libraries (compared to Rust, which hides the details well). Golang has a mechanism named `cgo` which can be used to link static libraries. Here's a basic example of how you can use cgo to link these libraries:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_761
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
out := (*C.BN254_projective_t)(unsafe.Pointer(p))
in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bn254(out, in)
}
```
The comments above the `import "C"` statement tell `cgo` which libraries to link and which header files to include. You can then call methods which are part of the static library and defined in the header file; `C.projective_from_affine_bn254` is an example.
If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.

View File

@@ -1,108 +1,3 @@
# Golang bindings
Golang bindings allow you to use ICICLE as a golang library.
The source code for all Golang libraries can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
The Golang bindings consist of multiple packages.
[`core`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/core) which defines all shared methods and structures, such as configuration structures, or memory slices.
[`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
Each curve has its own package which you can find [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves). If your project uses BN254 you only need to install that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
## Using ICICLE Golang bindings in your project
To add ICICLE to your `go.mod` file:
```bash
go get github.com/ingonyama-zk/icicle
```
If you want to target a specific branch:
```bash
go get github.com/ingonyama-zk/icicle@<branch_name>
```
For a specific commit:
```bash
go get github.com/ingonyama-zk/icicle@<commit_id>
```
To build the static libraries you can run this script:
```bash
./build.sh [-curve=<curve> | -field=<field>] [-cuda_version=<version>] [-g2] [-ecntt] [-devmode]
```
- **`curve`** - The name of the curve to build or "all" to build all curves
- **`field`** - The name of the field to build or "all" to build all fields
- **`g2`** - Optional - build with G2 enabled
- **`ecntt`** - Optional - build with ECNTT enabled
- **`devmode`** - Optional - build in devmode
- Usage can be displayed with the flag `-help`
To build ICICLE libraries for all supported curves with G2 and ECNTT enabled:
```bash
./build.sh all -g2 -ecntt
```
If you wish to build for a specific curve, for example bn254, without G2 or ECNTT enabled:
```bash
./build.sh -curve=bn254
```
Now you can import ICICLE into your project:
```go
import (
"github.com/stretchr/testify/assert"
"testing"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)
...
```
## Running tests
To run all tests, for all curves:
```bash
go test --tags=g2 ./... -count=1
```
If you don't want to include g2 tests, drop `--tags=g2`.
If you wish to run tests for a specific curve:
```bash
go test <path_to_curve> -count=1
```
## How do Golang bindings work?
The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
1. These libraries (named `libingo_curve_<curve>.a` and `libingo_field_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lingo_curve_bn254 -lingo_field_bn254 -lstdc++ -lm
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
}
```
Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
Golang is WIP in v1, coming soon. Please check out a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.

View File

@@ -1,97 +0,0 @@
# ECNTT
### Supported curves
`bls12-377`, `bls12-381`, `bn254`
## ECNTT Method
The `ECNtt[T any]()` function performs the Elliptic Curve Number Theoretic Transform (EC-NTT) on the input `points` slice, using the provided `dir` (direction) and `cfg` (configuration), and stores the results in the `results` slice.
```go
func ECNtt[T any](points core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
```
### Parameters:
- **`points`**: A slice of elliptic curve points (in projective coordinates) that will be transformed. The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
- **`dir`**: The direction of the EC-NTT transform, either `core.KForward` or `core.KInverse`.
- **`cfg`**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **`results`**: A slice that will store the transformed elliptic curve points (in projective coordinates). The slice can be stored on the host or the device, as indicated by the `core.HostOrDeviceSlice` type.
### Return Value
- **`IcicleError`**: A `core.IcicleError` value, which will be `core.IcicleErrorCode(0)` if the EC-NTT operation was successful, or an error if something went wrong.
## NTT Configuration (NTTConfig)
The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
```go
type NTTConfig[T any] struct {
Ctx cr.DeviceContext
CosetGen T
BatchSize int32
ColumnsBatch bool
Ordering Ordering
areInputsOnDevice bool
areOutputsOnDevice bool
IsAsync bool
NttAlgorithm NttAlgorithm
}
```
### Fields
- **`Ctx`**: Device context containing details like device ID and stream ID.
- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`), affecting how data is arranged.
- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
- **`NttAlgorithm`**: Explicitly select the NTT algorithm. ECNTT supports the `Radix2` algorithm.
### Default Configuration
Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
```go
func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
```
## ECNTT Example
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)
func main() {
// Obtain the default NTT configuration with a predefined coset generator.
cfg := GetDefaultNttConfig()
// Define the size of the input scalars.
size := 1 << 18
// Generate Points for the ECNTT operation.
points := GenerateProjectivePoints(size)
// Set the direction of the NTT (forward or inverse).
dir := core.KForward
// Allocate memory for the results of the NTT operation.
results := make(core.HostSlice[Projective], size)
// Perform the NTT operation.
err := ECNtt(points, dir, &cfg, results)
if err.CudaErrorCode != cr.CudaSuccess {
panic("ECNTT operation failed")
}
}
```

View File

@@ -1,116 +0,0 @@
# MSM Precomputation
To understand the theory behind the MSM precomputation technique, refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
## Core package
## MSM `PrecomputeBases`
`PrecomputeBases` and `G2PrecomputeBases` exist for all supported curves.
#### Description
This function extends each provided base point $P$ with its multiples $2^lP, 2^{2l}P, \ldots, 2^{(\text{precompute\_factor} - 1)\cdot l}P$, where $l$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
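For illustration, with a `precompute_factor` of $4$ (a value chosen here only for the example), each base point $P$ is stored alongside three shifted copies:
$$
P,\quad 2^{l}P,\quad 2^{2l}P,\quad 2^{3l}P
$$
so memory grows by the precompute factor, while the MSM avoids the point additions it would otherwise spend deriving these multiples on the fly.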
#### `PrecomputeBases`
Precomputes bases for MSM by extending each base point with its multiples.
```go
func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of the original affine points to be extended with their multiples.
- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
- **`c`**: Currently unused; reserved for future compatibility.
- **`ctx`**: CUDA device context specifying the execution environment.
- **`outputBases`**: The device slice allocated for storing the extended bases.
##### Example
```go
package main
import (
"log"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
cfg := bn254.GetDefaultMSMConfig()
points := bn254.GenerateAffinePoints(1024)
var precomputeFactor int32 = 8
var precomputeOut core.DeviceSlice
precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := bn254.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
```
#### `G2PrecomputeBases`
This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
```go
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
```
##### Parameters
- **`points`**: A slice of G2 curve points to be extended.
- **`precomputeFactor`**: The total number of points to precompute for each base.
- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
- **`ctx`**: Specifies the CUDA device context for execution.
- **`outputBases`**: Allocated device slice for the extended bases.
##### Example
```go
package main
import (
"log"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)
func main() {
cfg := g2.G2GetDefaultMSMConfig()
points := g2.G2GenerateAffinePoints(1024)
var precomputeFactor int32 = 8
var precomputeOut core.DeviceSlice
precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
err := g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
if err != cr.CudaSuccess {
log.Fatalf("PrecomputeBases failed: %v", err)
}
}
```
### Benchmarks
Benchmarks were performed on an Nvidia RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -1,201 +0,0 @@
# MSM
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
## MSM Example
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
// Obtain the default MSM configuration.
cfg := bn254.GetDefaultMSMConfig()
// Define the size of the problem, here 2^18.
size := 1 << 18
// Generate scalars and points for the MSM operation.
scalars := bn254.GenerateScalars(size)
points := bn254.GenerateAffinePoints(size)
// Create a CUDA stream for asynchronous operations.
stream, _ := cr.CreateStream()
var p bn254.Projective
// Allocate memory on the device for the result of the MSM operation.
var out core.DeviceSlice
_, e := out.MallocAsync(p.Size(), p.Size(), stream)
if e != cr.CudaSuccess {
panic(e)
}
// Set the CUDA stream in the MSM configuration.
cfg.Ctx.Stream = &stream
cfg.IsAsync = true
// Perform the MSM operation.
e = bn254.Msm(scalars, points, &cfg, out)
if e != cr.CudaSuccess {
panic(e)
}
// Allocate host memory for the results and copy the results from the device.
outHost := make(core.HostSlice[bn254.Projective], 1)
cr.SynchronizeStream(&stream)
outHost.CopyFromDevice(&out)
// Free the device memory allocated for the results.
out.Free()
}
```
## MSM Method
```go
func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) cr.CudaError
```
### Parameters
- **`scalars`**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
- **`points`**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
- **`cfg`**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
- **`results`**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.
### Return Value
- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the MSM operation.
## MSMConfig
The `MSMConfig` structure holds configuration parameters for the MSM operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
```go
type MSMConfig struct {
Ctx cr.DeviceContext
PrecomputeFactor int32
C int32
Bitsize int32
LargeBucketFactor int32
batchSize int32
areScalarsOnDevice bool
AreScalarsMontgomeryForm bool
arePointsOnDevice bool
ArePointsMontgomeryForm bool
areResultsOnDevice bool
IsBigTriangle bool
IsAsync bool
}
```
### Fields
- **`Ctx`**: Device context containing details like device id and stream.
- **`PrecomputeFactor`**: Controls the number of extra points to pre-compute.
- **`C`**: Window bitsize, a key parameter in the "bucket method" for MSM.
- **`Bitsize`**: Number of bits of the largest scalar.
- **`LargeBucketFactor`**: Sensitivity to frequently occurring buckets.
- **`batchSize`**: Number of results to compute in one batch.
- **`areScalarsOnDevice`**: Indicates if scalars are located on the device.
- **`AreScalarsMontgomeryForm`**: True if scalars are in Montgomery form.
- **`arePointsOnDevice`**: Indicates if points are located on the device.
- **`ArePointsMontgomeryForm`**: True if point coordinates are in Montgomery form.
- **`areResultsOnDevice`**: Indicates if results are stored on the device.
- **`IsBigTriangle`**: If `true`, MSM will run in large triangle accumulation; if `false`, bucket accumulation will be chosen. Default value: `false`.
- **`IsAsync`**: If true, runs MSM asynchronously.
### Default Configuration
Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be customized as needed.
```go
func GetDefaultMSMConfig() MSMConfig
```
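As an illustrative sketch (not taken from the bindings' own docs), you can start from the defaults and override only the fields your protocol needs; field names follow the `MSMConfig` struct above, and the `bn254` package is assumed as in the earlier example:
```go
package main

import (
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)

func main() {
	// Start from sane defaults and override only what the protocol needs.
	cfg := bn254.GetDefaultMSMConfig()
	cfg.PrecomputeFactor = 8           // number of precomputed points per base (see PrecomputeBases)
	cfg.ArePointsMontgomeryForm = true // skip conversion when points are already in Montgomery form
	cfg.IsAsync = true                 // launch asynchronously on cfg.Ctx.Stream
	// cfg is then passed by pointer to bn254.Msm(scalars, points, &cfg, out).
}
```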
## How do I toggle between the supported algorithms?
When creating your MSM config you may state which algorithm you wish to use: `cfg.IsBigTriangle = true` will activate large triangle accumulation and `cfg.IsBigTriangle = false` will activate bucket accumulation (the field lives on `MSMConfig`, as shown in the struct above).
```go
...
// Obtain the default MSM configuration.
cfg := GetDefaultMSMConfig()
cfg.IsBigTriangle = true
...
```
## How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `MSM` function.
The number of results is interpreted from the size of `var out core.DeviceSlice`. Thus it's important when allocating memory for `var out core.DeviceSlice` to make sure that you are allocating `<number of results> x <size of a single point>`.
```go
...
batchSize := 3
var p G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
...
```
## Support for G2 group
To activate G2 support, first make sure you are building the static libraries with the G2 feature enabled, as described in the [Golang building instructions](../golang-bindings.md#using-icicle-golang-bindings-in-your-project).
Now you may import `g2` package of the specified curve.
```go
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls254/g2"
)
```
This package includes `G2Projective` and `G2Affine` points as well as a `G2Msm` method.
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)
func main() {
cfg := bn254.GetDefaultMSMConfig()
size := 1 << 12
batchSize := 3
totalSize := size * batchSize
scalars := bn254.GenerateScalars(totalSize)
points := g2.G2GenerateAffinePoints(totalSize)
var p g2.G2Projective
var out core.DeviceSlice
out.Malloc(batchSize*p.Size(), p.Size())
g2.G2Msm(scalars, points, &cfg, out)
}
```
`G2Msm` works the same way as a normal MSM; the difference is that it uses G2 points.

View File

@@ -1,150 +0,0 @@
# Multi GPU APIs
To learn more about the theory of multi-GPU programming, refer to [this part](../multi-gpu.md) of the documentation.
Here we will cover the core multi-GPU APIs and an [example](#a-multi-gpu-example).
## A Multi GPU example
In this example we will demonstrate how you can:
1. Fetch the number of devices installed on a machine.
2. For every GPU, launch a thread and set an active device per thread.
3. Execute an MSM on each GPU.
```go
package main
import (
"fmt"
"sync"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
numDevices, _ := cr.GetDeviceCount()
fmt.Println("There are ", numDevices, " devices available")
wg := sync.WaitGroup{}
for i := 0; i < numDevices; i++ {
wg.Add(1)
// RunOnDevice makes sure each MSM runs on a single thread
cr.RunOnDevice(i, func(args ...any) {
defer wg.Done()
cfg := bn254.GetDefaultMSMConfig()
cfg.IsAsync = true
for _, power := range []int{10, 18} {
size := 1 << power // 2^pwr
// generate random scalars
scalars := bn254.GenerateScalars(size)
points := bn254.GenerateAffinePoints(size)
// create a stream and allocate result pointer
stream, _ := cr.CreateStream()
var p bn254.Projective
var out core.DeviceSlice
out.MallocAsync(p.Size(), p.Size(), stream)
// assign stream to device context
cfg.Ctx.Stream = &stream
// execute MSM
bn254.Msm(scalars, points, &cfg, out)
// read result from device
outHost := make(core.HostSlice[bn254.Projective], 1)
outHost.CopyFromDeviceAsync(&out, stream)
out.FreeAsync(stream)
// sync the stream
cr.SynchronizeStream(&stream)
}
})
}
wg.Wait()
}
```
This example demonstrates a basic pattern for distributing tasks across multiple GPUs. The `RunOnDevice` function ensures that each goroutine is executed on its designated GPU and a corresponding thread.
## Device Management API
To streamline device management, we offer methods for dealing with devices as part of the `cuda_runtime` package.
### `RunOnDevice`
Runs a given function on a specific GPU device, ensuring that all CUDA calls within the function are executed on the selected device.
In Go, most concurrency can be done via Goroutines. However, there is no guarantee that a goroutine stays on a specific host thread.
`RunOnDevice` was designed to solve this caveat and ensure that the goroutine stays on a specific host thread.
`RunOnDevice` locks a goroutine into a specific host thread, sets the current GPU device, runs the provided function, and unlocks the goroutine from the host thread after the function finishes.
While the goroutine is locked to the host thread, the Go runtime will not assign other goroutines to that host thread.
**Parameters:**
- **`deviceId int`**: The ID of the device on which to run the provided function. Device IDs start from 0.
- **`funcToRun func(args ...any)`**: The function to be executed on the specified device.
- **`args ...any`**: Arguments to be passed to `funcToRun`.
**Behavior:**
- The function `funcToRun` is executed in a new goroutine that is locked to a specific OS thread to ensure that all CUDA calls within the function target the specified device.
- It's important to note that any goroutines launched within `funcToRun` are not automatically bound to the same GPU device. If necessary, `RunOnDevice` should be called again within such goroutines with the same `deviceId`.
**Example:**
```go
RunOnDevice(0, func(args ...any) {
fmt.Println("This runs on GPU 0")
// CUDA-related operations here will target GPU 0
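// Note: goroutines launched inside this function are not automatically
// bound to GPU 0; call RunOnDevice again within them if needed.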
}, nil)
```
### `SetDevice`
Sets the active device for the current host thread. All subsequent CUDA calls made from this thread will target the specified device.
**Parameters:**
- **`device int`**: The ID of the device to set as the current device.
**Returns:**
- **`CudaError`**: Error code indicating the success or failure of the operation.
### `GetDeviceCount`
Retrieves the number of CUDA-capable devices available on the host.
**Returns:**
- **`(int, CudaError)`**: The number of devices and an error code indicating the success or failure of the operation.
### `GetDevice`
Gets the ID of the currently active device for the calling host thread.
**Returns:**
- **`(int, CudaError)`**: The ID of the current device and an error code indicating the success or failure of the operation.
### `GetDeviceFromPointer`
Retrieves the device associated with a given pointer.
**Parameters:**
- **`ptr unsafe.Pointer`**: Pointer to query.
**Returns:**
- **`int`**: The device ID associated with the memory pointed to by `ptr`.
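Putting these together, here is a minimal, illustrative sketch that uses only the signatures documented above (error handling kept deliberately simple):
```go
package main

import (
	"fmt"

	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)

func main() {
	// Count the CUDA-capable devices on this machine.
	numDevices, cudaErr := cr.GetDeviceCount()
	if cudaErr != cr.CudaSuccess || numDevices == 0 {
		panic("no CUDA devices available")
	}
	// Make the last device current for this host thread...
	if cudaErr := cr.SetDevice(numDevices - 1); cudaErr != cr.CudaSuccess {
		panic("failed to set device")
	}
	// ...and confirm which device this thread now targets.
	current, _ := cr.GetDevice()
	fmt.Println("active device:", current)
}
```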
This documentation should provide a clear understanding of how to effectively manage multiple GPUs in Go applications using CUDA, with a particular emphasis on the `RunOnDevice` function for executing tasks on specific GPUs.

View File

@@ -1,155 +0,0 @@
# NTT
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
## NTT Example
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
"github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)
func init() {
cfg := bn254.GetDefaultNttConfig()
initDomain(18, cfg)
}
func initDomain[T any](largestTestSize int, cfg core.NTTConfig[T]) core.IcicleError {
rouMont, _ := fft.Generator(uint64(1 << largestTestSize))
rou := rouMont.Bits()
rouIcicle := bn254.ScalarField{}
rouIcicle.FromLimbs(rou[:])
e := bn254.InitDomain(rouIcicle, cfg.Ctx, false)
return e
}
func main() {
// Obtain the default NTT configuration with a predefined coset generator.
cfg := bn254.GetDefaultNttConfig()
// Define the size of the input scalars.
size := 1 << 18
// Generate scalars for the NTT operation.
scalars := bn254.GenerateScalars(size)
// Set the direction of the NTT (forward or inverse).
dir := core.KForward
// Allocate memory for the results of the NTT operation.
results := make(core.HostSlice[bn254.ScalarField], size)
// Perform the NTT operation.
err := bn254.Ntt(scalars, dir, &cfg, results)
if err.CudaErrorCode != cr.CudaSuccess {
panic("NTT operation failed")
}
}
```
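The example above keeps all data in host memory. Since `scalars` and `results` may also live in device memory, the following is a hedged sketch of an asynchronous, device-resident NTT that follows the same stream pattern as the multi-GPU example earlier; the copy helpers (`CopyToDeviceAsync`, `SizeOfElement`) are assumed to match your wrapper version.
```go
size := 1 << 18
scalars := bn254.GenerateScalars(size)
cfg := bn254.GetDefaultNttConfig()
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
cfg.IsAsync = true
// Copy the inputs to the device and allocate device memory for the outputs.
var deviceScalars, deviceResults core.DeviceSlice
scalars.CopyToDeviceAsync(&deviceScalars, stream, true)
deviceResults.MallocAsync(scalars.SizeOfElement()*size, scalars.SizeOfElement(), stream)
// Run the NTT on device data (the domain must already be initialized, as in init above).
bn254.Ntt(deviceScalars, core.KForward, &cfg, deviceResults)
// Copy the results back and release device memory.
results := make(core.HostSlice[bn254.ScalarField], size)
results.CopyFromDeviceAsync(&deviceResults, stream)
deviceScalars.FreeAsync(stream)
deviceResults.FreeAsync(stream)
cr.SynchronizeStream(&stream)
```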
## NTT Method
```go
func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
```
### Parameters
- **`scalars`**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
- **`dir`**: The direction of the NTT operation (`KForward` or `KInverse`).
- **`cfg`**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **`results`**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.
### Return Value
- **`IcicleError`**: An error object indicating the success or failure of the NTT operation; on failure its `CudaErrorCode` holds the underlying CUDA error, as checked in the example above.
## NTT Configuration (NTTConfig)
The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
```go
type NTTConfig[T any] struct {
Ctx cr.DeviceContext
CosetGen T
BatchSize int32
ColumnsBatch bool
Ordering Ordering
areInputsOnDevice bool
areOutputsOnDevice bool
IsAsync bool
NttAlgorithm NttAlgorithm
}
```
### Fields
- **`Ctx`**: Device context containing details like device ID and stream ID.
- **`CosetGen`**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
- **`BatchSize`**: The number of NTTs to compute in one operation, defaulting to 1.
- **`ColumnsBatch`**: If true the function will compute the NTTs over the columns of the input matrix and not over the rows. Defaults to `false`.
- **`Ordering`**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
- **`areInputsOnDevice`**: Indicates if input scalars are located on the device.
- **`areOutputsOnDevice`**: Indicates if results are stored on the device.
- **`IsAsync`**: Controls whether the NTT operation runs asynchronously.
- **`NttAlgorithm`**: Explicitly select the NTT algorithm. Default value: Auto (the implementation selects radix-2 or mixed-radix algorithm based on heuristics).
### Default Configuration
Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
```go
func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
```
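For instance, here is a sketch of configuring a batched, column-wise NTT using the fields described above; the `core` constant names are assumed to follow the enum lists above.
```go
cfg := bn254.GetDefaultNttConfig()
cfg.BatchSize = 16                 // compute 16 NTTs in a single call
cfg.ColumnsBatch = true            // transform the columns of the input matrix
cfg.Ordering = core.KNR            // natural input order, bit-reversed output order
cfg.NttAlgorithm = core.MixedRadix // force the mixed-radix algorithm
```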
### Initializing the NTT Domain
Before performing NTT operations, it's necessary to initialize the NTT domain; it only needs to be called once per GPU since the twiddles are cached.
```go
func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bool) core.IcicleError
```
This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.
### Releasing the domain
The `ReleaseDomain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
```go
func ReleaseDomain(ctx cr.DeviceContext) core.IcicleError
```
### Parameters
- **`ctx`**: a reference to the `DeviceContext` object, which represents the CUDA device context.
### Return Value
The function returns a `core.IcicleError`, which represents the result of the operation. If the operation is successful, the error code is `core.IcicleErrorCode(0)`.
### Example
```go
import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func example() {
	cfg := bn254.GetDefaultNttConfig()
	err := bn254.ReleaseDomain(cfg.Ctx)
	if err.IcicleErrorCode != core.IcicleErrorCode(0) {
		// Handle the error
	}
}
```

View File

@@ -1,186 +0,0 @@
# Vector Operations
## Overview
ICICLE exposes a number of vector operations which a user can control:
* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
* The MatrixTranspose API allows a user to transpose a vector representation of a matrix.
## VecOps API Documentation
### Example
#### Vector Addition
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
testSize := 1 << 12
a := bn254.GenerateScalars(testSize)
b := bn254.GenerateScalars(testSize)
out := make(core.HostSlice[bn254.ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector addition
err := bn254.VecOp(a, b, out, cfg, core.Add)
if err != cr.CudaSuccess {
panic("Vector addition failed")
}
}
```
#### Vector Subtraction
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
testSize := 1 << 12
a := bn254.GenerateScalars(testSize)
b := bn254.GenerateScalars(testSize)
out := make(core.HostSlice[bn254.ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector subtraction
err := bn254.VecOp(a, b, out, cfg, core.Sub)
if err != cr.CudaSuccess {
panic("Vector subtraction failed")
}
}
```
#### Vector Multiplication
```go
package main
import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func main() {
testSize := 1 << 12
a := bn254.GenerateScalars(testSize)
b := bn254.GenerateScalars(testSize)
out := make(core.HostSlice[bn254.ScalarField], testSize)
cfg := core.DefaultVecOpsConfig()
// Perform vector multiplication
err := bn254.VecOp(a, b, out, cfg, core.Mul)
if err != cr.CudaSuccess {
panic("Vector multiplication failed")
}
}
```
### VecOps Method
```go
func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
```
#### Parameters
- **`a`**: The first input vector.
- **`b`**: The second input vector.
- **`out`**: The output vector where the result of the operation will be stored.
- **`config`**: A `VecOpsConfig` object containing various configuration options for the vector operations.
- **`op`**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.
#### Return Value
- **`CudaError`**: Returns a CUDA error code indicating the success or failure of the vector operation.
### VecOpsConfig
The `VecOpsConfig` structure holds configuration parameters for the vector operations, allowing customization of its behavior.
```go
type VecOpsConfig struct {
Ctx cr.DeviceContext
isAOnDevice bool
isBOnDevice bool
isResultOnDevice bool
IsAsync bool
}
```
#### Fields
- **Ctx**: Device context containing details like device ID and stream ID.
- **isAOnDevice**: Indicates if vector `a` is located on the device.
- **isBOnDevice**: Indicates if vector `b` is located on the device.
- **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
- **IsAsync**: Controls whether the vector operation runs asynchronously.
#### Default Configuration
Use `DefaultVecOpsConfig` to obtain a default configuration, customizable as needed.
```go
func DefaultVecOpsConfig() VecOpsConfig
```
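For instance, here is a sketch of customizing the default configuration to run asynchronously on a dedicated stream, following the stream pattern used elsewhere in these docs.
```go
cfg := core.DefaultVecOpsConfig()
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
cfg.IsAsync = true // the call returns immediately; synchronize the stream before reading results
```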
## MatrixTranspose API Documentation
This section describes the functionality of the `TransposeMatrix` function used for matrix transposition.
The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
### Function
```go
func TransposeMatrix(in, out core.HostOrDeviceSlice, columnSize, rowSize int, ctx cr.DeviceContext, onDevice, isAsync bool) (ret core.IcicleError)
```
### Parameters
- **`in`**: The input matrix is a `core.HostOrDeviceSlice`, stored as a 1D slice.
- **`out`**: The output matrix is a `core.HostOrDeviceSlice`, which will be the transpose of the input matrix, stored as a 1D slice.
- **`columnSize`**: The number of columns in the input matrix.
- **`rowSize`**: The number of rows in the input matrix.
- **`ctx`**: The device context `cr.DeviceContext` to be used for the matrix transpose operation.
- **`onDevice`**: Indicates whether the input and output slices are stored on the device (GPU) or the host (CPU).
- **`isAsync`**: Indicates whether the matrix transpose operation should be executed asynchronously.
### Return Value
The function returns a `core.IcicleError` value representing the result of the matrix transpose operation. If the operation is successful, its error code is `core.IcicleErrorCode(0)`.
### Example Usage
```go
var input = make(core.HostSlice[ScalarField], 20)
var output = make(core.HostSlice[ScalarField], 20)
// Populate the input matrix
// ...
// Get device context
ctx, _ := cr.GetDefaultDeviceContext()
// Transpose the matrix
err := TransposeMatrix(input, output, 5, 4, ctx, false, false)
if err.IcicleErrorCode != core.IcicleErrorCode(0) {
// Handle the error
}
// Use the transposed matrix
// ...
```
In this example, the `TransposeMatrix` function is used to transpose a 5x4 matrix stored in a 1D slice. The input and output slices are stored on the host (CPU), and the operation is executed synchronously.
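For a device-side transpose, both slices must reside in device memory and `onDevice` must be `true`. The following is a hedged sketch; the copy helpers (`CopyToDevice`, `CopyFromDevice`, `SizeOfElement`) are assumed to match your wrapper version.
```go
var input = make(core.HostSlice[ScalarField], 20)
// Populate the input matrix
// ...
ctx, _ := cr.GetDefaultDeviceContext()
// Move the input to the device and allocate device memory for the output.
var dIn, dOut core.DeviceSlice
input.CopyToDevice(&dIn, true)
dOut.Malloc(input.SizeOfElement()*20, input.SizeOfElement())
// Transpose a 5x4 matrix entirely on the device.
err := TransposeMatrix(dIn, dOut, 5, 4, ctx, true, false)
if err.IcicleErrorCode != core.IcicleErrorCode(0) {
	// Handle the error
}
// Copy the transposed matrix back to the host.
output := make(core.HostSlice[ScalarField], 20)
output.CopyFromDevice(&dOut)
dIn.Free()
dOut.Free()
```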

View File

@@ -1,6 +1,6 @@
# ICICLE integrated provers
ICICLE has been used by companies and projects such as [Celer Network](https://github.com/celer-network), [Consensys Gnark](https://github.com/Consensys/gnark), [EZKL](https://blog.ezkl.xyz/post/acceleration/), [ZKWASM](https://twitter.com/DelphinusLab/status/1762604988797513915) and others to accelerate their ZK proving pipeline.
ICICLE has been used by companies and projects such as [Celer Network](https://github.com/celer-network), [Consensys Gnark](https://github.com/Consensys/gnark), [EZKL](https://blog.ezkl.xyz/post/acceleration/) and others to accelerate their ZK proving pipeline.
Many of these integrations have been a collaboration between Ingonyama and the integrating company. We have learned a lot about designing GPU based ZK provers.

View File

@@ -8,24 +8,24 @@ This guide is oriented towards developers who want to start writing code with th
The diagram above displays the general architecture of ICICLE and the API layers that exist. The CUDA API, which we also call ICICLE Core, is the lowest level and is comprised of CUDA kernels which implement all primitives such as MSM as well as C++ wrappers which expose these methods for different curves.
ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or linked with your C++ project. You can also implement a wrapper for it in any other language.
ICICLE Core compiles into a static library. This library can be used with our official Golang and Rust wrappers or you can implement a wrapper for it in any language.
Based on this dependency architecture, the ICICLE repository has three main sections:
Based on this dependency architecture, the ICICLE repository has three main sections, each of which is independent of the others.
- [ICICLE Core](#icicle-core)
- [ICICLE Rust bindings](#icicle-rust-and-golang-bindings)
- [ICICLE Golang bindings](#icicle-rust-and-golang-bindings)
- ICICLE core
- ICICLE Rust bindings
- ICICLE Golang bindings
### ICICLE Core
[ICICLE Core](/icicle/core) is a library that directly works with GPU by defining CUDA kernels and algorithms that invoke them. It contains code for [fast field arithmetic](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/field/field.cuh), cryptographic primitives used in ZK such as [NTT](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/ntt/), [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/msm/), [Poseidon Hash](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/poseidon/), [Polynomials](https://github.com/ingonyama-zk/icicle/tree/main/icicle/src/polynomials/) and others.
[ICICLE core](https://github.com/ingonyama-zk/icicle/tree/main/icicle) contains all the low level CUDA code implementing primitives such as [points](https://github.com/ingonyama-zk/icicle/tree/main/icicle/primitives) and [MSM](https://github.com/ingonyama-zk/icicle/tree/main/icicle/appUtils/msm). There also exists higher level C++ wrappers to expose the low level CUDA primitives ([example](https://github.com/ingonyama-zk/icicle/blob/c1a32a9879a7612916e05aa3098f76144de4109e/icicle/appUtils/msm/msm.cu#L1)).
ICICLE Core would typically be compiled into a static library and either used in a third party language such as Rust or Golang, or linked with your own C++ project.
ICICLE Core would typically be compiled into a static library and used in a third party language such as Rust or Golang.
### ICICLE Rust and Golang bindings
- [ICICLE Rust bindings](/icicle/rust-bindings)
- [ICICLE Golang bindings](/icicle/golang-bindings)
- [ICICLE Rust bindings](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust)
- [ICICLE Golang bindings](https://github.com/ingonyama-zk/icicle/tree/main/goicicle)
These bindings allow you to easily use ICICLE in a Rust or Golang project. Setting up the Golang bindings requires a few extra steps compared to the Rust bindings, which utilize the `cargo build` tool.
@@ -33,12 +33,6 @@ These bindings allow you to easily use ICICLE in a Rust or Golang project. Setti
This guide assumes that you have a Linux or Windows machine with an Nvidia GPU installed. If you don't have access to an Nvidia GPU you can access one for free on [Google Colab](https://colab.google/).
:::info note
ICICLE can only run on Linux or Windows. **MacOS is not supported**.
:::
### Prerequisites
- NVCC (version 12.0 or newer)
@@ -56,9 +50,9 @@ If you don't wish to install these prerequisites you can follow this tutorial us
### Setting up ICICLE and running tests
The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to setup ICICLE and run an ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment, however, you may install the prerequisites on your machine and [skip](#icicle-core-1) the docker section.
The objective of this guide is to make sure you can run the ICICLE Core, Rust and Golang tests. Achieving this will ensure you know how to setup ICICLE and run a ICICLE program. For simplicity, we will be using the ICICLE docker container as our environment, however, you may install the prerequisites on your machine and follow the same commands in your terminal.
#### Setting up environment with Docker
#### Setting up our environment
Lets begin by cloning the ICICLE repository:
@@ -111,23 +105,29 @@ ICICLE Core is found under [`<project_root>/icicle`](https://github.com/ingonyam
cd icicle
```
For this example, we are going to compile ICICLE for a `bn254` curve. However other compilation strategies are supported.
We are going to compile ICICLE for a specific curve
```sh
mkdir -p build
cmake -S . -B build -DCURVE=bn254 -DBUILD_TESTS=ON
cmake --build build -j
cmake --build build
```
`-DBUILD_TESTS` option compiles the tests, without this flag `ctest` won't work.
`-DCURVE` option tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/cmake/CurvesCommon.cmake#L2).
`-DBUILD_TESTS=ON` compiles the tests, without this flag `ctest` won't work.
`-DCURVE=bn254` tells the compiler which curve to build. You can find a list of supported curves [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves).
The output in `build` folder should include the static libraries for the compiled curve.
:::info
Make sure to only use `-DBUILD_TESTS=ON` for running tests as the archive output will only be available when `-DBUILD_TESTS=ON` is not supplied.
:::
To run the test
```sh
cd build/tests
cd build
ctest
```
@@ -169,24 +169,8 @@ Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](ht
### Running ICICLE examples
ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/examples) these examples cover some simple use cases using C++, rust and golang.
ICICLE examples can be found [here](https://github.com/ingonyama-zk/icicle-examples) these examples cover some simple use cases using C++, rust and golang.
Lets run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle/blob/main/examples/c%2B%2B/msm/example.cu).
```sh
cd examples/c++/msm
./compile.sh
./run.sh
```
:::tip
Read through the compile.sh and CMakeLists.txt to understand how to link your own C++ project with ICICLE
:::
#### Running with Docker
In each example directory, ZK-container files are located in a subdirectory `.devcontainer`.
```sh
@@ -196,6 +180,21 @@ msm/
└── Dockerfile
```
Lets run one of our C++ examples, in this case the [MSM example](https://github.com/ingonyama-zk/icicle-examples/blob/main/c%2B%2B/msm/example.cu).
Clone the repository
```sh
git clone https://github.com/ingonyama-zk/icicle-examples.git
cd icicle-examples
```
Enter the test directory
```sh
cd c++/msm
```
Now lets build our docker file and run the test inside it. Make sure you have installed the [optional prerequisites](#optional-prerequisites).
```sh
@@ -208,11 +207,54 @@ Lets start and enter the container
docker run -it --rm --gpus all -v .:/icicle-example icicle-example-msm
```
Inside the container you can run the same commands:
to run the example
```sh
./compile.sh
./run.sh
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
./build/example
```
You can now experiment with our other examples, perhaps try to run a rust or golang example next.
## Writing new bindings for ICICLE
Since ICICLE Core is written in CUDA / C++, it's really simple to generate static libraries. These static libraries can be installed on any system and called by higher-level languages such as Golang.
Static libraries can be loaded into memory once and used by multiple programs, reducing memory usage and potentially improving performance. They also allow you to separate functionality into distinct modules, so your static library may need to compile only the specific features you want to use.
Let's review the Golang bindings, since they are a pretty verbose example (compared to Rust, which hides it pretty well) of using static libraries. Golang has a mechanism named `CGO` which can be used to link static libraries. Here's a basic example of how you can use cgo to link these libraries:
```go
/*
#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 -lbw6_761
#include "icicle.h" // make sure you use the correct header file(s)
*/
import "C"
func main() {
// Now you can call the C functions from the ICICLE libraries.
// Note that C function calls are prefixed with 'C.' in Go code.
out := (*C.BN254_projective_t)(unsafe.Pointer(p))
in := (*C.BN254_affine_t)(unsafe.Pointer(affine))
C.projective_from_affine_bn254(out, in)
}
```
The comment block preceding `import "C"` tells `CGO` which libraries to link as well as which header files to include. You can then call methods which are part of the static library and defined in the header file; `C.projective_from_affine_bn254` is an example.
If you wish to create your own bindings for a language of your choice we suggest you start by investigating how you can call static libraries.
### ICICLE Adapters
One of the core ideas behind ICICLE is that developers can gradually accelerate their provers. Many protocols are written using other cryptographic libraries and completely replacing them may be complex and time consuming.
Therefore we offer adapters for various popular libraries, these adapters allow us to convert points and scalars between different formats defined by various libraries. Here is a list:
Golang adapters:
- [Gnark crypto adapter](https://github.com/ingonyama-zk/iciclegnark)

View File

@@ -1,8 +1,8 @@
# What is ICICLE?
[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
[![Static Badge](https://img.shields.io/badge/Latest-v1.4.0-8a2be2)](https://github.com/ingonyama-zk/icicle/releases)
![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue)

View File

@@ -1,27 +0,0 @@
@startuml
skinparam componentStyle uml2
' Define Components
component "C++ Template\nComponent" as CppTemplate {
[Parameterizable Interface]
}
component "C API Wrapper\nComponent" as CApiWrapper {
[C API Interface]
}
component "Rust Code\nComponent" as RustCode {
[Macro Interface\n(Template Instantiation)]
}
' Define Artifact
artifact "Static Library\n«artifact»" as StaticLib
' Connections
CppTemplate -down-> CApiWrapper : Instantiates
CApiWrapper .down.> StaticLib : Compiles into
RustCode -left-> StaticLib : Links against\nand calls via FFI
' Notes
note right of CppTemplate : Generic C++\ntemplate implementation
note right of CApiWrapper : Exposes C API for FFI\nto Rust/Go
note right of RustCode : Uses macros to\ninstantiate templates
@enduml

View File

@@ -1,86 +0,0 @@
@startuml
' Define Interface for Polynomial Backend Operations
interface IPolynomialBackend {
+add()
+subtract()
+multiply()
+divide()
+evaluate()
}
' Define Interface for Polynomial Context (State Management)
interface IPolynomialContext {
+initFromCoeffs()
+initFromEvals()
+getCoeffs()
+getEvals()
}
' PolynomialAPI now uses two strategies: Backend and Context
class PolynomialAPI {
-backendStrategy: IPolynomialBackend
-contextStrategy: IPolynomialContext
-setBackendStrategy(IPolynomialBackend)
-setContextStrategy(IPolynomialContext)
+add()
+subtract()
+multiply()
+divide()
+evaluate()
}
' Backend Implementations
class GPUPolynomialBackend implements IPolynomialBackend {
#gpuResources: Resource
+add()
+subtract()
+multiply()
+divide()
+evaluate()
}
class ZPUPolynomialBackend implements IPolynomialBackend {
#zpuResources: Resource
+add()
+subtract()
+multiply()
+divide()
+evaluate()
}
class TracerPolynomialBackend implements IPolynomialBackend {
#traceData: Data
+add()
+subtract()
+multiply()
+divide()
+evaluate()
}
' Context Implementations (Placeholder for actual implementation)
class GPUContext implements IPolynomialContext {
+initFromCoeffs()
+initFromEvals()
+getCoeffs()
+getEvals()
}
class ZPUContext implements IPolynomialContext {
+initFromCoeffs()
+initFromEvals()
+getCoeffs()
+getEvals()
}
class TracerContext implements IPolynomialContext {
+initFromCoeffs()
+initFromEvals()
+getCoeffs()
+getEvals()
}
' Relationships
PolynomialAPI o-- IPolynomialBackend : uses
PolynomialAPI o-- IPolynomialContext : uses
@enduml

View File

@@ -1,373 +0,0 @@
# Polynomial API Overview
## Introduction
The Polynomial API offers a robust framework for polynomial operations within a computational environment. It's designed for flexibility and efficiency, supporting a broad range of operations like arithmetic, evaluation, and manipulation, all while abstracting from the computation and storage specifics. This enables adaptability to various backend technologies, employing modern C++ practices.
## Key Features
### Backend Agnostic Architecture
Our API is structured to be independent of any specific computational backend. While a CUDA backend is currently implemented, the architecture facilitates easy integration of additional backends. This capability allows users to perform polynomial operations without the need to tailor their code to specific hardware, enhancing code portability and scalability.
### Templating in the Polynomial API
The Polynomial API is designed with a templated structure to accommodate different data types for coefficients, the domain, and images. This flexibility allows the API to be adapted for various computational needs and types of data.
```cpp
template <typename Coeff, typename Domain = Coeff, typename Image = Coeff>
class Polynomial {
// Polynomial class definition
}
```
In this template:
- **`Coeff`**: Represents the type of the coefficients of the polynomial.
- **`Domain`**: Specifies the type for the input values over which the polynomial is evaluated. By default, it is the same as the type of the coefficients but can be specified separately to accommodate different computational contexts.
- **`Image`**: Defines the type of the output values of the polynomial. This is typically the same as the coefficients.
#### Default instantiation
```cpp
extern template class Polynomial<scalar_t>;
```
#### Extended use cases
The templated nature of the Polynomial API also supports more complex scenarios. For example, coefficients and images could be points on an elliptic curve (EC points), which are useful in cryptographic applications and advanced algebraic structures. This approach allows the API to be extended easily to support new algebraic constructions without modifying the core implementation.
### Supported Operations
The Polynomial class encapsulates a polynomial, providing a variety of operations:
- **Construction**: Create polynomials from coefficients or evaluations on roots-of-unity domains.
- **Arithmetic Operations**: Perform addition, subtraction, multiplication, and division.
- **Evaluation**: Directly evaluate polynomials at specific points or across a domain.
- **Manipulation**: Features like slicing polynomials, adding or subtracting monomials inplace, and computing polynomial degrees.
- **Memory Access**: Access internal states or obtain device-memory views of polynomials.
## Usage
This section outlines how to use the Polynomial API in C++. Bindings for Rust and Go are detailed under the Bindings sections.
### Backend Initialization
Initialization with an appropriate factory is required to configure the computational context and backend.
```cpp
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
// Initialize with a CUDA backend
Polynomial::initialize(std::make_shared<CUDAPolynomialFactory>());
```
:::note
ICICLE is built as a library per field/curve, and initialization must be done per library. That is, applications linking to multiple curves/fields should initialize each curve/field separately.
:::
### Construction
Polynomials can be constructed from coefficients, from evaluations on roots-of-unity domains, or by cloning existing polynomials.
```cpp
// Construction
static Polynomial from_coefficients(const Coeff* coefficients, uint64_t nof_coefficients);
static Polynomial from_rou_evaluations(const Image* evaluations, uint64_t nof_evaluations);
// Clone the polynomial
Polynomial clone() const;
```
Example:
```cpp
auto p_from_coeffs = Polynomial_t::from_coefficients(coeff /* :scalar_t* */, nof_coeffs);
auto p_from_rou_evals = Polynomial_t::from_rou_evaluations(rou_evals /* :scalar_t* */, nof_evals);
auto p_cloned = p.clone(); // p_cloned and p do not share memory
```
:::note
The coefficients or evaluations may be allocated either on host or device memory. In both cases the memory is copied to backend device.
:::
### Arithmetic
Constructed polynomials can be used for various arithmetic operations:
```cpp
// Addition
Polynomial operator+(const Polynomial& rhs) const;
Polynomial& operator+=(const Polynomial& rhs); // inplace addition
// Subtraction
Polynomial operator-(const Polynomial& rhs) const;
// Multiplication
Polynomial operator*(const Polynomial& rhs) const;
Polynomial operator*(const Domain& scalar) const; // scalar multiplication
// Division A(x) = B(x)Q(x) + R(x)
std::pair<Polynomial, Polynomial> divide(const Polynomial& rhs) const; // returns (Q(x), R(x))
Polynomial operator/(const Polynomial& rhs) const; // returns quotient Q(x)
Polynomial operator%(const Polynomial& rhs) const; // returns remainder R(x)
Polynomial divide_by_vanishing_polynomial(uint64_t degree) const; // division by the vanishing polynomial V(x)=X^N-1
```
#### Example:
Given polynomials $A(x)$, $B(x)$, $C(x)$ and the vanishing polynomial $V(x)$:
$$
H(x)=\frac{A(x) \cdot B(x) - C(x)}{V(x)} \quad \text{where} \quad V(x) = X^{N}-1
$$
```cpp
auto H = (A*B-C).divide_by_vanishing_polynomial(N);
```
### Evaluation
Evaluate polynomials at arbitrary domain points or across a domain.
```cpp
Image operator()(const Domain& x) const; // evaluate f(x)
void evaluate(const Domain* x, Image* evals /*OUT*/) const;
void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
```
Example:
```cpp
Coeff x = rand();
Image f_x = f(x); // evaluate f at x
// evaluate f(x) on a domain
uint64_t domain_size = ...;
auto domain = /*build domain*/; // host or device memory
auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
f.evaluate_on_domain(domain, domain_size, evaluations);
```
:::note
For special domains such as roots of unity this method is not the most efficient, for two reasons:
- the domain of size N must be built, and
- the implementation does not try to identify this special domain.
Therefore the computation is typically $O(n^2)$ rather than $O(n \log n)$. See the 'device views' section for more details.
:::
### Manipulations
Beyond arithmetic, the API supports efficient polynomial manipulations:
#### Monomials
```cpp
// Monomial operations
Polynomial& add_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
Polynomial& sub_monomial_inplace(Coeff monomial_coeff, uint64_t monomial = 0);
```
The ability to add or subtract monomials directly and in-place is an efficient way to manipulate polynomials.
Example:
```cpp
f.add_monomial_inplace(scalar_t::from(5)); // f(x) += 5
f.sub_monomial_inplace(scalar_t::from(3), 8); // f(x) -= 3x^8
```
#### Computing the degree of a Polynomial
```cpp
// Degree computation
int64_t degree();
```
The degree of a polynomial is a fundamental characteristic that describes the highest power of the variable in the polynomial expression with a non-zero coefficient.
The `degree()` function in the API returns the degree of the polynomial, corresponding to the highest exponent with a non-zero coefficient.
- For the polynomial $f(x) = x^5 + 2x^3 + 4$, the degree is 5 because the highest power of $x$ with a non-zero coefficient is 5.
- For a scalar value such as a constant term (e.g., $f(x) = 7$), the degree is considered 0, as it corresponds to $x^0$.
- The degree of the zero polynomial, $f(x) = 0$, where there are no non-zero coefficients, is defined as -1. This special case often represents an "empty" or undefined state in many mathematical contexts.
Example:
```cpp
auto f = /*some expression*/;
auto degree_of_f = f.degree();
```
#### Slicing
```cpp
// Slicing and selecting even or odd components.
Polynomial slice(uint64_t offset, uint64_t stride, uint64_t size = 0 /*0 means take all elements*/);
Polynomial even();
Polynomial odd();
```
The Polynomial API provides methods for slicing polynomials and selecting specific components, such as even or odd indexed terms. Slicing allows extracting specific sections of a polynomial based on an offset, stride, and size.
The following examples demonstrate folding a polynomial's even and odd parts and arbitrary slicing:
```cpp
// folding a polynomial's even and odd parts with randomness
auto x = rand();
auto even = f.even();
auto odd = f.odd();
auto fold_poly = even + odd * x;
// arbitrary slicing (first quarter)
auto first_quarter = f.slice(0 /*offset*/, 1 /*stride*/, f.degree()/4 /*size*/);
```
### Memory access (copy/view)
Access to the polynomial's internal state can be vital for operations like commitment schemes or when more efficient custom operations are necessary. This can be done in one of two ways:
- **Copy** the coefficients or evaluations to user allocated memory or
- **View** into the device memory without copying.
#### Copy
Copy the polynomial coefficients to either host or device allocated memory.
:::note
Copying to host memory is backend agnostic, while copying to device memory requires the memory to be allocated on the corresponding backend.
:::
```cpp
Coeff get_coeff(uint64_t idx) const; // copy single coefficient to host
uint64_t copy_coeffs(Coeff* coeffs, uint64_t start_idx, uint64_t end_idx) const;
```
Example:
```cpp
auto coeffs_device = /*allocate CUDA or host memory*/
f.copy_coeffs(coeffs_device, 0/*start*/, f.degree());
MSMConfig cfg = msm::defaultMSMConfig();
cfg.are_points_on_device = true; // assuming copy to device memory
auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);
```
#### Views
The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state, such as coefficients or evaluations, without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
##### What is a Memory View?
A memory view is essentially a pointer to data stored in device memory. By providing a direct access pathway to the data, it eliminates the need for data duplication, thus conserving both time and system resources. This is especially beneficial in high-performance computing environments where data size and operation speed are critical factors.
##### Applications of Memory Views
Memory views are extremely versatile and can be employed in various computational contexts such as:
- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM), or for constructing Merkle trees without duplicating the underlying data.
- **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.
##### Obtaining and Using Views
To create and use views within the Polynomial API, functions are provided to obtain pointers to both coefficients and evaluation data. Here's how they are generally structured:
```cpp
// Obtain a view of the polynomial's coefficients
std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
```
Example usage:
```cpp
auto [coeffs_view, size, device_id] = polynomial.get_coefficients_view();
// Use coeffs_view in a computational routine that requires direct access to polynomial coefficients
// Example: Passing the view to a GPU-accelerated function
gpu_accelerated_function(coeffs_view.get(),...);
```
##### Integrity-Pointer: Managing Memory Views
Within the Polynomial API, memory views are managed through a specialized tool called the Integrity-Pointer. This pointer type is designed to safeguard operations by monitoring the validity of the memory it points to. It can detect if the memory has been modified or released, thereby preventing unsafe access to stale or non-existent data.
The Integrity-Pointer not only acts as a regular pointer but also provides additional functionality to ensure the integrity of the data it references. Here are its key features:
```cpp
// Checks whether the pointer is still considered valid
bool isValid() const;
// Retrieves the raw pointer or nullptr if pointer is invalid
const T* get() const;
// Dereferences the pointer. Throws exception if the pointer is invalid.
const T& operator*() const;
//Provides access to the member of the pointed-to object. Throws exception if the pointer is invalid.
const T* operator->() const;
```
Consider the Following case:
```cpp
auto [coeff_view, size, device] = f.get_coefficients_view();
// Use the coefficients view to perform external operations
commit_to_polynomial(coeff_view.get(), size);
// Modification of the original polynomial
f += g; // Any operation that modifies 'f' potentially invalidates 'coeff_view'
// Check if the view is still valid before using it further
if (coeff_view.isValid()) {
perform_additional_computation(coeff_view.get(), size);
} else {
handle_invalid_data();
}
```
#### Evaluations View: Accessing Polynomial Evaluations Efficiently
The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
:::warning
Invalid request: requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
:::
```cpp
// Assume a polynomial `p` of degree N
auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
// Use the evaluations view to perform further computations or visualizations
process_polynomial_evaluations(evals_view.get(), size, device_id);
```
## Multi-GPU Support with CUDA Backend
The Polynomial API includes comprehensive support for multi-GPU environments, a crucial feature for leveraging the full computational power of systems equipped with multiple NVIDIA GPUs. This capability is part of the API's CUDA backend, which is designed to efficiently manage polynomial computations across different GPUs.
### Setting the CUDA Device
Like other components of the icicle framework, the Polynomial API allows explicit setting of the current CUDA device:
```cpp
cudaSetDevice(int deviceID);
```
This function sets the active CUDA device. All subsequent operations that allocate or deal with polynomial data will be performed on this device.
### Allocation Consistency
Polynomials are always allocated on the current CUDA device at the time of their creation. It is crucial to ensure that the device context is correctly set before initiating any operation that involves memory allocation:
```cpp
// Set the device before creating polynomials
cudaSetDevice(0);
Polynomial p1 = Polynomial::from_coefficients(coeffs, size);
cudaSetDevice(1);
Polynomial p2 = Polynomial::from_coefficients(coeffs, size);
```
### Matching Devices for Operations
When performing operations that result in the creation of new polynomials (such as addition or multiplication), it is imperative that both operands are on the same CUDA device. If the operands reside on different devices, an exception is thrown:
```cpp
// Ensure both operands are on the same device
cudaSetDevice(0);
auto p3 = p1 + p2; // Throws an exception if p1 and p2 are not on the same device
```
### Device-Agnostic Operations
Operations that do not involve the creation of new polynomials, such as computing the degree of a polynomial or performing in-place modifications, can be executed regardless of the current device setting:
```cpp
// 'degree' and in-place operations do not require device matching
int deg = p1.degree();
p1 += p2; // Valid if p1 and p2 are on the same device, throws otherwise
```
### Error Handling
The API is designed to throw exceptions if operations are attempted across polynomials that are not located on the same GPU. This ensures that all polynomial operations are performed consistently and without data integrity issues due to device mismatches.
### Best Practices
To maximize the performance and avoid runtime errors in a multi-GPU setup, always ensure that:
- The CUDA device is set correctly before polynomial allocation.
- Operations involving new polynomial creation are performed with operands on the same device.
By adhering to these guidelines, developers can effectively harness the power of multiple GPUs to handle large-scale polynomial computations efficiently.

View File

@@ -49,17 +49,13 @@ Accelerating MSM is crucial to a ZK protocol's performance due to the [large per
You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).
# Using MSM
## Supported curves
MSM supports the following curves:
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
## Supported Bindings
- [Golang](../golang-bindings/msm.md)
- [Rust](../rust-bindings//msm.md)
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Supported algorithms
@@ -83,6 +79,25 @@ Large Triangle Accumulation is a method for optimizing MSM which focuses on redu
The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.
### How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
```rust
...
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
// is_big_triangle will determine which algorithm to use
cfg_bls12377.is_big_triangle = true;
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## MSM Modes
ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
@@ -94,3 +109,54 @@ Batch MSM allows you to run many MSMs with a single API call, Single MSM will la
This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.
Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
### How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
```rust
...
let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
...
```
In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
...
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Support for G2 group
MSM also supports G2 group.
Using MSM in G2 requires a G2 config, and of course your points should also be G2 points.
```rust
...
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
...
```
Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.

View File

@@ -28,10 +28,6 @@ NTT supports the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
## Supported Bindings
- [Golang](../golang-bindings/ntt.md)
- [Rust](../rust-bindings/ntt.md)
### Examples
@@ -39,6 +35,87 @@ NTT supports the following curves:
- [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)
## NTT API overview
```rust
pub fn ntt<F>(
input: &HostOrDeviceSlice<F>,
dir: NTTDir,
cfg: &NTTConfig<F>,
output: &mut HostOrDeviceSlice<F>,
) -> IcicleResult<()>
```
`ntt::ntt` expects:
`input` - buffer to read the inputs of the NTT from. <br/>
`dir` - whether to compute forward or inverse NTT. <br/>
`cfg` - config used to specify extra arguments of the NTT. <br/>
`output` - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
### NTT Config
```rust
pub struct NTTConfig<'a, S> {
pub ctx: DeviceContext<'a>,
pub coset_gen: S,
pub batch_size: i32,
pub ordering: Ordering,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
pub is_async: bool,
pub ntt_algorithm: NttAlgorithm,
}
```
The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded in device memory. If `false`, inputs will be copied from host to device.
- **`are_outputs_on_device: bool`**: Indicates whether the output buffer resides in device memory. If `false`, outputs will be copied from device back to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select the `Radix2` or `MixedRadix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of that algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure which algorithm you want to run.
#### Usage
Example initialization with default settings:
```rust
let default_config = NTTConfig::default();
```
Customizing the configuration:
```rust
let custom_config = NTTConfig {
ctx: custom_device_context,
coset_gen: my_coset_generator,
batch_size: 10,
ordering: Ordering::kRN,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
ntt_algorithm: NttAlgorithm::MixedRadix,
};
```
### Ordering
The `Ordering` enum defines how inputs and outputs are arranged for the NTT operation, offering flexibility in handling data according to different algorithmic needs or compatibility requirements. It primarily affects the sequencing of data points for the transform, which can influence both performance and the compatibility with certain algorithmic approaches. The available ordering options are:
@@ -63,6 +140,15 @@ NTT also supports two different modes `Batch NTT` and `Single NTT`
Batch NTT allows you to run many NTTs with a single API call; Single NTT will launch a single NTT computation.
You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.batch_size = 10; // your NTT using this config will run in batch mode.
```
`batch_size=1` would keep our NTT in single NTT mode.
Deciding whether to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
**Single NTT Mode**
@@ -146,11 +232,9 @@ Mixed Radix can reduce the number of stages required to compute for large inputs
### Which algorithm should I choose ?
Both work only on inputs whose size is a power of 2 (e.g., 256, 512, 1024).
Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. It's also more suited for inputs which are a power of 2 (e.g., 256, 512, 1024). Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
Mixed radix on the other hand works better for larger NTTs with larger input sizes.
Mixed radix, on the other hand, works better for larger NTTs with larger input sizes which are not necessarily a power of 2.
Performance really depends on logn size, batch size, ordering, inverse, coset, coeff-field and which GPU you are using.

View File

@@ -6,6 +6,5 @@ This section of the documentation is dedicated to the ICICLE primitives, we will
## Supported primitives
- [MSM](./msm.md)
- [NTT](./ntt.md)
- [MSM](./msm)
- [Poseidon Hash](./poseidon.md)

View File

@@ -1,35 +0,0 @@
# ECNTT
### Supported curves
`bls12-377`, `bls12-381`, `bn254`
## ECNTT Method
The `ecntt` function computes the Elliptic Curve Number Theoretic Transform (EC-NTT) or its inverse on a batch of points of a curve.
```rust
pub fn ecntt<C: Curve>(
input: &(impl HostOrDeviceSlice<Projective<C>> + ?Sized),
dir: NTTDir,
cfg: &NTTConfig<C::ScalarField>,
output: &mut (impl HostOrDeviceSlice<Projective<C>> + ?Sized),
) -> IcicleResult<()>
where
C::ScalarField: FieldImpl,
<C::ScalarField as FieldImpl>::Config: ECNTT<C>,
{
// ... function implementation ...
}
```
## Parameters
- **`input`**: The input data as a slice of `Projective<C>`. This represents points on a specific elliptic curve `C`.
- **`dir`**: The direction of the NTT. It can be `NTTDir::kForward` for forward NTT or `NTTDir::kInverse` for inverse NTT.
- **`cfg`**: The NTT configuration object of type `NTTConfig<C::ScalarField>`. This object specifies parameters for the NTT computation, such as the batch size and algorithm to use.
- **`output`**: The output buffer to write the results into. This should be a slice of `Projective<C>` with the same size as the input.
## Return Value
- **`IcicleResult<()>`**: This function returns an `IcicleResult` which is a wrapper type that indicates success or failure of the NTT computation. On success, it contains `Ok(())`.

View File

@@ -1,63 +0,0 @@
# MSM Pre computation
To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).
### Supported curves
`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `Grumpkin`
### `precompute_bases`
Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.
```rust
pub fn precompute_bases<C: Curve + MSM<C>>(
points: &HostOrDeviceSlice<Affine<C>>,
precompute_factor: i32,
_c: i32,
ctx: &DeviceContext,
output_bases: &mut HostOrDeviceSlice<Affine<C>>,
) -> IcicleResult<()>
```
#### Parameters
- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.
#### Returns
`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
#### Description
This function extends each provided base point $(P)$ with its multiples $(2^lP, 2^{2l}P, ..., 2^{(precompute_factor - 1) \cdot l}P)$, where $(l)$ is a level of precomputation determined by the `precompute_factor`. The extended set of points facilitates faster MSM computations by allowing the MSM algorithm to leverage precomputed multiples of base points, reducing the number of point additions required during the computation.
The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.
#### Example Usage
```rust
let device_context = DeviceContext::default_for_device(0); // Use the default device
let precompute_factor = 4; // Number of points to precompute
let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");
// Precompute the bases using the specified factor
precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
.expect("Failed to precompute bases");
```
### Benchmarks
Benchmarks were performed on an Nvidia RTX 3090Ti.
| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
| 8 | 10.6 | 73.7 | 17.2 | 116.0 |

View File

@@ -1,172 +0,0 @@
# MSM
### Supported curves
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Example
```rust
use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarCfg};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
fn main() {
let size: usize = 1 << 10; // Define the number of points and scalars
// Generate random points and scalars
println!("Generating random G1 points and scalars for BN254...");
let points = CurveCfg::generate_random_affine_points(size);
let scalars = ScalarCfg::generate_random(size);
// Wrap points and scalars in HostOrDeviceSlice for MSM
let points_host = HostOrDeviceSlice::Host(points);
let scalars_host = HostOrDeviceSlice::Host(scalars);
// Allocate memory on the CUDA device for MSM results
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).expect("Failed to allocate CUDA memory for MSM results");
// Create a CUDA stream for asynchronous execution
let stream = CudaStream::create().expect("Failed to create CUDA stream");
let mut cfg = msm::MSMConfig::default();
cfg.ctx.stream = &stream;
cfg.is_async = true; // Enable asynchronous execution
// Execute MSM on the device
println!("Executing MSM on device...");
msm::msm(&scalars_host, &points_host, &cfg, &mut msm_results).expect("Failed to execute MSM");
// Synchronize CUDA stream to ensure MSM execution is complete
stream.synchronize().expect("Failed to synchronize CUDA stream");
// Optionally, move results to host for further processing or printing
println!("MSM execution complete.");
}
```
## MSM API Overview
```rust
pub fn msm<C: Curve>(
scalars: &HostOrDeviceSlice<C::ScalarField>,
points: &HostOrDeviceSlice<Affine<C>>,
cfg: &MSMConfig,
results: &mut HostOrDeviceSlice<Projective<C>>,
) -> IcicleResult<()>
```
### Parameters
- **`scalars`**: A buffer containing the scalar values to be multiplied with corresponding points.
- **`points`**: A buffer containing the points to be multiplied by the scalars.
- **`cfg`**: MSM configuration specifying additional parameters for the operation.
- **`results`**: A buffer where the results of the MSM operations will be stored.
### MSM Config
```rust
pub struct MSMConfig<'a> {
pub ctx: DeviceContext<'a>,
points_size: i32,
pub precompute_factor: i32,
pub c: i32,
pub bitsize: i32,
pub large_bucket_factor: i32,
batch_size: i32,
are_scalars_on_device: bool,
pub are_scalars_montgomery_form: bool,
are_points_on_device: bool,
pub are_points_montgomery_form: bool,
are_results_on_device: bool,
pub is_big_triangle: bool,
pub is_async: bool,
}
```
- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
- **`points_size: i32`**: An internal field; the wrapper sets it automatically from the length of the `points` buffer.
- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
- **`large_bucket_factor: i32`**: Adjusts the algorithm's sensitivity to frequently occurring buckets, useful for non-uniform scalar distributions.
- **`batch_size: i32`**: The number of MSMs to compute in a single batch, for leveraging parallelism.
- **`are_scalars_montgomery_form`**: Set to `true` if scalars are in Montgomery form.
- **`are_points_montgomery_form`**: Set to `true` if points are in Montgomery form.
- **`are_scalars_on_device: bool`**, **`are_points_on_device: bool`**, **`are_results_on_device: bool`**: Indicate whether the corresponding buffers are on the device memory.
- **`is_big_triangle`**: If `true`, MSM will run with large triangle accumulation; if `false`, bucket accumulation will be chosen. Default value: `false`.
- **`is_async: bool`**: Whether to perform the MSM operation asynchronously.
### Usage
The `msm` function is designed to compute the sum of multiple scalar-point multiplications efficiently. It supports both single MSM operations and batched operations for increased performance. The configuration allows for detailed control over the execution environment and performance characteristics of the MSM operation.
When performing MSM operations, it's crucial to match the size of the `scalars` and `points` arrays correctly and ensure that the `results` buffer is appropriately sized to hold the output. The `MSMConfig` should be set up to reflect the specifics of the operation, including whether the operation should be asynchronous and any device-specific settings.
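As a sketch of customizing the configuration (assuming `stream` is a `CudaStream` you created earlier; the values shown are purely illustrative):

```rust
let mut cfg = msm::MSMConfig::default();
cfg.ctx.stream = &stream;               // execute on your own CUDA stream
cfg.large_bucket_factor = 10;           // tune for skewed scalar distributions
cfg.are_scalars_montgomery_form = true; // inputs are kept in Montgomery form
cfg.is_async = true;                    // do not block the host thread
```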
## How do I toggle between the supported algorithms?
When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
```rust
...
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
// is_big_triangle will determine which algorithm to use
cfg_bls12377.is_big_triangle = true;
msm::msm(&scalars, &points, &cfg_bls12377, &mut msm_results).unwrap();
...
```
You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
## How do I toggle between MSM modes?
Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
```rust
...
let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
...
```
In the example above we allocate a single expected result, which the MSM method will interpret as `batch_size=1`, running a single MSM.
In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
```rust
...
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
...
```
Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
## Support for G2 group
MSM also supports the G2 group.
Using MSM on G2 requires a G2 config, and your points should also be G2 points.
```rust
...
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
...
```
Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.

View File

@@ -4,54 +4,6 @@ To learn more about the theory of Multi GPU programming refer to [this part](../
Here we will cover the core multi GPU APIs and an [example](#a-multi-gpu-example).
## A Multi GPU example
In this example we will show how you can:
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute a MSM on each GPU
```rust
...
let device_count = get_device_count().unwrap();
(0..device_count)
.into_par_iter()
.for_each(move |device_id| {
set_device(device_id).unwrap();
// you can allocate points and scalars_d here
let mut cfg = MSMConfig::default_for_device(device_id);
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_scalars_montgomery_form = true;
msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
// collect and process results
})
...
```
We use `get_device_count` to fetch the number of connected devices, device IDs will be `0, 1, 2, ..., device_count - 1`
[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator; you can expect the iterations to run in parallel across Rayon's thread pool.
We then call `set_device(device_id).unwrap();`, which sets the context of that thread to the selected `device_id`.
Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`; behind the scenes this will create for us a `DeviceContext` configured for that specific GPU.
We finally call our `msm` method.
## Device management API
To streamline device management we offer as part of `icicle-cuda-runtime` package methods for dealing with devices.
@@ -62,11 +14,11 @@ Sets the current CUDA device by its ID, when calling `set_device` it will set th
**Parameters:**
- **`device_id: usize`**: The ID of the device to set as the current device. Device IDs start from 0.
- `device_id: usize`: The ID of the device to set as the current device. Device IDs start from 0.
**Returns:**
- **`CudaResult<()>`**: An empty result indicating success if the device is set successfully. In case of failure, returns a `CudaError`.
- `CudaResult<()>`: An empty result indicating success if the device is set successfully. In case of failure, returns a `CudaError`.
**Errors:**
@@ -88,7 +40,7 @@ Retrieves the number of CUDA devices available on the machine.
**Returns:**
- **`CudaResult<usize>`**: The number of available CUDA devices. On success, contains the count of CUDA devices. On failure, returns a `CudaError`.
- `CudaResult<usize>`: The number of available CUDA devices. On success, contains the count of CUDA devices. On failure, returns a `CudaError`.
**Errors:**
@@ -109,7 +61,7 @@ Retrieves the ID of the current CUDA device.
**Returns:**
- **`CudaResult<usize>`**: The ID of the current CUDA device. On success, contains the device ID. On failure, returns a `CudaError`.
- `CudaResult<usize>`: The ID of the current CUDA device. On success, contains the device ID. On failure, returns a `CudaError`.
**Errors:**
@@ -191,7 +143,7 @@ Validates that the specified `device_id` matches the ID of the currently active
#### Behavior
- **`Panics`** if the `device_id` does not match the active device's ID, preventing cross-device operation errors.
- **Panics** if the `device_id` does not match the active device's ID, preventing cross-device operation errors.
#### Example
@@ -200,3 +152,50 @@ let device_id: i32 = 0; // Example device ID
check_device(device_id);
// Ensures that the current context is correctly set for the specified device ID.
```
## A Multi GPU example
In this example we will show how you can:
1. Fetch the number of devices installed on a machine
2. For every GPU launch a thread and set an active device per thread.
3. Execute a MSM on each GPU
```rust
...
let device_count = get_device_count().unwrap();
(0..device_count)
.into_par_iter()
.for_each(move |device_id| {
set_device(device_id).unwrap();
// you can allocate points and scalars_d here
let mut cfg = MSMConfig::default_for_device(device_id);
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_scalars_montgomery_form = true;
msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
// collect and process results
})
...
```
We use `get_device_count` to fetch the number of connected devices, device IDs will be `0, 1, 2, ..., device_count - 1`
[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator; you can expect the iterations to run in parallel across Rayon's thread pool.
We then call `set_device(device_id).unwrap();`, which sets the context of that thread to the selected `device_id`.
Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`; behind the scenes this will create for us a `DeviceContext` configured for that specific GPU.
We finally call our `msm` method.

View File

@@ -1,208 +0,0 @@
# NTT
### Supported curves
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
## Example
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::{ntt::{self, NTT}, traits::GenerateRandom};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
fn main() {
let size = 1 << 12; // Define the size of your input, e.g., 2^12
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(size.try_into().unwrap())
    .unwrap(); // assumes `use ark_bn254::Fr as Bn254Fr;` and `use ark_ff::FftField;`
// Generate random inputs
println!("Generating random inputs...");
let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
// Allocate memory on CUDA device for NTT results
let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate CUDA memory");
// Create a CUDA stream
let stream = CudaStream::create().expect("Failed to create CUDA stream");
let ctx = DeviceContext::default(); // Assuming default device context
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
// Configure NTT
let mut cfg = ntt::NTTConfig::default();
cfg.ctx.stream = &stream;
cfg.is_async = true; // Set to true for asynchronous execution
// Execute NTT on device
println!("Executing NTT on device...");
ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).expect("Failed to execute NTT");
// Synchronize CUDA stream to ensure completion
stream.synchronize().expect("Failed to synchronize CUDA stream");
// Optionally, move results to host for further processing or verification
println!("NTT execution complete.");
}
```
## NTT API overview
```rust
pub fn ntt<F>(
input: &HostOrDeviceSlice<F>,
dir: NTTDir,
cfg: &NTTConfig<F>,
output: &mut HostOrDeviceSlice<F>,
) -> IcicleResult<()>
```
`ntt::ntt` expects:
- **`input`** - buffer to read the inputs of the NTT from. <br/>
- **`dir`** - whether to compute forward or inverse NTT. <br/>
- **`cfg`** - config used to specify extra arguments of the NTT. <br/>
- **`output`** - buffer to write the NTT outputs into. Must be of the same size as input.
The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
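As a sketch, keeping both buffers on the device avoids those implicit transfers (this assumes the domain is already initialized and `scalars` is a host `Vec<ScalarField>` of length `size`):

```rust
let mut input_d: HostOrDeviceSlice<'_, ScalarField> =
    HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate device memory");
input_d.copy_from_host(&scalars).expect("Failed to copy inputs to device");
let mut output_d: HostOrDeviceSlice<'_, ScalarField> =
    HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate device memory");

let cfg = ntt::NTTConfig::default();
ntt::ntt(&input_d, ntt::NTTDir::kForward, &cfg, &mut output_d).expect("Failed to execute NTT");
// No host<->device copies happen inside the call in this case.
```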
### NTT Config
```rust
pub struct NTTConfig<'a, S> {
pub ctx: DeviceContext<'a>,
pub coset_gen: S,
pub batch_size: i32,
pub columns_batch: bool,
pub ordering: Ordering,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
pub is_async: bool,
pub ntt_algorithm: NttAlgorithm,
}
```
The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
- **`columns_batch`**: If `true`, the function will compute the NTTs over the columns of the input matrix rather than the rows. Defaults to `false`.
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
- **`are_outputs_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false`, outputs will be copied from device to host. If the inputs and outputs are the same pointer, the NTT will be computed in place.
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure which algorithm you want to use.
#### Usage
Example initialization with default settings:
```rust
let default_config = NTTConfig::default();
```
Customizing the configuration:
```rust
let custom_config = NTTConfig {
ctx: custom_device_context,
coset_gen: my_coset_generator,
batch_size: 10,
columns_batch: false,
ordering: Ordering::kRN,
are_inputs_on_device: true,
are_outputs_on_device: true,
is_async: false,
ntt_algorithm: NttAlgorithm::MixedRadix,
};
```
### Modes
NTT supports two different modes: `Batch NTT` and `Single NTT`
You may toggle between single and batch NTT by simply configuring `batch_size` to be larger than 1 in your `NTTConfig`.
```rust
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.batch_size = 10; // your NTT using this config will run in batch mode.
```
`batch_size=1` would keep our NTT in single NTT mode.
Deciding whether to use `batch NTT` or `single NTT` is highly dependent on your application and use case.
### Initializing the NTT Domain
Before performing NTT operations, it's necessary to initialize the NTT domain; this only needs to be done once per GPU since the twiddles are cached.
```rust
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
```
### `initialize_domain`
```rust
pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext, fast_twiddles: bool) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: NTT<F>;
```
#### Parameters
- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
- **`fast_twiddles`**: When `true`, precomputes additional twiddle factors, trading extra device memory for faster NTTs.
#### Returns
- **`IcicleResult<()>`**: Will return an error if the operation fails.
### Releasing the domain
The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
```rust
pub fn release_domain<F>(ctx: &DeviceContext) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: NTT<F>
```
#### Parameters
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
#### Returns
The function returns an `IcicleResult<()>`, which represents the result of the operation. If the operation is successful, the function returns `Ok(())`, otherwise it returns an error.
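A minimal usage sketch, assuming the BN254 scalar field and a domain that was previously initialized on the default device:
```rust
use icicle_bn254::curve::ScalarField;
use icicle_cuda_runtime::device_context::DeviceContext;

let ctx = DeviceContext::default_for_device(0);
// Frees the cached twiddles for this field on the selected device.
release_domain::<ScalarField>(&ctx).expect("Failed to release NTT domain");
```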

View File

@@ -1,261 +0,0 @@
:::note Please refer to the Polynomials overview page for a deep overview. This section is a brief description of the Rust FFI bindings.
:::
# Rust FFI Bindings for Univariate Polynomial
This documentation is designed to provide developers with a clear understanding of how to utilize the Rust bindings for polynomial operations efficiently and effectively, leveraging the robust capabilities of both Rust and C++ in their applications.
## Introduction
The Rust FFI bindings for the Univariate Polynomial serve as a "shallow wrapper" around the underlying C++ implementation. These bindings provide a straightforward Rust interface that directly calls functions from a C++ library, effectively bridging Rust and C++ operations. The Rust layer handles simple interface translations without delving into complex logic or data structures, which are managed on the C++ side. This design ensures efficient data handling, memory management, and execution of polynomial operations directly via C++.
Currently, these bindings are tailored specifically for polynomials where the coefficients, domain, and images are represented as scalar fields.
## Initialization Requirements
Before utilizing any functions from the polynomial API, it is mandatory to initialize the appropriate polynomial backend (e.g., CUDA). Additionally, the NTT (Number Theoretic Transform) domain must also be initialized, as the CUDA backend relies on this for certain operations. Failing to properly initialize these components can result in errors.
:::note
**Field-Specific Initialization Requirement**
The ICICLE library is structured such that each field or curve has its dedicated library implementation. As a result, initialization must be performed individually for each field or curve to ensure the correct setup and functionality of the library.
:::
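A minimal initialization sketch, assuming the BabyBear field; here `rou` is a stand-in for a primitive root of unity matching your maximum NTT size:
```rust
use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
use icicle_core::ntt::initialize_domain;
use icicle_cuda_runtime::device_context::DeviceContext;

// Initialize the CUDA polynomial backend once per process.
PolynomialBabyBear::init_cuda_backend();
// Initialize the NTT domain the backend relies on (once per GPU).
let ctx = DeviceContext::default_for_device(0);
initialize_domain(rou, &ctx, false /* fast_twiddles */).expect("Failed to initialize NTT domain");
```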
## Core Trait: `UnivariatePolynomial`
The `UnivariatePolynomial` trait encapsulates the essential functionalities required for managing univariate polynomials in the Rust ecosystem. This trait standardizes the operations that can be performed on polynomials, regardless of the underlying implementation details. It allows for a unified approach to polynomial manipulation, providing a suite of methods that are fundamental to polynomial arithmetic.
### Trait Definition
```rust
pub trait UnivariatePolynomial
where
Self::Field: FieldImpl,
Self::FieldConfig: FieldConfig,
{
type Field: FieldImpl;
type FieldConfig: FieldConfig;
// Methods to create polynomials from coefficients or roots-of-unity evaluations.
fn from_coeffs<S: HostOrDeviceSlice<Self::Field> + ?Sized>(coeffs: &S, size: usize) -> Self;
fn from_rou_evals<S: HostOrDeviceSlice<Self::Field> + ?Sized>(evals: &S, size: usize) -> Self;
// Method to divide this polynomial by another, returning quotient and remainder.
fn divide(&self, denominator: &Self) -> (Self, Self) where Self: Sized;
// Method to divide this polynomial by the vanishing polynomial 'X^N-1'.
fn div_by_vanishing(&self, degree: u64) -> Self;
// Methods to add or subtract a monomial in-place.
fn add_monomial_inplace(&mut self, monomial_coeff: &Self::Field, monomial: u64);
fn sub_monomial_inplace(&mut self, monomial_coeff: &Self::Field, monomial: u64);
// Method to slice the polynomial, creating a sub-polynomial.
fn slice(&self, offset: u64, stride: u64, size: u64) -> Self;
// Methods to return new polynomials containing only the even or odd terms.
fn even(&self) -> Self;
fn odd(&self) -> Self;
// Method to evaluate the polynomial at a given domain point.
fn eval(&self, x: &Self::Field) -> Self::Field;
// Method to evaluate the polynomial over a domain and store the results.
fn eval_on_domain<D: HostOrDeviceSlice<Self::Field> + ?Sized, E: HostOrDeviceSlice<Self::Field> + ?Sized>(
&self,
domain: &D,
evals: &mut E,
);
// Method to retrieve a coefficient at a specific index.
fn get_coeff(&self, idx: u64) -> Self::Field;
// Method to copy coefficients into a provided slice.
fn copy_coeffs<S: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, start_idx: u64, coeffs: &mut S);
// Method to get the degree of the polynomial.
fn degree(&self) -> i64;
}
```
## `DensePolynomial` Struct
The DensePolynomial struct represents a dense univariate polynomial in Rust, leveraging a handle to manage its underlying memory within the CUDA device context. This struct acts as a high-level abstraction over complex C++ memory management practices, facilitating the integration of high-performance polynomial operations through Rust's Foreign Function Interface (FFI) bindings.
```rust
pub struct DensePolynomial {
handle: PolynomialHandle,
}
```
### Traits implementation and methods
#### `Drop`
Ensures proper resource management by releasing the CUDA memory when a DensePolynomial instance goes out of scope. This prevents memory leaks and ensures that resources are cleaned up correctly, adhering to Rust's RAII (Resource Acquisition Is Initialization) principles.
#### `Clone`
Provides a way to create a new instance of a DensePolynomial with its own unique handle, thus duplicating the polynomial data in the CUDA context. Cloning is essential since the DensePolynomial manages external resources, which cannot be safely shared across instances without explicit duplication.
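For example (a sketch, where `f` is any existing `DensePolynomial`):
```rust
let g = f.clone(); // duplicates f's coefficients in device memory
drop(f);           // releases f's CUDA memory; g stays valid and independent
```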
#### Operator Overloading: `Add`, `Sub`, `Mul`, `Rem`, `Div`
These traits are implemented for references to DensePolynomial (i.e., &DensePolynomial), enabling natural mathematical operations such as addition (+), subtraction (-), multiplication (*), division (/), and remainder (%). This syntactic convenience allows users to compose complex polynomial expressions in a way that is both readable and expressive.
#### Key Methods
In addition to the traits, the following methods are implemented:
```rust
impl DensePolynomial {
pub fn init_cuda_backend() -> bool {...}
// Returns a mutable slice of the polynomial coefficients on the device
pub fn coeffs_mut_slice(&mut self) -> &mut DeviceSlice<F> {...}
}
```
:::note Might be consolidated with `UnivariatePolynomial` trait
:::
## Flexible Memory Handling With `HostOrDeviceSlice`
The DensePolynomial API is designed to accommodate a wide range of computational environments by supporting both host and device memory through the `HostOrDeviceSlice` trait. This approach ensures that polynomial operations can be seamlessly executed regardless of where the data resides, making the API highly adaptable and efficient for various hardware configurations.
### Overview of `HostOrDeviceSlice`
The HostOrDeviceSlice is a Rust trait that abstracts over slices of memory that can either be on the host (CPU) or the device (GPU), as managed by CUDA. This abstraction is crucial for high-performance computing scenarios where data might need to be moved between different memory spaces depending on the operations being performed and the specific hardware capabilities available.
### Usage in API Functions
Functions within the DensePolynomial API that deal with polynomial coefficients or evaluations use the HostOrDeviceSlice trait to accept inputs. This design allows the functions to be agnostic of the actual memory location of the data, whether it's in standard system RAM accessible by the CPU or in GPU memory accessible by CUDA cores.
```rust
// Assume `coeffs` could either be in host memory or CUDA device memory
let coeffs: DeviceVec<F> = DeviceVec::<F>::cuda_malloc(coeffs_len).unwrap();
let p_from_coeffs = PolynomialBabyBear::from_coeffs(&coeffs, coeffs.len());
// Similarly for evaluations from roots of unity
let evals: &HostSlice<F> = HostSlice::from_slice(&host_memory_evals);
let p_from_evals = PolynomialBabyBear::from_rou_evals(&evals, evals.len());
// Same applies for any API that accepts HostOrDeviceSlice
```
## Usage
This section outlines practical examples demonstrating how to utilize the `DensePolynomial` Rust API. The API is flexible, supporting multiple scalar fields. Below are examples showing how to use polynomials defined over different fields and perform a variety of operations.
### Initialization and Basic Operations
First, choose the appropriate field implementation for your polynomial operations, initializing the CUDA backend if necessary
```rust
use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
// Initialize the CUDA backend for polynomial operations
PolynomialBabyBear::init_cuda_backend();
let f = PolynomialBabyBear::from_coeffs(...);
// now use f by calling the implemented traits
// For operations over another field, such as BN254
use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
// Use PolynomialBn254 similarly
```
### Creation
Polynomials can be created from coefficients or evaluations:
```rust
// Assume F is the field type (e.g. icicle_bn254::curve::ScalarField or a type parameter)
let coeffs = ...;
let p_from_coeffs = PolynomialBabyBear::from_coeffs(HostSlice::from_slice(&coeffs), size);
let evals = ...;
let p_from_evals = PolynomialBabyBear::from_rou_evals(HostSlice::from_slice(&evals), size);
```
### Arithmetic Operations
Utilize overloaded operators for intuitive mathematical expressions:
```rust
let add = &f + &g; // Addition
let sub = &f - &g; // Subtraction
let mul = &f * &g; // Multiplication
let mul_scalar = &f * &scalar; // Scalar multiplication
```
### Division and Remainder
Compute quotient and remainder or perform division by a vanishing polynomial:
```rust
let (q, r) = f.divide(&g); // Compute both quotient and remainder
let q = &f / &g; // Quotient
let r = &f % &g; // Remainder
let h = f.div_by_vanishing(N); // Division by V(x) = X^N - 1
```
### Monomial Operations
Add or subtract monomials in-place for efficient polynomial manipulation:
```rust
f.add_monomial_inplace(&three, 1 /*monomial*/); // Adds 3*x to f
f.sub_monomial_inplace(&one, 0 /*monomial*/); // Subtracts 1 from f
```
### Slicing
Extract specific components:
```rust
let even = f.even(); // Polynomial of even-indexed terms
let odd = f.odd(); // Polynomial of odd-indexed terms
let arbitrary_slice = f.slice(offset, stride, size);
```
### Evaluate
Evaluate the polynomial:
```rust
let x = rand(); // Random field element
let f_x = f.eval(&x); // Evaluate f at x
// Evaluate on a predefined domain
let domain = [one, two, three];
let mut host_evals = vec![ScalarField::zero(); domain.len()];
f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));
```
### Read coefficients
Read or copy polynomial coefficients for further processing:
```rust
let x_squared_coeff = f.get_coeff(2); // Coefficient of x^2
// Copy coefficients to a device-specific memory space
let mut device_mem = DeviceVec::<Field>::cuda_malloc(coeffs.len()).unwrap();
f.copy_coeffs(0, &mut device_mem[..]);
```
### Polynomial Degree
Determine the highest power of the variable with a non-zero coefficient:
```rust
let deg = f.degree(); // Degree of the polynomial
```
### Memory Management: Views (rust slices)
Rust enforces correct usage of views at compile time, eliminating the need for runtime checks:
```rust
let mut f = Poly::from_coeffs(HostSlice::from_slice(&coeffs), size);
// Obtain a mutable slice of coefficients as a DeviceSlice
let coeffs_slice_dev = f.coeffs_mut_slice();
// Operations on f are restricted here due to mutable borrow of coeffs_slice_dev
// Compute evaluations or perform other operations directly using the slice
// example: evaluate f on a coset of roots of unity, computing from GPU memory into host (or GPU) memory
let mut config: NTTConfig<'_, F> = NTTConfig::default();
config.coset_gen = /*some coset gen*/;
let mut coset_evals = vec![F::zero(); coeffs_slice_dev.len()];
ntt(
coeffs_slice_dev,
NTTDir::kForward,
&config,
HostSlice::from_mut_slice(&mut coset_evals),
)
.unwrap();
// now f can be borrowed once again
```

View File

@@ -1,217 +0,0 @@
# Vector Operations API
Our vector operations API, which is part of the `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
## Supported curves
Vector operations are supported on the following curves:
`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
## Examples
### Addition of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{add_scalars, VecOpsConfig};
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
let cfg = VecOpsConfig::default();
add_scalars(&a, &b, &mut result, &cfg).unwrap();
```
### Subtraction of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{sub_scalars, VecOpsConfig};
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
let cfg = VecOpsConfig::default();
sub_scalars(&a, &b, &mut result, &cfg).unwrap();
```
### Multiplication of Scalars
```rust
use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_core::traits::{FieldImpl, GenerateRandom};
use icicle_core::vec_ops::{mul_scalars, VecOpsConfig};
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
let test_size = 1 << 18;
let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(ScalarCfg::generate_random(test_size));
let ones: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::one(); test_size]);
let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![ScalarField::zero(); test_size]);
let cfg = VecOpsConfig::default();
mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
```
## Vector Operations Configuration
The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
### `VecOpsConfig`
Defines configuration parameters for vector operations.
```rust
pub struct VecOpsConfig<'a> {
pub ctx: DeviceContext<'a>,
is_a_on_device: bool,
is_b_on_device: bool,
is_result_on_device: bool,
pub is_async: bool,
}
```
#### Fields
- **`ctx: DeviceContext<'a>`**: Specifies the device context for the operation, including the device ID and memory pool.
- **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
- **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
- **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
- **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.
### Default Configuration
`VecOpsConfig` can be initialized with default settings tailored for a specific device:
```rust
let cfg = VecOpsConfig::default();
```
These are the default settings.
```rust
impl<'a> Default for VecOpsConfig<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
}
}
impl<'a> VecOpsConfig<'a> {
pub fn default_for_device(device_id: usize) -> Self {
VecOpsConfig {
ctx: DeviceContext::default_for_device(device_id),
is_a_on_device: false,
is_b_on_device: false,
is_result_on_device: false,
is_async: false,
}
}
}
```
## Vector Operations
Vector operations are implemented through the `VecOps` trait, which is implemented for all [supported curves](#supported-curves), providing methods for addition, subtraction, and multiplication of vectors.
### `VecOps` Trait
```rust
pub trait VecOps<F> {
fn add(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn sub(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
fn mul(
a: &HostOrDeviceSlice<F>,
b: &HostOrDeviceSlice<F>,
result: &mut HostOrDeviceSlice<F>,
cfg: &VecOpsConfig,
) -> IcicleResult<()>;
}
```
#### Methods
All operations are element-wise, and the results are placed into the `result` parameter. These operations are not in place.
- **`add`**: Computes the element-wise sum of two vectors.
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.
## MatrixTranspose API Documentation
This section describes the functionality of the `transpose_matrix` function used for matrix transposition.
The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
### Function
```rust
pub fn transpose_matrix<F>(
input: &HostOrDeviceSlice<F>,
row_size: u32,
column_size: u32,
output: &mut HostOrDeviceSlice<F>,
ctx: &DeviceContext,
on_device: bool,
is_async: bool,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: VecOps<F>
```
### Parameters
- **`input`**: A slice representing the input matrix. The slice can be stored on either the host or the device.
- **`row_size`**: The number of rows in the input matrix.
- **`column_size`**: The number of columns in the input matrix.
- **`output`**: A mutable slice to store the transposed matrix. The slice can be stored on either the host or the device.
- **`ctx`**: A reference to the `DeviceContext`, which provides information about the device where the operation will be performed.
- **`on_device`**: A boolean flag indicating whether the inputs and outputs are on the device.
- **`is_async`**: A boolean flag indicating whether the operation should be performed asynchronously.
### Return Value
`Ok(())` if the operation is successful, or an `IcicleResult` error otherwise.
### Example
```rust
// import paths are indicative; adjust to your icicle version
use icicle_core::traits::FieldImpl;
use icicle_core::vec_ops::{transpose_matrix, VecOps};
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
// `F` is any field type whose config implements `VecOps<F>`
let input: HostOrDeviceSlice<F> = // ...;
let mut output: HostOrDeviceSlice<F> = // ...;
let ctx: DeviceContext = // ...;
transpose_matrix(&input, 5, 4, &mut output, &ctx, true, false)
.expect("Failed to transpose matrix");
```
The function takes a matrix represented as a 1D slice, transposes it, and stores the result in another 1D slice. The input and output slices can be stored on either the host or the device, and the operation can be performed synchronously or asynchronously.
The function is generic and can work with any type `F` that implements the `FieldImpl` trait. The `<F as FieldImpl>::Config` type must also implement the `VecOps<F>` trait, which provides the `transpose` method used to perform the actual transposition.
The function returns an `IcicleResult<()>`, indicating whether the operation was successful or not.

View File

@@ -0,0 +1,86 @@
# Supporting Additional Curves
We understand the need for ZK developers to use different curves, some common, some more exotic. For this reason we designed ICICLE to allow developers to add any curve they desire.
## ICICLE Core
ICICLE Core is very generic by design, so all algorithms and primitives are designed to work based on configuration files [selected during compile time](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh). This is why we compile ICICLE Core per curve.
To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29), we need to add a new curve id.
```
...
#define BN254 1
#define BLS12_381 2
#define BLS12_377 3
#define BW6_761 4
#define GRUMPKIN 5
#define <curve_name> 6
...
```
Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/icicle/curves/curve_config.cuh#L16-L29) accordingly.
Finally, we must modify the [`make` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.
```
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;<curve_name>)
```
## Bindings
In order to support a new curve in the binding libraries, you first must support it in ICICLE Core.
### Rust
Create a new folder named `icicle-<curve_name>` under the [rust wrappers folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves). Your new directory should look like this.
```
└── rust
    ├── icicle-curves
    │   ├── icicle-<curve_name>
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   └── src/
    │   │       ├── curve.rs
    │   │       ├── lib.rs
    │   │       ├── msm/
    │   │       │   └── mod.rs
    │   │       └── ntt/
    │   │           └── mod.rs
```
Let's look at [`ntt/mod.rs`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs) for example.
```
...
extern "C" {
#[link_name = "bn254NTTCuda"]
fn ntt_cuda<'a>(
input: *const ScalarField,
size: usize,
is_inverse: bool,
config: &NTTConfig<'a, ScalarField>,
output: *mut ScalarField,
) -> CudaError;
#[link_name = "bn254DefaultNTTConfig"]
fn default_ntt_config() -> NTTConfig<'static, ScalarField>;
#[link_name = "bn254InitializeDomain"]
fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
}
...
```
Here you would need to replace `bn254NTTCuda` with `<curve_name>NTTCuda`. Most of these changes are pretty straightforward. One thing you should pay attention to is limb sizes, as these change between curves. For example `BN254` [has a limb size of 8](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs#L15), but for your curve this may be different. A sketch of the renamed block is shown below.
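As a hypothetical sketch for a curve named `mycurve` (the link names and limb sizes must match what your ICICLE Core build actually exports):
```
...
extern "C" {
    #[link_name = "mycurveNTTCuda"]
    fn ntt_cuda<'a>(
        input: *const ScalarField,
        size: usize,
        is_inverse: bool,
        config: &NTTConfig<'a, ScalarField>,
        output: *mut ScalarField,
    ) -> CudaError;
}
...
```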
### Golang
Golang is WIP in v1, coming soon. Please check out a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for Golang bindings.

View File

@@ -9,7 +9,7 @@ const config = {
title: 'Ingonyama Developer Documentation',
tagline: 'Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications.',
url: 'https://dev.ingonyama.com/',
baseUrl: '/',
baseUrl: '/icicle/',
onBrokenLinks: 'throw',
onBrokenMarkdownLinks: 'warn',
favicon: 'img/logo.png',
@@ -29,13 +29,13 @@ const config = {
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
sidebarPath: require.resolve('./sidebars.js'),
editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
blog: {
remarkPlugins: [math, require('mdx-mermaid')],
rehypePlugins: [katex],
showReadingTime: true,
editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
},
pages: {},
theme: {

9761
docs/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -21,55 +21,13 @@ module.exports = {
},
{
type: "doc",
label: "ICICLE Core",
id: "icicle/core",
label: "ICICLE Provers",
id: "icicle/integrations"
},
{
type: "category",
type: "doc",
label: "Golang bindings",
link: {
type: `doc`,
id: "icicle/golang-bindings",
},
collapsed: true,
items: [
{
type: "category",
label: "MSM",
link: {
type: `doc`,
id: "icicle/golang-bindings/msm",
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM pre computation",
id: "icicle/golang-bindings/msm-pre-computation",
}
]
},
{
type: "doc",
label: "NTT",
id: "icicle/golang-bindings/ntt",
},
{
type: "doc",
label: "EC-NTT",
id: "icicle/golang-bindings/ecntt",
},
{
type: "doc",
label: "Vector operations",
id: "icicle/golang-bindings/vec-ops",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/golang-bindings/multi-gpu",
},
]
id: "icicle/golang-bindings",
},
{
type: "category",
@@ -80,48 +38,12 @@ module.exports = {
},
collapsed: true,
items: [
{
type: "category",
label: "MSM",
link: {
type: `doc`,
id: "icicle/rust-bindings/msm",
},
collapsed: true,
items: [
{
type: "doc",
label: "MSM pre computation",
id: "icicle/rust-bindings/msm-pre-computation",
}
]
},
{
type: "doc",
label: "NTT",
id: "icicle/rust-bindings/ntt",
},
{
type: "doc",
label: "EC-NTT",
id: "icicle/rust-bindings/ecntt",
},
{
type: "doc",
label: "Vector operations",
id: "icicle/rust-bindings/vec-ops",
},
{
type: "doc",
label: "Multi GPU Support",
id: "icicle/rust-bindings/multi-gpu",
},
{
type: "doc",
label: "Polynomials",
id: "icicle/rust-bindings/polynomials",
},
],
}
]
},
{
type: "category",
@@ -137,23 +59,18 @@ module.exports = {
label: "MSM",
id: "icicle/primitives/msm",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
},
{
type: "doc",
label: "Poseidon Hash",
id: "icicle/primitives/poseidon",
},
{
type: "doc",
label: "NTT",
id: "icicle/primitives/ntt",
}
],
},
{
type: "doc",
label: "Polynomials",
id: "icicle/polynomials/overview",
},
{
type: "doc",
label: "Multi GPU Support",
@@ -161,13 +78,13 @@ module.exports = {
},
{
type: "doc",
label: "Google Colab Instructions",
id: "icicle/colab-instructions",
label: "Supporting additional curves",
id: "icicle/supporting-additional-curves",
},
{
type: "doc",
label: "ICICLE Provers",
id: "icicle/integrations"
label: "Google Colab Instructions",
id: "icicle/colab-instructions",
},
]
},

View File

@@ -8,16 +8,18 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,7 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -2,8 +2,11 @@
#include <iostream>
#include <iomanip>
#include "api/bn254.h"
using namespace bn254;
#define G2_DEFINED
#define CURVE_ID 1
// include MSM template
#include "appUtils/msm/msm.cu"
using namespace curve_config;
int main(int argc, char* argv[])
{
@@ -21,10 +24,11 @@ int main(int argc, char* argv[])
scalar_t* scalars = new scalar_t[N];
affine_t* points = new affine_t[N];
projective_t result;
scalar_t::rand_host_many(scalars, N);
projective_t::rand_host_many_affine(points, N);
scalar_t::RandHostMany(scalars, N);
projective_t::RandHostManyAffine(points, N);
std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
// auto config = msm::DefaultMSMConfig();
device_context::DeviceContext ctx = device_context::get_default_device_context();
msm::MSMConfig config = {
ctx, // ctx
@@ -45,9 +49,28 @@ int main(int argc, char* argv[])
config.batch_size = batch_size;
std::cout << "Running MSM kernel with on-host inputs" << std::endl;
// Create two events to time the MSM kernel
cudaStream_t stream = config.ctx.stream;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on the stream
cudaEventRecord(start, stream);
// Execute the MSM kernel
bn254_msm_cuda(scalars, points, msm_size, config, &result);
msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, msm_size, config, &result);
// Record the stop event on the stream
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
// Calculate the elapsed time between the start and stop events
cudaEventElapsedTime(&time, start, stop);
// Destroy the events
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Print the elapsed time
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
// Print the result
std::cout << projective_t::to_affine(result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
@@ -66,9 +89,24 @@ int main(int argc, char* argv[])
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
// Create two events to time the MSM kernel
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on the stream
cudaEventRecord(start, stream);
// Execute the MSM kernel
bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);
msm::MSM<scalar_t, affine_t, projective_t>(scalars_d, points_d, msm_size, config, result_d);
// Record the stop event on the stream
cudaEventRecord(stop, stream);
// Wait for the stop event to complete
cudaEventSynchronize(stop);
// Calculate the elapsed time between the start and stop events
cudaEventElapsedTime(&time, start, stop);
// Destroy the events
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Print the elapsed time
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
// Copy the result back to the host
cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
// Print the result
@@ -85,14 +123,23 @@ int main(int argc, char* argv[])
std::cout << "Generating random inputs on-host" << std::endl;
// use the same scalars
g2_affine_t* g2_points = new g2_affine_t[N];
g2_projective_t::rand_host_many_affine(g2_points, N);
g2_projective_t::RandHostManyAffine(g2_points, N);
std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
config.are_results_on_device = false;
config.are_scalars_on_device = false;
config.are_points_on_device = false;
g2_projective_t g2_result;
bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, stream);
msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars, g2_points, msm_size, config, &g2_result);
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;
std::cout << "Copying inputs on-device" << std::endl;
@@ -110,7 +157,16 @@ int main(int argc, char* argv[])
config.are_points_on_device = true;
std::cout << "Running MSM kernel with on-device inputs" << std::endl;
bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, stream);
msm::MSM<scalar_t, g2_affine_t, g2_projective_t>(scalars_d, g2_points_d, msm_size, config, g2_result_d);
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
std::cout << "Kernel runtime: " << std::fixed << std::setprecision(3) << time * 1e-3 << " sec." << std::endl;
cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
std::cout << g2_projective_t::to_affine(g2_result) << std::endl;

View File

@@ -1,2 +1,2 @@
#!/bin/bash
./build/example/example
./build/example

View File

@@ -14,13 +14,11 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,7 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -1,13 +1,16 @@
#include <iostream>
#include <thread>
#include <chrono>
#include <nvml.h>
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"
using namespace poseidon;
using namespace bn254;
using namespace curve_config;
void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
@@ -36,7 +39,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition,
false, // loop_state
false, // is_async
};
cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
checkCudaError(err);
}
@@ -106,13 +109,13 @@ int main() {
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);

View File

@@ -1,2 +1,2 @@
#!/bin/bash
./build/example/example
./build/example

View File

@@ -8,17 +8,17 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,7 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -3,21 +3,22 @@
#include <chrono>
#include <nvml.h>
#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
#define CURVE_ID 1
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "utils/vec_ops.cu"
using namespace vec_ops;
using namespace bn254;
using namespace curve_config;
typedef scalar_t T;
int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
{
vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
vec_ops::VecOpsConfig<scalar_t> config = vec_ops::DefaultVecOpsConfig<scalar_t>();
config.is_a_on_device = true;
config.is_b_on_device = true;
config.is_result_on_device = true;
cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
cudaError_t err = vec_ops::Mul<T>(vec_a, vec_b, n_elments, config, vec_result);
if (err != cudaSuccess) {
std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
return 0;
@@ -62,8 +63,8 @@ int main(int argc, char** argv)
T* host_in1 = (T*)malloc(vector_size * sizeof(T));
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with random data" << std::endl;
T::rand_host_many(host_in1, vector_size);
T::rand_host_many(host_in2, vector_size);
T::RandHostMany(host_in1, vector_size);
T::RandHostMany(host_in2, vector_size);
// device data
device_context::DeviceContext ctx = device_context::get_default_device_context();
T* device_in1;
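Stripped of the benchmark scaffolding, the new-style call sequence for an element-wise product is short. A minimal sketch for host-resident data, assuming the bn254 C API and config helper shown in this diff:

#include "api/bn254.h"
#include "vec_ops/vec_ops.cuh"
using namespace bn254;

// multiply two host-resident scalar vectors on the GPU; a sketch only,
// assuming the bn254_mul_cuda signature used above
cudaError_t mul_host_vectors(scalar_t* a, scalar_t* b, scalar_t* result, int n)
{
  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
  config.is_a_on_device = false;      // inputs are copied in from the host
  config.is_b_on_device = false;
  config.is_result_on_device = false; // result is copied back to the host
  return bn254_mul_cuda(a, b, n, config, result);
}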

View File

@@ -1,2 +1,2 @@
#!/bin/bash
./build/example/example
./build/example

View File

@@ -8,16 +8,19 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,9 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,11 +1,12 @@
#include <chrono>
#include <iostream>
// select the curve
#define CURVE_ID 1
// include NTT template
#include "curves/params/bn254.cuh"
#include "api/bn254.h"
using namespace bn254;
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
using namespace curve_config;
using namespace ntt;
// Operate on scalars
@@ -85,14 +86,14 @@ int main(int argc, char* argv[])
std::cout << "Running NTT with on-host data" << std::endl;
// Create a device context
auto ctx = device_context::get_default_device_context();
S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
bn254_initialize_domain(&basic_root, ctx, true);
const S basic_root = S::omega(log_ntt_size /*NTT_LOG_SIZE*/);
InitDomain(basic_root, ctx);
// Create an NTTConfig instance
NTTConfig<S> config = default_ntt_config<S>();
NTTConfig<S> config = DefaultNTTConfig<S>();
config.ntt_algorithm = NttAlgorithm::MixedRadix;
config.batch_size = nof_ntts;
START_TIMER(MixedRadix);
cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
cudaError_t err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(MixedRadix, "MixedRadix NTT");
std::cout << "Validating output" << std::endl;
@@ -100,7 +101,7 @@ int main(int argc, char* argv[])
config.ntt_algorithm = NttAlgorithm::Radix2;
START_TIMER(Radix2);
err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
err = NTT<S, E>(input, ntt_size, NTTDir::kForward, config, output);
END_TIMER(Radix2, "Radix2 NTT");
std::cout << "Validating output" << std::endl;

View File

@@ -1,2 +1,2 @@
#!/bin/bash
./build/example/example
./build/example

View File

@@ -1,26 +0,0 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,33 +0,0 @@
# ICICLE example: Pedersen Commitment
## Best-Practices
We recommend running our examples in [ZK-containers](../../ZK-containers.md) to save time and effort.
## Key-Takeaway
A Pedersen Commitment is a cryptographic primitive for committing to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment reveals no information about the value) and binding (once a value is committed, it cannot be changed without detection).
Pedersen commitment is based on Multi-Scalar Multiplication [MSM](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
`ICICLE` provides CUDA C++ support for [MSM](https://dev.ingonyama.com/icicle/primitives/msm).
An example of MSM is [here](../msm/README.md).
## Running the example
- `cd` to your example directory
- compile with `./compile.sh`
- run with `./run.sh`
## Concise Explanation
We recommend this simple [explanation](https://www.rareskills.io/post/pedersen-commitment).
The original paper: T. P. Pedersen, "Non-Interactive and Information-Theoretic Secure Verifiable Secret Sharing," in Advances in Cryptology — CRYPTO '91, Lecture Notes in Computer Science, vol. 576, Springer, Berlin, Heidelberg.
## What's in the example
1. Define the curve and the size of commitment vector
2. Use public random seed to transparently generate points on the elliptic curve without known discrete logarithm
3. Generate a (random) commitment vector and a salt (a.k.a. blinding factor)
4. Configure and execute MSM using on-host data
5. Output the commitment as an elliptic curve point
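In code, steps 3-5 reduce to one MSM over the N points plus the salted slot. A minimal sketch, assuming the bn254 C API used in the example below:

// commit to scalars[0..N) with blinding factor scalars[N]; a sketch only,
// assuming msm::default_msm_config and bn254_msm_cuda as used below
projective_t pedersen_commit(scalar_t* scalars, affine_t* points, int n_with_salt)
{
  projective_t commitment;
  msm::MSMConfig config = msm::default_msm_config();
  bn254_msm_cuda(scalars, points, n_with_salt, config, &commitment);
  return commitment;
}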

View File

@@ -1,15 +0,0 @@
#!/bin/bash
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,159 +0,0 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cassert>
#include <nvml.h>
#include "api/bn254.h"
#include "msm/msm.cuh"
using namespace bn254;
typedef point_field_t T;
// modular power
T modPow(T base, T exp) {
T r = T::one();
T b = base;
T e = exp;
while (e != T::zero()) {
// If exp is odd, multiply the base with result
if (T::is_odd(e)) {
r = r * b;
}
// Now exp must be even, divide it by 2
e =T::div2(e);
b = b * b;
}
return r;
}
// Check if y2 is a quadratic residue using Euler's Criterion
bool quadratic_residue(T y2) {
return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
}
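// Euler's criterion: for an odd prime p and a coprime to p, a is a
// quadratic residue mod p iff a^((p-1)/2) == 1 (mod p). In the field,
// T::zero() - T::one() is p-1, so div2 of it is exactly (p-1)/2 and the
// modPow call above is a direct transcription of the criterion.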
// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
bool mySQRT(T a, T *result) {
if (a == T::zero()) {
*result = T::zero();
return true;
}
if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
return false; // solution does not exist
}
// TODO: consider special cases
// if (p % 4 == 3) return power(a, (p + 1) / 4, p);
T s = T::zero() - T::one(); // p - 1,
T n = T::one() + T::one(); //2;
T r = T::zero();
T m;
while (T::is_even(s)) {
r = r + T::one();
s = T::div2(s); //s /= 2;
}
// find a non-square mod p
while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
n = n + T::one();
}
T x = modPow(a, T::div2(s + T::one()));
T b = modPow(a, s);
T g = modPow(n, s);
for (;; r = m) {
T t = b;
for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t;
if (m == T::zero() ) {
*result = x;
return true;
}
T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
g = gs * gs ;
x = x * gs ;
b = b * g ;
}
}
void point_near_x(T x, affine_t *point) {
const T wb = T { weierstrass_b };
T y2;
while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
{
x = x + T::one();
};
T y;
bool found = mySQRT(y2, &y);
assert(y*y == y2);
point->x = x;
point->y = y;
}
static int seed = 0;
static HOST_INLINE T rand_host_seed()
{
std::mt19937_64 generator(seed++);
std::uniform_int_distribution<unsigned> distribution;
T value;
for (unsigned i = 0; i < T::TLC-1 ; i++)
// TODO: use the full range of limbs: for (unsigned i = 0; i < T::TLC ; i++)
value.limbs_storage.limbs[i] = distribution(generator);
// while (lt(Field{get_modulus()}, value))
// value = value - Field{get_modulus()};
return value;
}
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char** argv)
{
const unsigned N = pow(2, 10);
std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
T* xs = new T[N+1];
std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
seed = 1234;
std::cout << "Using seed: " << seed << std::endl;
std::cout << "Generating random field values" << std::endl;
START_TIMER(gen);
for (unsigned i = 0; i < N + 1; i++) { // N values plus one for the salt point
xs[i] = rand_host_seed();
}
END_TIMER(gen, "Time to generate field values");
std::cout << "xs[0]: " << xs[0] << std::endl;
std::cout << "xs[1]: " << xs[1] << std::endl;
// affine_t points[N];
affine_t* points = new affine_t[N+1];
std::cout << "Generating point about random field values" << std::endl;
START_TIMER(points);
for (unsigned i = 0; i < N+1; i++) {
point_near_x(xs[i], &points[i]);
}
END_TIMER(points, "Time to generate points");
std::cout << "Generating commitment vector" << std::endl;
projective_t result;
scalar_t* scalars = new scalar_t[N+1];
scalar_t::rand_host_many(scalars, N);
std::cout << "Generating salt" << std::endl;
scalars[N] = scalar_t::rand_host();
std::cout << "Executing MSM" << std::endl;
auto config = msm::default_msm_config();
START_TIMER(msm);
bn254_msm_cuda(scalars, points, N+1, config, &result);
END_TIMER(msm, "Time to execute MSM");
std::cout << "Computed commitment: " << result << std::endl;
std::cout << "Cleaning up..." << std::endl;
delete[] xs;
delete[] scalars;
delete[] points;
return 0;
}

View File

@@ -1,2 +0,0 @@
#!/bin/bash
./build/example/example

View File

@@ -8,7 +8,7 @@ if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
@@ -20,8 +20,7 @@ add_executable(
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,9 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example

View File

@@ -1,14 +1,18 @@
#define CURVE_ID BLS12_381
#include <chrono>
#include <iostream>
#include <vector>
#include "curves/curve_config.cuh"
#include "appUtils/ntt/ntt.cu"
#include "appUtils/ntt/kernel_ntt.cu"
#include "utils/vec_ops.cu"
#include "utils/error_handler.cuh"
#include <memory>
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
using namespace bn254;
typedef scalar_t test_scalar;
typedef scalar_t test_data;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
void random_samples(test_data* res, uint32_t count)
{
@@ -41,7 +45,7 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context
// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
const bool is_radix2_alg = (argc > 1) ? atoi(argv[1]) : false;
ntt_config.ntt_algorithm = is_radix2_alg ? ntt::NttAlgorithm::Radix2 : ntt::NttAlgorithm::MixedRadix;
@@ -51,8 +55,8 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaEventCreate(&start));
CHK_IF_RETURN(cudaEventCreate(&stop));
test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
bn254_initialize_domain(&basic_root, ntt_config.ctx, true /*=fast_twiddles_mode*/);
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx, true /*=fast_twiddles_mode*/);
// (1) cpu allocation
auto CpuA = std::make_unique<test_data[]>(NTT_SIZE);
@@ -75,25 +79,27 @@ int main(int argc, char** argv)
ntt_config.are_inputs_on_device = false;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kNM;
CHK_IF_RETURN(bn254_ntt_cuda(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(bn254_ntt_cuda(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
CHK_IF_RETURN(ntt::NTT(CpuA.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuA));
CHK_IF_RETURN(ntt::NTT(CpuB.get(), NTT_SIZE, ntt::NTTDir::kForward, ntt_config, GpuB));
// (4) multiply A,B
CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
vec_ops::VecOpsConfig config{
vec_ops::VecOpsConfig<test_data> config {
ntt_config.ctx,
true, // is_a_on_device
true, // is_b_on_device
true, // is_result_on_device
false, // is_montgomery
false // is_async
};
CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));
CHK_IF_RETURN(
vec_ops::Mul(GpuA, GpuB, NTT_SIZE, config, MulGpu));
// (5) INTT (in place)
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.ordering = ntt::Ordering::kMN;
CHK_IF_RETURN(bn254_ntt_cuda(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
CHK_IF_RETURN(ntt::NTT(MulGpu, NTT_SIZE, ntt::NTTDir::kInverse, ntt_config, MulGpu));
CHK_IF_RETURN(cudaFreeAsync(GpuA, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(GpuB, ntt_config.ctx.stream));
@@ -112,7 +118,6 @@ int main(int argc, char** argv)
benchmark(false); // warmup
benchmark(true, 20);
bn254_release_domain(ntt_config.ctx);
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
return 0;

View File

@@ -1,3 +1,3 @@
#!/bin/bash
./build/example/example 1 # radix2
./build/example/example 0 # mixed-radix
./build/example 1 # radix2
./build/example 0 # mixed-radix

View File

@@ -13,11 +13,13 @@ project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -3,13 +3,7 @@
# Exit immediately on error
set -e
mkdir -p build/example
mkdir -p build/icicle
# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
cmake --build build/icicle
# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

View File

@@ -2,12 +2,14 @@
#include <fstream>
#include <iostream>
#include "api/bn254.h"
#include "curves/params/bn254.cuh"
// select the curve
#define CURVE_ID 2
// include Poseidon template
#include "appUtils/poseidon/poseidon.cu"
using namespace poseidon;
using namespace bn254;
using namespace curve_config;
device_context::DeviceContext ctx = device_context::get_default_device_context();
device_context::DeviceContext ctx= device_context::get_default_device_context();
// location of a tree node in the array for a given level and offset
inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level) - 1 + offset; }
@@ -19,7 +21,8 @@ void build_tree(
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
poseidon_hash<scalar_t, 2+1>(
&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, *constants, config);
}
}
@@ -82,7 +85,7 @@ uint32_t validate_proof(
hashes_in[1] = level_hash;
}
// next level hash
bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
poseidon_hash<scalar_t, 2+1>(hashes_in, hash_out, 1, *constants, config);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
@@ -113,14 +116,14 @@ int main(int argc, char* argv[])
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config(data_arity+1);
bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
init_optimized_poseidon_constants<scalar_t>(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config<scalar_t>(data_arity+1);
poseidon_hash<curve_config::scalar_t, data_arity+1>(data, &tree[tree_index(leaf_level, 0)], tree_width, constants, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config(tree_arity+1);
init_optimized_poseidon_constants<scalar_t>(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config<scalar_t>(tree_arity+1);
build_tree(tree_height, tree, &tree_constants, tree_config);
std::cout << "4. Generate membership proof" << std::endl;

View File

@@ -1,2 +1,2 @@
#!/bin/bash
./build/example/example
./build/example

View File

@@ -8,11 +8,12 @@ icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254", features = ["g2"] }
icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-377" }
ark-bn254 = { version = "0.4.0", optional = true }
ark-bls12-377 = { version = "0.4.0", optional = true }
ark-ec = { version = "0.4.0", optional = true }
ark-bn254 = { version = "0.4.0", optional = true}
ark-bls12-377 = { version = "0.4.0", optional = true}
ark-ec = { version = "0.4.0", optional = true}
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
profile = []
g2 = []

View File

@@ -4,10 +4,7 @@ use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg, G1Projective as BLS12377G1Projective, ScalarCfg as BLS12377ScalarCfg,
};
use icicle_cuda_runtime::{
memory::{DeviceVec, HostSlice},
stream::CudaStream,
};
use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
@@ -60,18 +57,18 @@ fn main() {
log_size, size
);
// Setting Bn254 points and scalars
let points = HostSlice::from_slice(&upper_points[..size]);
let g2_points = HostSlice::from_slice(&g2_upper_points[..size]);
let scalars = HostSlice::from_slice(&upper_scalars[..size]);
let points = HostOrDeviceSlice::Host(upper_points[..size].to_vec());
let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
// Setting bls12377 points and scalars
// let points_bls12377 = &upper_points_bls12377[..size];
let points_bls12377 = HostSlice::from_slice(&upper_points_bls12377[..size]); // &upper_points_bls12377[..size];
let scalars_bls12377 = HostSlice::from_slice(&upper_scalars_bls12377[..size]);
let points_bls12377 = HostOrDeviceSlice::Host(upper_points_bls12377[..size].to_vec()); // &upper_points_bls12377[..size];
let scalars_bls12377 = HostOrDeviceSlice::Host(upper_scalars_bls12377[..size].to_vec());
println!("Configuring bn254 MSM...");
let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
let mut g2_msm_results = DeviceVec::<G2Projective>::cuda_malloc(1).unwrap();
let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream = CudaStream::create().unwrap();
let g2_stream = CudaStream::create().unwrap();
let mut cfg = msm::MSMConfig::default();
@@ -85,7 +82,8 @@ fn main() {
g2_cfg.is_async = true;
println!("Configuring bls12377 MSM...");
let mut msm_results_bls12377 = DeviceVec::<BLS12377G1Projective>::cuda_malloc(1).unwrap();
let mut msm_results_bls12377: HostOrDeviceSlice<'_, BLS12377G1Projective> =
HostOrDeviceSlice::cuda_malloc(1).unwrap();
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = msm::MSMConfig::default();
cfg_bls12377
@@ -96,7 +94,7 @@ fn main() {
println!("Executing bn254 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(scalars, points, &cfg, &mut msm_results[..]).unwrap();
msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 MSM on size 2^{log_size} took: {} ms",
@@ -104,16 +102,16 @@ fn main() {
.elapsed()
.as_millis()
);
msm::msm(scalars, g2_points, &g2_cfg, &mut g2_msm_results[..]).unwrap();
msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
println!("Executing bls12377 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(
scalars_bls12377,
points_bls12377,
&scalars_bls12377,
&points_bls12377,
&cfg_bls12377,
&mut msm_results_bls12377[..],
&mut msm_results_bls12377,
)
.unwrap();
#[cfg(feature = "profile")]
@@ -136,10 +134,10 @@ fn main() {
.synchronize()
.unwrap();
msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
.copy_to_host(&mut msm_host_result[..])
.unwrap();
g2_msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut g2_msm_host_result[..]))
.copy_to_host(&mut g2_msm_host_result[..])
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
println!("G2 bn254 result: {:#?}", g2_msm_host_result);
@@ -148,7 +146,7 @@ fn main() {
.synchronize()
.unwrap();
msm_results_bls12377
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result_bls12377[..]))
.copy_to_host(&mut msm_host_result_bls12377[..])
.unwrap();
println!("bls12377 result: {:#?}", msm_host_result_bls12377);
@@ -156,19 +154,23 @@ fn main() {
{
println!("Checking against arkworks...");
let ark_points: Vec<Bn254G1Affine> = points
.as_slice()
.iter()
.map(|&point| point.to_ark())
.collect();
let ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377
.as_slice()
.iter()
.map(|point| point.to_ark())
.collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();

View File

@@ -2,14 +2,10 @@ use icicle_bn254::curve::{ScalarCfg, ScalarField};
use icicle_bls12_377::curve::{ScalarCfg as BLS12377ScalarCfg, ScalarField as BLS12377ScalarField};
use icicle_cuda_runtime::{
device_context::DeviceContext,
memory::{DeviceVec, HostSlice},
stream::CudaStream,
};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
use icicle_core::{
ntt::{self, initialize_domain},
ntt::{self, NTT},
traits::{FieldImpl, GenerateRandom},
};
@@ -45,13 +41,14 @@ fn main() {
);
// Setting Bn254 points and scalars
println!("Generating random inputs on host for bn254...");
let scalars = ScalarCfg::generate_random(size);
let mut ntt_results = DeviceVec::<ScalarField>::cuda_malloc(size).unwrap();
let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).unwrap();
// Setting bls12377 points and scalars
println!("Generating random inputs on host for bls12377...");
let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
let mut ntt_results_bls12377 = DeviceVec::<BLS12377ScalarField>::cuda_malloc(size).unwrap();
let scalars_bls12377 = HostOrDeviceSlice::Host(BLS12377ScalarCfg::generate_random(size));
let mut ntt_results_bls12377: HostOrDeviceSlice<'_, BLS12377ScalarField> =
HostOrDeviceSlice::cuda_malloc(size).unwrap();
println!("Setting up bn254 Domain...");
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
@@ -60,11 +57,11 @@ fn main() {
)
.unwrap();
let ctx = DeviceContext::default();
initialize_domain(ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bn254 NTT...");
let stream = CudaStream::create().unwrap();
let mut cfg = ntt::NTTConfig::<'_, ScalarField>::default();
let mut cfg = ntt::NTTConfig::default();
cfg.ctx
.stream = &stream;
cfg.is_async = true;
@@ -76,11 +73,11 @@ fn main() {
)
.unwrap();
// reusing ctx from above
initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx, true).unwrap();
BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bls12377 NTT...");
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = ntt::NTTConfig::<'_, BLS12377ScalarField>::default();
let mut cfg_bls12377 = ntt::NTTConfig::default();
cfg_bls12377
.ctx
.stream = &stream_bls12377;
@@ -89,13 +86,7 @@ fn main() {
println!("Executing bn254 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
HostSlice::from_slice(&scalars),
ntt::NTTDir::kForward,
&cfg,
&mut ntt_results[..],
)
.unwrap();
ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BN254 NTT on size 2^{log_size} took: {} μs",
@@ -108,10 +99,10 @@ fn main() {
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(
HostSlice::from_slice(&scalars_bls12377),
&scalars_bls12377,
ntt::NTTDir::kForward,
&cfg_bls12377,
&mut ntt_results_bls12377[..],
&mut ntt_results_bls12377,
)
.unwrap();
#[cfg(feature = "profile")]
@@ -128,7 +119,7 @@ fn main() {
.unwrap();
let mut host_bn254_results = vec![ScalarField::zero(); size];
ntt_results
.copy_to_host(HostSlice::from_mut_slice(&mut host_bn254_results[..]))
.copy_to_host(&mut host_bn254_results[..])
.unwrap();
stream_bls12377
@@ -136,17 +127,19 @@ fn main() {
.unwrap();
let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
ntt_results_bls12377
.copy_to_host(HostSlice::from_mut_slice(&mut host_bls12377_results[..]))
.copy_to_host(&mut host_bls12377_results[..])
.unwrap();
println!("Checking against arkworks...");
let mut ark_scalars: Vec<Bn254Fr> = scalars
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();
let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377
.as_slice()
.iter()
.map(|scalar| scalar.to_ark())
.collect();

View File

@@ -1,14 +0,0 @@
[package]
name = "polynomials"
version = "1.2.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { path = "../../../wrappers/rust/icicle-cuda-runtime" }
icicle-core = { path = "../../../wrappers/rust/icicle-core" }
icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254" }
icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" }
clap = { version = "<=4.4.12", features = ["derive"] }
[features]
profile = []

View File

@@ -1,101 +0,0 @@
use icicle_babybear::field::ScalarField as babybearScalar;
use icicle_babybear::polynomials::DensePolynomial as PolynomialBabyBear;
use icicle_bn254::curve::ScalarField as bn254Scalar;
use icicle_bn254::polynomials::DensePolynomial as PolynomialBn254;
use icicle_cuda_runtime::{
device_context::DeviceContext,
memory::{DeviceVec, HostSlice},
};
use icicle_core::{
ntt::{get_root_of_unity, initialize_domain},
polynomials::UnivariatePolynomial,
traits::{FieldImpl, GenerateRandom},
};
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Size of NTT to run (20 for 2^20)
#[arg(short, long, default_value_t = 20)]
max_ntt_log_size: u8,
#[arg(short, long, default_value_t = 15)]
poly_log_size: u8,
}
fn init(max_ntt_size: u64) {
// initialize the NTT domain for all fields. Polynomial ops rely on NTT.
let rou_bn254: bn254Scalar = get_root_of_unity(max_ntt_size);
let ctx = DeviceContext::default();
initialize_domain(rou_bn254, &ctx, false /*=fast twiddles mode*/).unwrap();
let rou_babybear: babybearScalar = get_root_of_unity(max_ntt_size);
initialize_domain(rou_babybear, &ctx, false /*=fast twiddles mode*/).unwrap();
// initialize the cuda backend for polynomials
// make sure to initialize it per field
PolynomialBn254::init_cuda_backend();
PolynomialBabyBear::init_cuda_backend();
}
fn randomize_poly<P>(size: usize, from_coeffs: bool) -> P
where
P: UnivariatePolynomial,
P::Field: FieldImpl,
P::FieldConfig: GenerateRandom<P::Field>,
{
let coeffs_or_evals = P::FieldConfig::generate_random(size);
let p = if from_coeffs {
P::from_coeffs(HostSlice::from_slice(&coeffs_or_evals), size)
} else {
P::from_rou_evals(HostSlice::from_slice(&coeffs_or_evals), size)
};
p
}
fn main() {
let args = Args::parse();
init(1 << args.max_ntt_log_size);
// randomize three polynomials f,g,h over bn254 scalar field
let poly_size = 1 << args.poly_log_size;
let f = randomize_poly::<PolynomialBn254>(poly_size, true /*from random coeffs*/);
let g = randomize_poly::<PolynomialBn254>(poly_size / 2, true /*from random coeffs*/);
let h = randomize_poly::<PolynomialBn254>(poly_size / 4, false /*from random evaluations on rou*/);
// randomize two polynomials over babybear field
let f_babybear = randomize_poly::<PolynomialBabyBear>(poly_size, true /*from random coeffs*/);
let g_babybear = randomize_poly::<PolynomialBabyBear>(poly_size / 2, true /*from random coeffs*/);
// Arithmetic
let t0 = &f + &g;
let t1 = &f * &h;
let (q, r) = t1.divide(&t0); // computes q,r for t1(x)=q(x)*t0(x)+r(x)
let _r_babybear = &f_babybear * &g_babybear;
// check degree
let _r_degree = r.degree();
// evaluate in single domain point
let five = bn254Scalar::from_u32(5);
let q_at_five = q.eval(&five);
// evaluate on a domain. Note: the domain and image can each be a Host or Device slice;
// in this example the domain is on the host and the evals are on the device.
let host_domain = [five, bn254Scalar::from_u32(30)];
let mut device_image = DeviceVec::<bn254Scalar>::cuda_malloc(host_domain.len()).unwrap();
t1.eval_on_domain(HostSlice::from_slice(&host_domain), &mut device_image[..]);
// slicing
let o = h.odd();
let e = h.even();
let fold = &e + &(&o * &q_at_five); // e(x) + o(x)*scalar
let _coeff = fold.get_coeff(2); // coeff of x^2
}

View File

@@ -4,7 +4,7 @@ use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostSlice;
use icicle_cuda_runtime::memory::HostOrDeviceSlice;
#[cfg(feature = "profile")]
use std::time::Instant;
@@ -25,29 +25,23 @@ fn main() {
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
println!(
"---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
arity
);
println!("---------------------- Loading optimized Poseidon constants for arity={} ------------------------", arity);
let ctx = DeviceContext::default();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
println!(
"---------------------- Input size 2^{}={} ------------------------",
size, test_size
);
let mut inputs = vec![F::one(); test_size * arity as usize];
let mut outputs = vec![F::zero(); test_size];
let input_slice = HostSlice::from_mut_slice(&mut inputs);
let output_slice = HostSlice::from_mut_slice(&mut outputs);
println!("---------------------- Input size 2^{}={} ------------------------", size, test_size);
let inputs = vec![F::one(); test_size * arity as usize];
let outputs = vec![F::zero(); test_size];
let mut input_slice = HostOrDeviceSlice::on_host(inputs);
let mut output_slice = HostOrDeviceSlice::on_host(outputs);
println!("Executing BLS12-381 Poseidon Hash on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
poseidon_hash_many::<F>(
input_slice,
output_slice,
&mut input_slice,
&mut output_slice,
test_size as u32,
arity as u32,
&constants,
@@ -55,10 +49,5 @@ fn main() {
)
.unwrap();
#[cfg(feature = "profile")]
println!(
"ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs",
start
.elapsed()
.as_micros()
);
}
println!("ICICLE BLS12-381 Poseidon Hash on size 2^{size} took: {} μs", start.elapsed().as_micros());
}

go.mod
View File

@@ -1,4 +1,4 @@
module github.com/ingonyama-zk/icicle/v2
module github.com/ingonyama-zk/icicle
go 1.20

View File

@@ -1,62 +1,152 @@
cmake_minimum_required(VERSION 3.18)
# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF")
message(WARNING "Note that PIC (position-independent code) is disabled.")
else()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
# add the target cuda architectures
# each additional architecture increases the compilation time and output file size
if(${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
find_program(_nvidia_smi "nvidia-smi")
if(_nvidia_smi)
set(DETECT_GPU_COUNT_NVIDIA_SMI 0)
# execute nvidia-smi -L to get a short list of GPUs available
exec_program(${_nvidia_smi} ARGS -L
OUTPUT_VARIABLE _nvidia_smi_out
RETURN_VALUE _nvidia_smi_ret)
# process the stdout of nvidia-smi
if(_nvidia_smi_ret EQUAL 0)
# convert string with newlines to list of strings
string(REGEX REPLACE "\n" ";" _nvidia_smi_out "${_nvidia_smi_out}")
foreach(_line ${_nvidia_smi_out})
if(_line MATCHES "^GPU [0-9]+:")
math(EXPR DETECT_GPU_COUNT_NVIDIA_SMI "${DETECT_GPU_COUNT_NVIDIA_SMI}+1")
# the UUID is not very useful for the user, remove it
string(REGEX REPLACE " \\(UUID:.*\\)" "" _gpu_info "${_line}")
if(NOT _gpu_info STREQUAL "")
list(APPEND DETECT_GPU_INFO "${_gpu_info}")
endif()
endif()
endforeach()
check_num_gpu_info(${DETECT_GPU_COUNT_NVIDIA_SMI} DETECT_GPU_INFO)
set(DETECT_GPU_COUNT ${DETECT_GPU_COUNT_NVIDIA_SMI})
endif()
endif()
# ##
if(DETECT_GPU_COUNT GREATER 0)
set(CMAKE_CUDA_ARCHITECTURES native) # do native
else()
# no GPUs found, like on GitHub CI runners
set(CMAKE_CUDA_ARCHITECTURES 50) # some safe value
endif()
endif()
project(icicle LANGUAGES CUDA CXX)
include(cmake/Common.cmake)
include(cmake/FieldsCommon.cmake)
include(cmake/CurvesCommon.cmake)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
include_directories("${CMAKE_SOURCE_DIR}")
set_env()
set_gpu_env()
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
# when adding a new curve/field, append its name to the end of this list
set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin)
set(SUPPORTED_CURVES_WITH_POSEIDON bn254;bls12_381;bls12_377;bw6_761;grumpkin)
set(SUPPORTED_CURVES_WITHOUT_NTT grumpkin)
option(DEVMODE "Enable development mode" OFF)
option(EXT_FIELD "Build extension field" OFF)
option(G2 "Build G2" OFF)
option(MSM "Build MSM" ON)
option(ECNTT "Build ECNTT" OFF)
option(BUILD_HASH "Build hash functions" OFF)
option(BUILD_TESTS "Build unit tests" OFF)
option(BUILD_BENCHMARKS "Build benchmarks" OFF)
# add options here
set(IS_CURVE_SUPPORTED FALSE)
set(I 0)
foreach (SUPPORTED_CURVE ${SUPPORTED_CURVES})
math(EXPR I "${I} + 1")
if (CURVE STREQUAL SUPPORTED_CURVE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DCURVE_ID=${I}")
set(IS_CURVE_SUPPORTED TRUE)
endif ()
endforeach()
if((DEFINED CURVE) AND (DEFINED FIELD))
message( FATAL_ERROR "CURVE and FIELD cannot be defined at the same time" )
if (NOT IS_CURVE_SUPPORTED)
message( FATAL_ERROR "The value of CURVE variable: ${CURVE} is not one of the supported curves: ${SUPPORTED_CURVES}" )
endif ()
if (DEVMODE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O0 --ptxas-options=-O0 --ptxas-options=-allow-expensive-optimizations=false -DDEVMODE=ON")
if (G2_DEFINED STREQUAL "ON")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2_DEFINED=ON")
endif ()
if(DEFINED FIELD)
check_field()
add_subdirectory(src/fields)
option(BUILD_TESTS "Build tests" OFF)
if (NOT BUILD_TESTS)
message(STATUS "Building without tests.")
if (CURVE IN_LIST SUPPORTED_CURVES_WITH_POSEIDON)
list(APPEND ICICLE_SOURCES appUtils/poseidon/poseidon.cu)
list(APPEND ICICLE_SOURCES appUtils/tree/merkle.cu)
endif()
if (NOT CURVE IN_LIST SUPPORTED_CURVES_WITHOUT_NTT)
list(APPEND ICICLE_SOURCES appUtils/ntt/ntt.cu)
list(APPEND ICICLE_SOURCES appUtils/ntt/kernel_ntt.cu)
endif()
add_library(
icicle
utils/vec_ops.cu
utils/mont.cu
primitives/field.cu
primitives/projective.cu
appUtils/msm/msm.cu
${ICICLE_SOURCES}
)
set_target_properties(icicle PROPERTIES OUTPUT_NAME "ingo_${CURVE}")
target_compile_definitions(icicle PRIVATE CURVE=${CURVE})
else()
message(STATUS "Building tests.")
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
enable_testing()
add_executable(
runner
tests/runner.cu
)
target_link_libraries(
runner
GTest::gtest_main
)
include(GoogleTest)
set_target_properties(runner PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
gtest_discover_tests(runner)
endif ()
if(DEFINED CURVE)
check_curve()
set(FIELD ${CURVE})
add_subdirectory(src/fields)
add_subdirectory(src/curves)
endif ()
if (G2)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DG2")
endif ()
if (EXT_FIELD)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DEXT_FIELD")
endif ()
if(BUILD_HASH)
add_subdirectory(src/hash)
endif ()
if (BUILD_TESTS)
add_subdirectory(tests)
endif()
if (BUILD_BENCHMARKS)
add_subdirectory(benchmarks)
endif()
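The nvidia-smi probe above runs at configure time only; the CUDA runtime can answer the same question at run time. A minimal standalone check using the standard runtime API:

#include <cstdio>
#include <cuda_runtime.h>

// count visible GPUs, mirroring the configure-time nvidia-smi -L probe
int main()
{
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess) count = 0;
  std::printf("detected %d CUDA device(s)%s\n", count,
              count == 0 ? " (would fall back to a safe arch such as 50)" : "");
  return 0;
}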

View File

@@ -0,0 +1,4 @@
test_msm:
mkdir -p work
nvcc -o work/test_msm -std=c++17 -I. -I../.. tests/msm_test.cu
work/test_msm

File diff suppressed because it is too large

View File

@@ -4,11 +4,12 @@
#include <cuda_runtime.h>
#include "curves/affine.cuh"
#include "curves/projective.cuh"
#include "fields/field.cuh"
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "../../curves/curve_config.cuh"
#include "../../primitives/affine.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/device_context.cuh"
#include "../../utils/error_handler.cuh"
/**
* @namespace msm
@@ -32,7 +33,7 @@ namespace msm {
/**
* @struct MSMConfig
* Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
* is to create it using the [default_msm_config](@ref default_msm_config) function and then you'll hopefully only need to
* is to create it using the [DefaultMSMConfig](@ref DefaultMSMConfig) function and then you'll hopefully only need to
* change a small number of default values for each of your MSMs.
*/
struct MSMConfig {
@@ -42,18 +43,14 @@ namespace msm {
* variable is set equal to the MSM size. And if every MSM uses a distinct set of
* points, it should be set to the product of MSM size and [batch_size](@ref
* batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
* [precompute_msm_bases](@ref precompute_msm_bases) function; the `precompute_factor` passed
* there needs to equal the one used here. Larger values decrease the
int precompute_factor; /**< The number of extra points to pre-compute for each point. Larger values decrease the
* number of computations to make and the on-line memory footprint, but increase the static
* memory footprint. Default value: 1 (i.e. don't pre-compute). */
int c; /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
* method" that we use to solve the MSM problem. As a rule of thumb, larger value
* means more on-line memory footprint but also more parallelism and less computational
* complexity (up to a certain point). Currently pre-computation is independent of
* \f$ c \f$; however, in the future the value of \f$ c \f$ here and the one passed into the
* [precompute_msm_bases](@ref precompute_msm_bases) function will need to be identical.
* Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
* complexity (up to a certain point). Default value: 0 (the optimal value of \f$ c \f$
* is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
* but if a different (better) upper bound is known, it should be reflected in this
* variable. Default value: 0 (set to the bitsize of scalar field). */
@@ -79,35 +76,14 @@ namespace msm {
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
* function will block the current CPU thread. */
bool segments_reduction;
};
/**
* A function that returns the default value of [MSMConfig](@ref MSMConfig) for the [MSM](@ref MSM) function.
* @return Default value of [MSMConfig](@ref MSMConfig).
*/
static MSMConfig
default_msm_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
MSMConfig config = {
ctx, // ctx
0, // points_size
1, // precompute_factor
0, // c
0, // bitsize
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
false, // are_points_on_device
false, // are_points_montgomery_form
false, // are_results_on_device
false, // is_big_triangle
false, // is_async
false, // segments_reduction
};
return config;
}
template <typename A>
MSMConfig DefaultMSMConfig();
/**
* A function that computes MSM: \f$ MSM(s_i, P_i) = \sum_{i=1}^N s_i \cdot P_i \f$.
@@ -125,39 +101,12 @@ namespace msm {
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw-projective.html) point in our codebase.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
* **Note:** this function is still WIP and the following [MSMConfig](@ref MSMConfig) members do not yet have any
* effect: `precompute_factor` (always equals 1) and `ctx.device_id` (0 device is always used).
* Also, it's currently better to use `batch_size=1` in most cases (except with dealing with very many MSMs).
*/
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);
/**
* A function that precomputes MSM bases by extending them with their shifted copies.
* e.g.:
* Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
* Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
* 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
* @param bases Bases \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
* @param bases_size Number of bases.
* @param precompute_factor The number of total precomputed points for each base (including the base itself).
* @param _c This is currently unused, but in the future precomputation will need to be aware of
* the `c` value used in MSM (see [MSMConfig](@ref MSMConfig)). So to avoid breaking your code with this
* upcoming change, make sure to use the same value of `c` in this function and in respective MSMConfig.
* @param are_bases_on_device Whether the bases are on device.
* @param ctx Device context specifying device id and stream to use.
* @param output_bases Device-allocated buffer of size bases_size * precompute_factor for the extended bases.
* @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
* Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*
*/
template <typename A, typename P>
cudaError_t precompute_msm_bases(
A* bases,
int bases_size,
int precompute_factor,
int _c,
bool are_bases_on_device,
device_context::DeviceContext& ctx,
A* output_bases);
cudaError_t MSM(S* scalars, A* points, int msm_size, MSMConfig& config, P* results);
} // namespace msm
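Putting the two declarations together, a typical call site is a few lines. A minimal sketch against the new-style API declared above (usage assumed; host-resident inputs with the defaults):

// a sketch only: run one MSM with the default configuration
template <typename S, typename A, typename P>
cudaError_t msm_with_defaults(S* scalars, A* points, int msm_size, P* result)
{
  msm::MSMConfig config = msm::DefaultMSMConfig<A>();
  return msm::MSM<S, A, P>(scalars, points, msm_size, config, result);
}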

View File

@@ -1,9 +1,4 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"
#define CURVE_ID 1
#include "msm.cu"
@@ -11,15 +6,11 @@
#include <iostream>
#include <vector>
#include "curves/params/bn254.cuh"
#include "fields/field.cuh"
// #include "fields/asm.cu"
// #include "fields/Chain.cu"
// #include "fields/MP.cu"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"
// using namespace bn254;
#include "../../curves/curve_config.cuh"
#include "../../primitives/field.cuh"
#include "../../primitives/projective.cuh"
#include "../../utils/cuda_utils.cuh"
#include "../../utils/device_context.cuh"
class Dummy_Scalar
{
@@ -40,7 +31,7 @@ public:
return os;
}
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width)
{
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
}
@@ -96,7 +87,7 @@ public:
{
Dummy_Projective res = zero();
#ifdef CUDA_ARCH
UNROLL
#pragma unroll
#endif
for (int i = 0; i < Dummy_Scalar::NBITS; i++) {
if (i > 0) { res = res + res; }
@@ -121,10 +112,6 @@ public:
// switch between dummy and real:
// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;
@@ -133,34 +120,24 @@ typedef curve_config::affine_t test_affine;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;
int main(int argc, char** argv)
int main()
{
cudaEvent_t start, stop;
float msm_time;
int msm_log_size = (argc > 1) ? atoi(argv[1]) : 18;
int msm_size = 1<<msm_log_size;
int batch_size = (argc > 2) ? atoi(argv[2]) : 1;
int batch_size = 1;
// unsigned msm_size = 1<<21;
int msm_size = 12180757;
int N = batch_size * msm_size;
int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1;
int user_c = (argc > 4) ? atoi(argv[4]) : 16;
printf("running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n",CURVE_ID,msm_log_size, batch_size, precomp_factor, user_c);
test_scalar* scalars = new test_scalar[N];
test_affine* points = new test_affine[N];
test_scalar::rand_host_many(scalars, N);
test_projective::rand_host_many_affine(points, N);
test_scalar::RandHostMany(scalars, N);
test_projective::RandHostManyAffine(points, N);
std::cout << "finished generating" << std::endl;
// projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
// test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
test_projective res[batch_size];
test_projective ref[batch_size];
test_projective large_res[batch_size];
// test_projective batched_large_res[batch_size];
// fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
// fake_point batched_large_res[256];
@@ -173,15 +150,11 @@ int main(int argc, char** argv)
test_scalar* scalars_d;
test_affine* points_d;
test_affine* precomp_points_d;
test_projective* res_d;
test_projective* ref_d;
test_projective* large_res_d;
cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
cudaMalloc(&precomp_points_d, sizeof(test_affine) * msm_size * precomp_factor);
cudaMalloc(&res_d, sizeof(test_projective));
cudaMalloc(&ref_d, sizeof(test_projective));
cudaMalloc(&large_res_d, sizeof(test_projective));
cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
@@ -200,88 +173,65 @@ int main(int argc, char** argv)
msm::MSMConfig config = {
ctx, // DeviceContext
0, // points_size
precomp_factor, // precompute_factor
user_c, // c
1, // precompute_factor
0, // c
0, // bitsize
100, // large_bucket_factor
batch_size, // batch_size
10, // large_bucket_factor
1, // batch_size
false, // are_scalars_on_device
false, // are_scalars_montgomery_form
true, // are_points_on_device
false, // are_points_on_device
false, // are_points_montgomery_form
true, // are_results_on_device
false, // is_big_triangle
true, // is_async
false, // segments_reduction
};
cudaEventCreate(&start);
cudaEventCreate(&stop);
if (precomp_factor > 1) msm::precompute_msm_bases<test_affine, test_projective>(points_d, msm_size, precomp_factor, user_c, false, ctx, precomp_points_d);
// warm up
msm::msm<test_scalar, test_affine, test_projective>(scalars, precomp_factor > 1? precomp_points_d : points_d, msm_size, config, res_d);
cudaDeviceSynchronize();
// auto begin1 = std::chrono::high_resolution_clock::now();
cudaEventRecord(start, stream);
msm::msm<test_scalar, test_affine, test_projective>(scalars, precomp_factor > 1? precomp_points_d : points_d, msm_size, config, res_d);
cudaEventRecord(stop, stream);
cudaStreamSynchronize(stream);
cudaEventElapsedTime(&msm_time, start, stop);
// cudaEvent_t msm_end_event;
// cudaEventCreate(&msm_end_event);
// auto end1 = std::chrono::high_resolution_clock::now();
// auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("msm time : %.3f ms.\n", msm_time);
//reference
config.c = 16;
config.precompute_factor = 1;
config.segments_reduction = false;
msm::msm<test_scalar, test_affine, test_projective>(scalars, points_d, msm_size, config, ref_d);
// config.is_big_triangle = true;
// config.are_results_on_device = false;
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
// auto begin = std::chrono::high_resolution_clock::now();
// msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
auto begin1 = std::chrono::high_resolution_clock::now();
msm::MSM<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
cudaEvent_t msm_end_event;
cudaEventCreate(&msm_end_event);
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
config.is_big_triangle = true;
config.are_results_on_device = false;
std::cout << test_projective::to_affine(large_res[0]) << std::endl;
auto begin = std::chrono::high_resolution_clock::now();
msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
// test_reduce_triangle(scalars);
// test_reduce_rectangle(scalars);
// test_reduce_single(scalars);
// test_reduce_var(scalars);
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
// printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// std::cout << test_projective::to_affine(large_res[0]) << std::endl;
std::cout << test_projective::to_affine(large_res[0]) << std::endl;
cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
// reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);
// std::cout<<"final results batched large"<<std::endl;
bool success = true;
for (unsigned i = 0; i < batch_size; i++)
{
std::cout<<test_projective::to_affine(res[i])<<std::endl;
if (test_projective::to_affine(res[i])==test_projective::to_affine(ref[i])){
std::cout<<"good"<<std::endl;
}
else{
std::cout<<"miss"<<std::endl;
std::cout<<test_projective::to_affine(ref[i])<<std::endl;
success = false;
}
}
if (success){
std::cout<<"success!"<<std::endl;
}
// bool success = true;
// for (unsigned i = 0; i < batch_size; i++)
// {
// std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
// if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
// std::cout<<"good"<<std::endl;
// }
// else{
// std::cout<<"miss"<<std::endl;
// std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
// success = false;
// }
// }
// if (success){
// std::cout<<"success!"<<std::endl;
// }
// std::cout<<batched_large_res[0]<<std::endl;
// std::cout<<batched_large_res[1]<<std::endl;
@@ -292,4 +242,4 @@ int main(int argc, char** argv)
// std::cout<<pr<<std::endl;
return 0;
}
}

View File

@@ -0,0 +1,6 @@
build_verification:
mkdir -p work
nvcc -o work/test_verification -I. -I.. -I../.. -I../ntt tests/verification.cu -std=c++17
test_verification: build_verification
work/test_verification

View File

@@ -1,12 +1,10 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "appUtils/ntt/thread_ntt.cu"
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "appUtils/ntt/ntt.cuh" // for Ordering
#include "thread_ntt.cu"
#include "gpu-utils/sharedmem.cuh"
#include "ntt/ntt.cuh" // for ntt::Ordering
namespace mxntt {
namespace ntt {
static inline __device__ uint32_t dig_rev(uint32_t num, uint32_t log_size, bool dit, bool fast_tw)
{
@@ -58,15 +56,7 @@ namespace mxntt {
// Note: the following reorder kernels are fused with normalization for INTT
template <typename E, typename S, uint32_t MAX_GROUP_SIZE = 80>
static __global__ void reorder_digits_inplace_and_normalize_kernel(
E* arr,
uint32_t log_size,
bool columns_batch,
uint32_t batch_size,
bool dit,
bool fast_tw,
eRevType rev_type,
bool is_normalize,
S inverse_N)
E* arr, uint32_t log_size, bool dit, bool fast_tw, eRevType rev_type, bool is_normalize, S inverse_N)
{
// launch N threads (per batch element)
// each thread starts from one index and calculates the corresponding group
@@ -75,20 +65,19 @@ namespace mxntt {
const uint32_t size = 1 << log_size;
const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
const uint32_t idx = columns_batch ? tid / batch_size : tid % size;
const uint32_t batch_idx = columns_batch ? tid % batch_size : tid / size;
if (tid >= size * batch_size) return;
const uint32_t idx = tid % size;
const uint32_t batch_idx = tid / size;
uint32_t next_element = idx;
uint32_t group[MAX_GROUP_SIZE];
group[0] = columns_batch ? next_element * batch_size + batch_idx : next_element + size * batch_idx;
group[0] = next_element + size * batch_idx;
uint32_t i = 1;
for (; i < MAX_GROUP_SIZE;) {
next_element = generalized_rev(next_element, log_size, dit, fast_tw, rev_type);
if (next_element < idx) return; // not handling this group
if (next_element == idx) break; // calculated whole group
group[i++] = columns_batch ? next_element * batch_size + batch_idx : next_element + size * batch_idx;
group[i++] = next_element + size * batch_idx;
}
--i;
@@ -102,12 +91,9 @@ namespace mxntt {
template <typename E, typename S>
__launch_bounds__(64) __global__ void reorder_digits_and_normalize_kernel(
const E* arr,
E* arr,
E* arr_reordered,
uint32_t log_size,
bool columns_batch,
uint32_t batch_size,
uint32_t columns_batch_size,
bool dit,
bool fast_tw,
eRevType rev_type,
@@ -115,55 +101,41 @@ namespace mxntt {
S inverse_N)
{
uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= (1 << log_size) * batch_size) return;
uint32_t rd = tid;
uint32_t wr = (columns_batch ? 0 : ((tid >> log_size) << log_size)) +
generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
arr_reordered[wr * columns_batch_size + (tid % columns_batch_size)] = is_normalize ? arr[rd] * inverse_N : arr[rd];
uint32_t wr =
((tid >> log_size) << log_size) + generalized_rev(tid & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
arr_reordered[wr] = is_normalize ? arr[rd] * inverse_N : arr[rd];
}
template <typename E, typename S>
static __global__ void batch_elementwise_mul_with_reorder_kernel(
const E* in_vec,
uint32_t size,
bool columns_batch,
uint32_t batch_size,
uint32_t columns_batch_size,
static __global__ void batch_elementwise_mul_with_reorder(
E* in_vec,
int n_elements,
int batch_size,
S* scalar_vec,
int step,
int n_scalars,
uint32_t log_size,
int logn,
eRevType rev_type,
bool fast_tw,
bool dit,
E* out_vec)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= size * batch_size) return;
int64_t scalar_id = (tid / columns_batch_size) % size;
if (rev_type != eRevType::None) {
// Note: when we multiply an in_vec that is mixed (by DIF (I)NTT), we want to shuffle the
// scalars the same way (then multiply element-wise). This would be a DIT-digit-reverse shuffle. (this is
// confusing but) BUT to avoid shuffling the scalars, we instead want to ask which element in the non-shuffled
// vec is now placed at index tid, which is the opposite of a DIT-digit-reverse --> this is the DIF-digit-reverse.
// Therefore we use the DIF-digit-reverse to know which element moved to index tid and use it to access the
// corresponding element in scalars vec.
const bool dif = rev_type == eRevType::NaturalToMixedRev;
scalar_id =
generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, !dif, fast_tw, rev_type);
}
if (tid >= n_elements * batch_size) return;
int64_t scalar_id = tid % n_elements;
if (rev_type != eRevType::None) scalar_id = generalized_rev(tid, logn, dit, false, rev_type);
out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
}
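
The comment above is easier to follow with plain bit-reversal (generalized digit-reversal behaves analogously). A minimal host-side sketch of the indexing trick, assuming a power-of-two size:

#include <cstdint>

// Reverse the lowest `logn` bits of `i`.
static uint32_t bit_rev(uint32_t i, uint32_t logn)
{
  uint32_t r = 0;
  for (uint32_t b = 0; b < logn; ++b)
    r |= ((i >> b) & 1u) << (logn - 1 - b);
  return r;
}

// Instead of shuffling the scalar vector to match an already-shuffled input,
// ask which scalar landed at position tid and read it directly:
//   out[tid] = scalars[bit_rev(tid, logn)] * in[tid];
// Plain bit-reversal is its own inverse, so the distinction is invisible here;
// mixed-radix digit-reversal is not, which is why the kernel must use the
// DIF digit-reverse (the inverse of the DIT shuffle applied to the data).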
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt64(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -181,27 +153,19 @@ namespace mxntt {
s_meta.th_stride = 8;
s_meta.ntt_block_size = 64;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 7) / 8)
: (blockIdx.x << 3) + (strided ? (threadIdx.x & 0x7) : (threadIdx.x >> 3));
s_meta.ntt_block_id = (blockIdx.x << 3) + (strided ? (threadIdx.x & 0x7) : (threadIdx.x >> 3));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 3) : (threadIdx.x & 0x7);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x7) + ((blockIdx.x % ((columns_batch_size + 7) / 8)) << 3) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride && dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -225,28 +189,24 @@ namespace mxntt {
if (twiddle_stride && !dit) {
if (fast_tw)
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles64(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric64(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -265,25 +225,16 @@ namespace mxntt {
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 15) / 16)
: (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0xf) + ((blockIdx.x % ((columns_batch_size + 15) / 16)) << 4) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles32(internal_twiddles, strided);
else
@@ -296,28 +247,24 @@ namespace mxntt {
engine.ntt4_2();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalData32ColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData32(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData32(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt32dit(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -336,27 +283,19 @@ namespace mxntt {
s_meta.th_stride = 4;
s_meta.ntt_block_size = 32;
s_meta.ntt_block_id = columns_batch_size ? blockIdx.x / ((columns_batch_size + 15) / 16)
: (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_block_id = (blockIdx.x << 4) + (strided ? (threadIdx.x & 0xf) : (threadIdx.x >> 2));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 4) : (threadIdx.x & 0x3);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0xf) + ((blockIdx.x % ((columns_batch_size + 15) / 16)) << 4) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalData32ColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData32(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData32(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles32(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric32(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -372,22 +311,18 @@ namespace mxntt {
engine.SharedData32Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -406,26 +341,16 @@ namespace mxntt {
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = columns_batch_size
? blockIdx.x / ((columns_batch_size + 31) / 32)
: (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x1f) + ((blockIdx.x % ((columns_batch_size + 31) / 32)) << 5) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalDataColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (fast_tw)
engine.loadInternalTwiddles16(internal_twiddles, strided);
else
@@ -438,28 +363,24 @@ namespace mxntt {
engine.ntt2_4();
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
engine.twiddlesExternal();
}
if (columns_batch_size)
engine.storeGlobalData16ColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData16(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData16(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__launch_bounds__(64) __global__ void ntt16dit(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
uint32_t log_size,
uint32_t tw_log_size,
uint32_t columns_batch_size,
uint32_t nof_ntt_blocks,
uint32_t data_stride,
uint32_t log_data_stride,
@@ -478,29 +399,19 @@ namespace mxntt {
s_meta.th_stride = 2;
s_meta.ntt_block_size = 16;
s_meta.ntt_block_id = columns_batch_size
? blockIdx.x / ((columns_batch_size + 31) / 32)
: (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_block_id = (blockIdx.x << 5) + (strided ? (threadIdx.x & 0x1f) : (threadIdx.x >> 1));
s_meta.ntt_inp_id = strided ? (threadIdx.x >> 5) : (threadIdx.x & 0x1);
s_meta.batch_id =
columns_batch_size ? (threadIdx.x & 0x1f) + ((blockIdx.x % ((columns_batch_size + 31) / 32)) << 5) : 0;
if (s_meta.ntt_block_id >= nof_ntt_blocks || (columns_batch_size > 0 && s_meta.batch_id >= columns_batch_size))
return;
if (s_meta.ntt_block_id >= nof_ntt_blocks) return;
if (fast_tw)
engine.loadBasicTwiddles(basic_twiddles);
else
engine.loadBasicTwiddlesGeneric(basic_twiddles, inv);
if (columns_batch_size)
engine.loadGlobalData16ColumnBatch(in, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.loadGlobalData16(in, data_stride, log_data_stride, strided, s_meta);
engine.loadGlobalData16(in, data_stride, log_data_stride, log_size, strided, s_meta);
if (twiddle_stride) {
if (fast_tw)
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, s_meta);
engine.loadExternalTwiddles16(external_twiddles, twiddle_stride, log_data_stride, strided, s_meta);
else
engine.loadExternalTwiddlesGeneric16(
external_twiddles, twiddle_stride, log_data_stride, s_meta, tw_log_size, inv);
@@ -516,17 +427,13 @@ namespace mxntt {
engine.SharedData16Rows8(shmem, false, false, strided); // load
engine.twiddlesInternal();
engine.ntt8win();
if (columns_batch_size)
engine.storeGlobalDataColumnBatch(out, data_stride, log_data_stride, s_meta, columns_batch_size);
else
engine.storeGlobalData(out, data_stride, log_data_stride, strided, s_meta);
engine.storeGlobalData(out, data_stride, log_data_stride, log_size, strided, s_meta);
}
template <typename E, typename S>
__global__ void normalize_kernel(E* data, S norm_factor, uint32_t size)
__global__ void normalize_kernel(E* data, S norm_factor)
{
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= size) return;
data[tid] = data[tid] * norm_factor;
}
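
For context: an inverse NTT of size N = 2^k returns the coefficients scaled by N, so each element must be multiplied by N^{-1}; S::inv_log_size(k) is assumed to return exactly that field element. A launch of the bounds-checked variant would look like:

// Sketch: normalize `size` elements after an INTT of log-size k.
const int threads = 256;
const int blocks = (size + threads - 1) / threads;
normalize_kernel<<<blocks, threads, 0, stream>>>(data, S::inv_log_size(k), size);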
@@ -751,7 +658,7 @@ namespace mxntt {
template <typename E, typename S>
cudaError_t large_ntt(
const E* in,
E* in,
E* out,
S* external_twiddles,
S* internal_twiddles,
@@ -759,7 +666,6 @@ namespace mxntt {
uint32_t log_size,
uint32_t tw_log_size,
uint32_t batch_size,
bool columns_batch,
bool inv,
bool normalize,
bool dit,
@@ -773,83 +679,72 @@ namespace mxntt {
}
if (log_size == 4) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 2 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 31) / 32) : (2 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
const int NOF_THREADS = min(64, 2 * batch_size);
const int NOF_BLOCKS = (2 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4));
return CHK_LAST();
}
if (log_size == 5) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 4 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 15) / 16) : (4 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
const int NOF_THREADS = min(64, 4 * batch_size);
const int NOF_BLOCKS = (4 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt32dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
} else { // dif
ntt32<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5));
return CHK_LAST();
}
if (log_size == 6) {
const int NOF_THREADS = columns_batch ? 64 : min(64, 8 * batch_size);
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 7) / 8) : ((8 * batch_size + NOF_THREADS - 1) / NOF_THREADS);
const int NOF_THREADS = min(64, 8 * batch_size);
const int NOF_BLOCKS = (8 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
ntt64<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
if (normalize)
normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6), (1 << log_size) * batch_size);
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size, batch_size, 1, 0, 0,
false, 0, inv, dit, fast_tw);
if (normalize) normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6));
return CHK_LAST();
}
if (log_size == 8) {
const int NOF_THREADS = 64;
const int NOF_BLOCKS =
columns_batch ? ((batch_size + 31) / 32 * 16) : ((32 * batch_size + NOF_THREADS - 1) / NOF_THREADS);
const int NOF_BLOCKS = (32 * batch_size + NOF_THREADS - 1) / NOF_THREADS;
if (dit) {
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1, 0, 0,
columns_batch, 0, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
ntt16dit<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 16, 4, 16, true, 1,
inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
} else { // dif
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 16, 4, 16, true, 1,
inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 16, 4, 16, true, 1, inv, dit, fast_tw);
ntt16<<<NOF_BLOCKS, NOF_THREADS, 8 * 64 * sizeof(E), cuda_stream>>>(
out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1, 0, 0,
columns_batch, 0, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1, 0, 0, false, 0, inv, dit, fast_tw);
}
if (normalize)
normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8), (1 << log_size) * batch_size);
if (normalize) normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8));
return CHK_LAST();
}
// general case:
uint32_t nof_blocks = (1 << (log_size - 9)) * (columns_batch ? ((batch_size + 31) / 32) * 32 : batch_size);
uint32_t nof_blocks = (1 << (log_size - 9)) * batch_size;
if (dit) {
for (int i = 0; i < 5; i++) {
uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
@@ -859,18 +754,18 @@ namespace mxntt {
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 6) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 5) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16dit<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
i ? out : in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
}
} else { // dif
bool first_run = false, prev_stage = false;
@@ -883,43 +778,40 @@ namespace mxntt {
if (stage_size == 6)
ntt64<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 6) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 6) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 5)
ntt32<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 5) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 5) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
else if (stage_size == 4)
ntt16<<<nof_blocks, 64, 8 * 64 * sizeof(E), cuda_stream>>>(
first_run ? in : out, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
columns_batch ? batch_size : 0, (1 << log_size - 4) * (columns_batch ? 1 : batch_size), 1 << stride_log,
stride_log, i ? (1 << stride_log) : 0, i || columns_batch, i, inv, dit, fast_tw);
(1 << log_size - 4) * batch_size, 1 << stride_log, stride_log, i ? (1 << stride_log) : 0, i, i, inv, dit,
fast_tw);
prev_stage = stage_size;
}
}
if (normalize)
normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(
out, S::inv_log_size(log_size), (1 << log_size) * batch_size);
normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(log_size));
return CHK_LAST();
}
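
In short, large_ntt factors a size-2^log_size transform into at most five passes of radix-2^4, 2^5, or 2^6 sub-NTTs, with data and twiddle strides growing by the radix of the passes already completed. An illustrative decomposition (the real tables are STAGE_SIZES_HOST / STAGE_SIZES_HOST_FT):

// Illustrative only: one way to split log_size = 19 into supported stages.
int stages_log19[] = {6, 5, 4, 4}; // 2^19 = 2^6 * 2^5 * 2^4 * 2^4
// Pass i launches (1 << (log_size - stage)) sub-NTTs of size (1 << stage);
// in DIT order the data stride starts at 1 and is scaled by each finished radix.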
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
S* linear_twiddle, // twiddles organized as [1,w,w^2,...] for coset-eval in fast-tw mode
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream)
@@ -937,39 +829,38 @@ namespace mxntt {
eRevType reverse_input = None, reverse_output = None, reverse_coset = None;
bool dit = false;
switch (ordering) {
case ntt::Ordering::kNN:
case Ordering::kNN:
reverse_input = eRevType::NaturalToMixedRev;
dit = true;
break;
case ntt::Ordering::kRN:
case Ordering::kRN:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_coset = is_inverse ? eRevType::None : eRevType::NaturalToRev;
break;
case ntt::Ordering::kNR:
case Ordering::kNR:
reverse_output = eRevType::MixedRevToRev;
reverse_coset = is_inverse ? eRevType::NaturalToRev : eRevType::None;
break;
case ntt::Ordering::kRR:
case Ordering::kRR:
reverse_input = eRevType::RevToMixedRev;
dit = true;
reverse_output = eRevType::NaturalToRev;
reverse_coset = eRevType::NaturalToRev;
break;
case ntt::Ordering::kMN:
case Ordering::kMN:
dit = true;
reverse_coset = is_inverse ? None : eRevType::NaturalToMixedRev;
break;
case ntt::Ordering::kNM:
case Ordering::kNM:
reverse_coset = is_inverse ? eRevType::NaturalToMixedRev : eRevType::None;
break;
}
if (is_on_coset && !is_inverse) {
batch_elementwise_mul_with_reorder_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, ntt_size, columns_batch, batch_size, columns_batch ? batch_size : 1,
arbitrary_coset ? arbitrary_coset : linear_twiddle, arbitrary_coset ? 1 : coset_gen_index, n_twiddles, logn,
reverse_coset, fast_tw, d_output);
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles,
arbitrary_coset ? 1 : coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
d_input = d_output;
}
@@ -978,11 +869,10 @@ namespace mxntt {
const bool is_reverse_in_place = (d_input == d_output);
if (is_reverse_in_place) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, columns_batch, batch_size, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
} else {
reorder_digits_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_input, d_output, logn, columns_batch, batch_size, columns_batch ? batch_size : 1, dit, fast_tw,
reverse_input, is_normalize, S::inv_log_size(logn));
d_input, d_output, logn, dit, fast_tw, reverse_input, is_normalize, S::inv_log_size(logn));
}
is_normalize = false;
d_input = d_output;
@@ -990,19 +880,18 @@ namespace mxntt {
// inplace ntt
CHK_IF_RETURN(large_ntt(
d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size,
columns_batch, is_inverse, (is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));
d_input, d_output, external_twiddles, internal_twiddles, basic_twiddles, logn, max_logn, batch_size, is_inverse,
(is_normalize && reverse_output == eRevType::None), dit, fast_tw, cuda_stream));
if (reverse_output != eRevType::None) {
reorder_digits_inplace_and_normalize_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, logn, columns_batch, batch_size, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
d_output, logn, dit, fast_tw, reverse_output, is_normalize, S::inv_log_size(logn));
}
if (is_on_coset && is_inverse) {
batch_elementwise_mul_with_reorder_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, ntt_size, columns_batch, batch_size, columns_batch ? batch_size : 1,
arbitrary_coset ? arbitrary_coset : linear_twiddle + n_twiddles, arbitrary_coset ? 1 : -coset_gen_index,
n_twiddles, logn, reverse_coset, fast_tw, d_output);
batch_elementwise_mul_with_reorder<<<NOF_BLOCKS, NOF_THREADS, 0, cuda_stream>>>(
d_output, ntt_size, batch_size, arbitrary_coset ? arbitrary_coset : external_twiddles + n_twiddles,
arbitrary_coset ? 1 : -coset_gen_index, n_twiddles, logn, reverse_coset, dit, d_output);
}
return CHK_LAST();
@@ -1010,78 +899,35 @@ namespace mxntt {
// Explicit instantiation for scalar type
template cudaError_t generate_external_twiddles_generic(
const scalar_t& basic_root,
scalar_t* external_twiddles,
scalar_t*& internal_twiddles,
scalar_t*& basic_twiddles,
const curve_config::scalar_t& basic_root,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t*& internal_twiddles,
curve_config::scalar_t*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template cudaError_t generate_external_twiddles_fast_twiddles_mode(
const scalar_t& basic_root,
scalar_t* external_twiddles,
scalar_t*& internal_twiddles,
scalar_t*& basic_twiddles,
const curve_config::scalar_t& basic_root,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t*& internal_twiddles,
curve_config::scalar_t*& basic_twiddles,
uint32_t log_size,
cudaStream_t& stream);
template cudaError_t mixed_radix_ntt<scalar_t, scalar_t>(
const scalar_t* d_input,
scalar_t* d_output,
scalar_t* external_twiddles,
scalar_t* internal_twiddles,
scalar_t* basic_twiddles,
scalar_t* linear_twiddles,
template cudaError_t mixed_radix_ntt<curve_config::scalar_t, curve_config::scalar_t>(
curve_config::scalar_t* d_input,
curve_config::scalar_t* d_output,
curve_config::scalar_t* external_twiddles,
curve_config::scalar_t* internal_twiddles,
curve_config::scalar_t* basic_twiddles,
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
scalar_t* arbitrary_coset,
Ordering ordering,
curve_config::scalar_t* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
#if defined(EXT_FIELD)
template cudaError_t mixed_radix_ntt<extension_t, scalar_t>(
const extension_t* d_input,
extension_t* d_output,
scalar_t* external_twiddles,
scalar_t* internal_twiddles,
scalar_t* basic_twiddles,
scalar_t* linear_twiddles,
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
scalar_t* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
#endif
// TODO: we may reintroduce mixed-radix ECNTT based on upcoming benching PR
// #if defined(ECNTT)
// template cudaError_t mixed_radix_ntt<projective_t, scalar_t>(
// projective_t* d_input,
// projective_t* d_output,
// scalar_t* external_twiddles,
// scalar_t* internal_twiddles,
// scalar_t* basic_twiddles,
// int ntt_size,
// int max_logn,
// int batch_size,
// bool columns_batch,
// bool is_inverse,
// bool fast_tw,
// ntt::Ordering ordering,
// scalar_t* arbitrary_coset,
// int coset_gen_index,
// cudaStream_t cuda_stream);
// #endif // ECNTT
} // namespace mxntt
} // namespace ntt
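
Reading mixed_radix_ntt top to bottom, the forward pipeline assembled from the kernels in this file is:

// 1. coset && !inverse : batch elementwise mul with reorder (input -> output)
// 2. reverse_input set : reorder_digits_(inplace_)and_normalize_kernel
// 3. large_ntt(...)    : in-place radix-16/32/64 passes
// 4. reverse_output set: reorder_digits_inplace_and_normalize_kernel
// 5. coset && inverse  : batch elementwise mul with reorder (output in place)
// INTT normalization is fused into whichever of these steps performs it first.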

View File

@@ -1,42 +1,27 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "ntt/ntt.cuh"
#include "ntt.cuh"
#include <unordered_map>
#include <vector>
#include <type_traits>
#include "gpu-utils/sharedmem.cuh"
#include "curves/curve_config.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
#include "ntt/ntt_impl.cuh"
#include "appUtils/ntt/ntt_impl.cuh"
#include <mutex>
#ifdef CURVE_ID
#include "curves/curve_config.cuh"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif
namespace ntt {
namespace {
// TODO: Set MAX THREADS based on GPU arch
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512;
const uint32_t MAX_THREADS_BATCH_ECNTT =
128; // TODO: hardcoded - allows (2^18 x 64) ECNTT for sm86, decrease this to allow larger ecntt length, batch
// size limited by on-device memory
const uint32_t MAX_NUM_THREADS = 512; // TODO: hotfix - should be 1024, currently limits shared memory size
const uint32_t MAX_THREADS_BATCH = 512; // TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89
const uint32_t MAX_SHARED_MEM_ELEMENT_SIZE = 32; // TODO: occupancy calculator, hardcoded for sm_86..sm_89
const uint32_t MAX_SHARED_MEM = MAX_SHARED_MEM_ELEMENT_SIZE * MAX_NUM_THREADS;
template <typename E>
__global__ void reverse_order_kernel(const E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
__global__ void reverse_order_kernel(E* arr, E* arr_reversed, uint32_t n, uint32_t logn, uint32_t batch_size)
{
int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
if (threadId < n * batch_size) {
@@ -44,14 +29,9 @@ namespace ntt {
int batch_idx = threadId / n;
int idx_reversed = __brev(idx) >> (32 - logn);
if (arr == arr_reversed) { // for in-place (when pointers arr==arr_reversed)
if (idx < idx_reversed) {
E val = arr[batch_idx * n + idx];
arr_reversed[batch_idx * n + idx] = arr[batch_idx * n + idx_reversed];
arr_reversed[batch_idx * n + idx_reversed] = val;
}
} else
arr_reversed[batch_idx * n + idx_reversed] = arr[batch_idx * n + idx];
E val = arr[batch_idx * n + idx];
if (arr == arr_reversed) { __syncthreads(); } // for in-place (when pointers arr==arr_reversed)
arr_reversed[batch_idx * n + idx_reversed] = val;
}
}
@@ -66,8 +46,7 @@ namespace ntt {
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order_batch(
const E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
void reverse_order_batch(E* arr_in, uint32_t n, uint32_t logn, uint32_t batch_size, cudaStream_t stream, E* arr_out)
{
int number_of_threads = MAX_THREADS_BATCH;
int number_of_blocks = (n * batch_size + number_of_threads - 1) / number_of_threads;
@@ -84,7 +63,7 @@ namespace ntt {
* @param arr_out buffer of the same size as `arr_in` on the GPU to write the bit-permuted array into.
*/
template <typename E>
void reverse_order(const E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
void reverse_order(E* arr_in, uint32_t n, uint32_t logn, cudaStream_t stream, E* arr_out)
{
reverse_order_batch(arr_in, n, logn, 1, stream, arr_out);
}
@@ -102,7 +81,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared_rev(
const E* __restrict__ arr_in,
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
@@ -123,7 +102,7 @@ namespace ntt {
uint32_t l = threadIdx.x;
if (l < loop_limit) {
UNROLL
#pragma unroll
for (; ss < logn; ss++) {
int s = logn - ss - 1;
bool is_beginning = ss == 0;
@@ -174,7 +153,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void ntt_template_kernel_shared(
const E* __restrict__ arr_in,
E* __restrict__ arr_in,
int n,
const S* __restrict__ r_twiddles,
int n_twiddles,
@@ -195,7 +174,7 @@ namespace ntt {
uint32_t l = threadIdx.x;
if (l < loop_limit) {
UNROLL
#pragma unroll
for (; s < logn; s++) // TODO: this loop also can be unrolled
{
uint32_t ntw_i = task % chunks;
@@ -242,7 +221,7 @@ namespace ntt {
*/
template <typename E, typename S>
__global__ void
ntt_template_kernel(const E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
ntt_template_kernel(E* arr_in, int n, S* twiddles, int n_twiddles, int max_task, int s, bool rev, E* arr_out)
{
int task = blockIdx.x;
int chunks = n / (blockDim.x * 2);
@@ -294,7 +273,7 @@ namespace ntt {
*/
template <typename E, typename S>
cudaError_t ntt_inplace_batch_template(
const E* d_input,
E* d_input,
int n,
S* d_twiddles,
int n_twiddles,
@@ -311,8 +290,7 @@ namespace ntt {
bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE;
const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn;
int max_threads_batch = IS_ECNTT ? MAX_THREADS_BATCH_ECNTT : MAX_THREADS_BATCH;
int num_threads = max(min(min(n / 2, max_threads_batch), 1 << (log2_shmem_elems - 1)), 1);
int num_threads = max(min(min(n / 2, MAX_THREADS_BATCH), 1 << (log2_shmem_elems - 1)), 1);
const int chunks = max(int((n / 2) / num_threads), 1);
const int total_tasks = batch_size * chunks;
int num_blocks = total_tasks;
@@ -378,7 +356,7 @@ namespace ntt {
/**
* @struct Domain
* Struct containing information about the domain on which (i)NTT is evaluated, i.e. the twiddle factors.
* Twiddle factors are private, static and can only be set using [init_domain](@ref init_domain) function.
* Twiddle factors are private, static and can only be set using [InitDomain](@ref InitDomain) function.
* The internal representation of twiddles is prone to change in accordance with changing [NTT](@ref NTT) algorithm.
* @tparam S The type of twiddle factors \f$ \{ \omega^i \} \f$. Must be a field.
*/
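
For orientation, the intended lifecycle around this struct (a sketch; init_domain/InitDomain and ntt/NTT are the pre- and post-refactor spellings visible in this diff):

// Sketch: initialize the per-device twiddle domain once, then run NTTs.
device_context::DeviceContext ctx = device_context::get_default_device_context();
scalar_t root = scalar_t::omega(MAX_LOG_SIZE); // MAX_LOG_SIZE: hypothetical, the largest log-size planned
CHK_IF_RETURN(InitDomain(root, ctx, /*fast_twiddles_mode=*/false));
// ... any number of NTT(...) calls on this device ...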
@@ -387,7 +365,7 @@ namespace ntt {
{
// Mutex for protecting access to the domain/device container array
static inline std::mutex device_domain_mutex;
// The domain-per-device container - assumption is init_domain is called once per device per program.
// The domain-per-device container - assumption is InitDomain is called once per device per program.
int max_size = 0;
int max_log_size = 0;
@@ -408,26 +386,19 @@ namespace ntt {
public:
template <typename U>
friend cudaError_t init_domain<U>(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
friend cudaError_t InitDomain<U>(U primitive_root, device_context::DeviceContext& ctx, bool fast_tw);
template <typename U>
friend cudaError_t release_domain(device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity<U>(uint64_t logn, device_context::DeviceContext& ctx);
template <typename U>
friend U get_root_of_unity_from_domain<U>(uint64_t logn, device_context::DeviceContext& ctx);
cudaError_t ReleaseDomain(device_context::DeviceContext& ctx);
template <typename U, typename E>
friend cudaError_t ntt<U, E>(const E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
friend cudaError_t NTT<U, E>(E* input, int size, NTTDir dir, NTTConfig<U>& config, E* output);
};
template <typename S>
static inline Domain<S> domains_for_devices[device_context::MAX_DEVICES] = {};
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
CHK_INIT_IF_RETURN();
@@ -465,7 +436,7 @@ namespace ntt {
// Note: radix-2 INTT needs ONE in last element (in addition to first element), therefore have n+1 elements
// Managed allocation allows host to read the elements (logn) without copying all (n) TFs back to host
CHK_IF_RETURN(cudaMallocManaged(&domain.twiddles, (domain.max_size + 1) * sizeof(S)));
CHK_IF_RETURN(mxntt::generate_external_twiddles_generic(
CHK_IF_RETURN(generate_external_twiddles_generic(
primitive_root, domain.twiddles, domain.internal_twiddles, domain.basic_twiddles, domain.max_log_size,
ctx.stream));
@@ -475,7 +446,7 @@ namespace ntt {
CHK_IF_RETURN(cudaMallocAsync(&domain.fast_external_twiddles_inv, domain.max_size * sizeof(S) * 2, ctx.stream));
// fast-twiddles forward NTT
CHK_IF_RETURN(mxntt::generate_external_twiddles_fast_twiddles_mode(
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root, domain.fast_external_twiddles, domain.fast_internal_twiddles, domain.fast_basic_twiddles,
domain.max_log_size, ctx.stream));
@@ -483,7 +454,7 @@ namespace ntt {
S primitive_root_inv;
CHK_IF_RETURN(cudaMemcpyAsync(
&primitive_root_inv, &domain.twiddles[domain.max_size - 1], sizeof(S), cudaMemcpyDeviceToHost, ctx.stream));
CHK_IF_RETURN(mxntt::generate_external_twiddles_fast_twiddles_mode(
CHK_IF_RETURN(generate_external_twiddles_fast_twiddles_mode(
primitive_root_inv, domain.fast_external_twiddles_inv, domain.fast_internal_twiddles_inv,
domain.fast_basic_twiddles_inv, domain.max_log_size, ctx.stream));
}
@@ -511,77 +482,46 @@ namespace ntt {
}
template <typename S>
cudaError_t release_domain(device_context::DeviceContext& ctx)
cudaError_t Domain<S>::ReleaseDomain(device_context::DeviceContext& ctx)
{
CHK_INIT_IF_RETURN();
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
max_size = 0;
max_log_size = 0;
cudaFreeAsync(twiddles, ctx.stream);
twiddles = nullptr;
cudaFreeAsync(internal_twiddles, ctx.stream);
internal_twiddles = nullptr;
cudaFreeAsync(basic_twiddles, ctx.stream);
basic_twiddles = nullptr;
coset_index.clear();
domain.max_size = 0;
domain.max_log_size = 0;
domain.twiddles = nullptr; // allocated via cudaMallocManaged(...) so released without calling cudaFree(...)
CHK_IF_RETURN(cudaFreeAsync(domain.internal_twiddles, ctx.stream));
domain.internal_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.basic_twiddles, ctx.stream));
domain.basic_twiddles = nullptr;
domain.coset_index.clear();
CHK_IF_RETURN(cudaFreeAsync(domain.fast_external_twiddles, ctx.stream));
domain.fast_external_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_internal_twiddles, ctx.stream));
domain.fast_internal_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_basic_twiddles, ctx.stream));
domain.fast_basic_twiddles = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_external_twiddles_inv, ctx.stream));
domain.fast_external_twiddles_inv = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_internal_twiddles_inv, ctx.stream));
domain.fast_internal_twiddles_inv = nullptr;
CHK_IF_RETURN(cudaFreeAsync(domain.fast_basic_twiddles_inv, ctx.stream));
domain.fast_basic_twiddles_inv = nullptr;
domain.initialized = false;
cudaFreeAsync(fast_external_twiddles, ctx.stream);
fast_external_twiddles = nullptr;
cudaFreeAsync(fast_internal_twiddles, ctx.stream);
fast_internal_twiddles = nullptr;
cudaFreeAsync(fast_basic_twiddles, ctx.stream);
fast_basic_twiddles = nullptr;
cudaFreeAsync(fast_external_twiddles_inv, ctx.stream);
fast_external_twiddles_inv = nullptr;
cudaFreeAsync(fast_internal_twiddles_inv, ctx.stream);
fast_internal_twiddles_inv = nullptr;
cudaFreeAsync(fast_basic_twiddles_inv, ctx.stream);
fast_basic_twiddles_inv = nullptr;
return CHK_LAST();
}
template <typename S>
S get_root_of_unity(uint64_t max_size)
{
// round log2(max_size) up to the nearest integer
const auto log_max_size = static_cast<uint32_t>(std::ceil(std::log2(max_size)));
return S::omega(log_max_size);
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity(uint64_t logn);
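
A quick worked example of the rounding: for max_size = 1000, ceil(log2(1000)) = 10, so the returned root has order 2^10 = 1024, the smallest power of two covering the requested size.

// Equivalent to scalar_t::omega(10) for any max_size in (512, 1024].
auto w = get_root_of_unity<scalar_t>(1000);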
template <typename S>
S get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx)
{
Domain<S>& domain = domains_for_devices<S>[ctx.device_id];
if (logn > domain.max_log_size) {
std::ostringstream oss;
oss << "NTT log_size=" << logn
<< " is too large for the domain. Consider generating your domain with a higher order root of unity.\n";
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, oss.str().c_str());
}
const size_t twiddles_idx = 1ULL << (domain.max_log_size - logn);
return domain.twiddles[twiddles_idx];
}
// explicit instantiation to avoid having to include this file
template scalar_t get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
template <typename S>
static bool is_choosing_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
static bool is_choose_radix2_algorithm(int logn, int batch_size, const NTTConfig<S>& config)
{
const bool is_mixed_radix_alg_supported = (logn > 3 && logn != 7);
if (!is_mixed_radix_alg_supported && config.columns_batch)
throw IcicleError(IcicleError_t::InvalidArgument, "columns batch is not supported for given NTT size");
const bool is_user_selected_radix2_alg = config.ntt_algorithm == NttAlgorithm::Radix2;
const bool is_force_radix2 = !is_mixed_radix_alg_supported || is_user_selected_radix2_alg;
if (is_force_radix2) return true;
const bool is_user_selected_mixed_radix_alg = config.ntt_algorithm == NttAlgorithm::MixedRadix;
if (is_user_selected_mixed_radix_alg) return false;
if (config.columns_batch) return false; // radix2 does not currently support columns batch mode.
// Heuristic to automatically select an algorithm
// Note that generally the decision depends on {logn, batch, ordering, inverse, coset, in-place, coeff-field} and
@@ -597,7 +537,7 @@ namespace ntt {
template <typename S, typename E>
cudaError_t radix2_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* twiddles,
int ntt_size,
@@ -629,6 +569,7 @@ namespace ntt {
break;
case Ordering::kRN:
case Ordering::kMN:
dit = true;
reverse_input = false;
}
@@ -642,7 +583,7 @@ namespace ntt {
}
template <typename S, typename E>
cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output)
{
CHK_INIT_IF_RETURN();
@@ -669,22 +610,18 @@ namespace ntt {
bool are_inputs_on_device = config.are_inputs_on_device;
bool are_outputs_on_device = config.are_outputs_on_device;
const E* d_input;
E* d_allocated_input = nullptr;
E* d_input;
if (are_inputs_on_device) {
d_input = input;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_input, input_size_bytes, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_allocated_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream));
d_input = d_allocated_input;
CHK_IF_RETURN(cudaMallocAsync(&d_input, input_size_bytes, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_input, input, input_size_bytes, cudaMemcpyHostToDevice, stream));
}
E* d_output;
E* d_allocated_output = nullptr;
if (are_outputs_on_device) {
d_output = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&d_allocated_output, input_size_bytes, stream));
d_output = d_allocated_output;
CHK_IF_RETURN(cudaMallocAsync(&d_output, input_size_bytes, stream));
}
S* coset = nullptr;
@@ -704,56 +641,50 @@ namespace ntt {
h_coset.clear();
}
const bool is_radix2_algorithm = is_choose_radix2_algorithm(logn, batch_size, config);
const bool is_inverse = dir == NTTDir::kInverse;
if constexpr (IS_ECNTT) {
if (is_radix2_algorithm) {
CHK_IF_RETURN(ntt::radix2_ntt(
d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
coset_index, stream));
} else {
const bool is_radix2_algorithm = is_choosing_radix2_algorithm(logn, batch_size, config);
if (is_radix2_algorithm) {
CHK_IF_RETURN(ntt::radix2_ntt(
d_input, d_output, domain.twiddles, size, domain.max_size, batch_size, is_inverse, config.ordering, coset,
coset_index, stream));
} else {
const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr);
S* twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
: domain.twiddles;
S* internal_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
: domain.internal_twiddles;
S* basic_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
: domain.basic_twiddles;
S* linear_twiddles = domain.twiddles; // twiddles organized as [1,w,w^2,...]
CHK_IF_RETURN(mxntt::mixed_radix_ntt(
d_input, d_output, twiddles, internal_twiddles, basic_twiddles, linear_twiddles, size, domain.max_log_size,
batch_size, config.columns_batch, is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index,
stream));
}
const bool is_on_coset = (coset_index != 0) || coset;
const bool is_fast_twiddles_enabled = (domain.fast_external_twiddles != nullptr) && !is_on_coset;
S* twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_external_twiddles_inv : domain.fast_external_twiddles)
: domain.twiddles;
S* internal_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_internal_twiddles_inv : domain.fast_internal_twiddles)
: domain.internal_twiddles;
S* basic_twiddles = is_fast_twiddles_enabled
? (is_inverse ? domain.fast_basic_twiddles_inv : domain.fast_basic_twiddles)
: domain.basic_twiddles;
CHK_IF_RETURN(ntt::mixed_radix_ntt(
d_input, d_output, twiddles, internal_twiddles, basic_twiddles, size, domain.max_log_size, batch_size,
is_inverse, is_fast_twiddles_enabled, config.ordering, coset, coset_index, stream));
}
if (!are_outputs_on_device)
CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, input_size_bytes, cudaMemcpyDeviceToHost, stream));
if (coset) CHK_IF_RETURN(cudaFreeAsync(coset, stream));
if (d_allocated_input) CHK_IF_RETURN(cudaFreeAsync(d_allocated_input, stream));
if (d_allocated_output) CHK_IF_RETURN(cudaFreeAsync(d_allocated_output, stream));
if (!are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_input, stream));
if (!are_outputs_on_device) CHK_IF_RETURN(cudaFreeAsync(d_output, stream));
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
template <typename S>
NTTConfig<S> default_ntt_config(const device_context::DeviceContext& ctx)
NTTConfig<S> DefaultNTTConfig()
{
device_context::DeviceContext ctx = device_context::get_default_device_context();
NTTConfig<S> config = {
ctx, // ctx
S::one(), // coset_gen
1, // batch_size
false, // columns_batch
Ordering::kNN, // ordering
false, // are_inputs_on_device
false, // are_outputs_on_device
@@ -762,6 +693,53 @@ namespace ntt {
};
return config;
}
// explicit instantiation to avoid having to include this file
template NTTConfig<scalar_t> default_ntt_config(const device_context::DeviceContext& ctx);
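
Typical usage mirrors the benchmark at the end of this diff (a sketch; default_ntt_config/DefaultNTTConfig and ntt/NTT are the two spellings in flight here):

// Sketch: take the defaults, override what the workload needs, run.
auto config = default_ntt_config<scalar_t>();
config.ordering = Ordering::kNN;
config.batch_size = 16;
config.are_inputs_on_device = false; // host input is copied to the device internally
CHK_IF_RETURN(ntt(input, size, NTTDir::kForward, config, output));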
/**
* Extern "C" version of [InitDomain](@ref InitDomain) function with the following
* value of template parameter (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [scalar field](@ref scalar_t) of the curve;
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitializeDomain)(
curve_config::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode)
{
return InitDomain(*primitive_root, ctx, fast_twiddles_mode);
}
/**
* Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` and `E` are both the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, NTTCuda)(
curve_config::scalar_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,
curve_config::scalar_t* output)
{
return NTT<curve_config::scalar_t, curve_config::scalar_t>(input, size, dir, config, output);
}
#if defined(ECNTT_DEFINED)
/**
* Extern "C" version of [NTT](@ref NTT) function with the following values of template parameters
* (where the curve is given by `-DCURVE` env variable during build):
* - `S` is the [projective representation](@ref projective_t) of the curve (i.e. EC NTT is computed);
* - `E` is the [scalar field](@ref scalar_t) of the curve;
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(CURVE, ECNTTCuda)(
curve_config::projective_t* input,
int size,
NTTDir dir,
NTTConfig<curve_config::scalar_t>& config,
curve_config::projective_t* output)
{
return NTT<curve_config::scalar_t, curve_config::projective_t>(input, size, dir, config, output);
}
#endif
} // namespace ntt

View File

@@ -4,9 +4,10 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/sharedmem.cuh"
#include "curves/curve_config.cuh"
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
#include "utils/sharedmem.cuh"
#include "utils/utils_kernels.cuh"
#include "utils/utils.h"
@@ -37,40 +38,7 @@ namespace ntt {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename S>
cudaError_t init_domain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);
/**
* Releases and deallocates resources associated with the domain initialized for performing NTTs.
* This function should be called to clean up resources once they are no longer needed.
* It's important to note that after calling this function, any operation that relies on the released domain will
* fail unless init_domain is called again to reinitialize the resources. Therefore, ensure that release_domain is
* only called when the operations requiring the NTT domain are completely finished and the domain is no longer
* needed.
* Note that only the domain associated with the given device is released.
* @param ctx Details related to the device context such as its id and stream id.
* @return `cudaSuccess` if the resource release was successful, indicating that the domain and its associated
* resources have been properly deallocated. Returns an error code otherwise, indicating failure to release
* the resources. The error code can be used to diagnose the problem.
* */
template <typename S>
cudaError_t release_domain(device_context::DeviceContext& ctx);
/* Returns a basic root of unity W_n, where n = 2^ceil(log2(max_size)).
* @param max_size size of the largest NTT the returned root must support.
* @return W_n root of unity
*/
template <typename S>
S get_root_of_unity(uint64_t max_size);
/* Returns the root of unity W_n derived from the basic root used to initialize the domain.
* This function can only be called after InitializeDomain()!
* Useful when computing NTT on cosets: there a root of higher order than the NTT size is needed,
* i.e. the root W_2n sitting between W_n and W_{n+1} in the domain's twiddle hierarchy.
* @param logn log size of the required root.
* @param ctx Details related to the device such as its id and stream id.
* @return W_n root of unity corresponding to logn and the basic root used to initialize the domain.
*/
template <typename S>
S get_root_of_unity_from_domain(uint64_t logn, device_context::DeviceContext& ctx);
cudaError_t InitDomain(S primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode = false);
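
One wrinkle worth spelling out for the release path documented above: release_domain has no parameter of type S, so the scalar type cannot be deduced and must be given explicitly (sketch):

// Sketch: tear down the per-device domain once all NTT work has finished.
device_context::DeviceContext ctx = device_context::get_default_device_context();
CHK_IF_RETURN(release_domain<scalar_t>(ctx));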
/**
* @enum NTTDir
@@ -127,8 +95,6 @@ namespace ntt {
S coset_gen; /**< Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()`
* (corresponding to no coset being used). */
int batch_size; /**< The number of NTTs to compute. Default value: 1. */
bool columns_batch; /**< True if the batches are the columns of an input matrix
(they are strided in memory with a stride of ntt size) Default value: false. */
Ordering ordering; /**< Ordering of inputs and outputs. See [Ordering](@ref Ordering). Default value:
* `Ordering::kNN`. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
@@ -146,12 +112,11 @@ namespace ntt {
* @return Default value of [NTTConfig](@ref NTTConfig).
*/
template <typename S>
NTTConfig<S>
default_ntt_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context());
NTTConfig<S> DefaultNTTConfig();
/**
* A function that computes NTT or iNTT in-place. It's necessary to call [init_domain](@ref init_domain) with an
* appropriate primitive root before calling this function (only one call to `init_domain` should suffice for all
* A function that computes NTT or iNTT in-place. It's necessary to call [InitDomain](@ref InitDomain) with an
* appropriate primitive root before calling this function (only one call to `InitDomain` should suffice for all
* NTTs).
* @param input Input of the NTT. Length of this array needs to be \f$ size \cdot config.batch\_size \f$. Note
* that if inputs are in Montgomery form, the outputs will be as well and vice-versa: non-Montgomery inputs produce
@@ -167,7 +132,7 @@ namespace ntt {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename S, typename E>
cudaError_t ntt(const E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
cudaError_t NTT(E* input, int size, NTTDir dir, NTTConfig<S>& config, E* output);
} // namespace ntt
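
Note also the in-place path: the implementation detects d_input == d_output (see is_reverse_in_place earlier in this diff), and the benchmark below drives it via its INPLACE flag. A sketch with device-resident data, assuming NTTDir::kForward as the forward direction:

// Sketch: forward NTT computed in place on a device buffer.
config.are_inputs_on_device = true;
config.are_outputs_on_device = true;
CHK_IF_RETURN(ntt(d_data, size, NTTDir::kForward, config, d_data));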

View File

@@ -3,9 +3,9 @@
#define _NTT_IMPL_H
#include <stdint.h>
#include "ntt/ntt.cuh" // for enum Ordering
#include "appUtils/ntt/ntt.cuh" // for enum Ordering
namespace mxntt {
namespace ntt {
template <typename S>
cudaError_t generate_external_twiddles_generic(
@@ -27,22 +27,20 @@ namespace mxntt {
template <typename E, typename S>
cudaError_t mixed_radix_ntt(
const E* d_input,
E* d_input,
E* d_output,
S* external_twiddles,
S* internal_twiddles,
S* basic_twiddles,
S* linear_twiddle, // twiddles organized as [1,w,w^2,...] for coset-eval in fast-tw mode
int ntt_size,
int max_logn,
int batch_size,
bool columns_batch,
bool is_inverse,
bool fast_tw,
ntt::Ordering ordering,
Ordering ordering,
S* arbitrary_coset,
int coset_gen_index,
cudaStream_t cuda_stream);
} // namespace mxntt
} // namespace ntt
#endif //_NTT_IMPL_H

View File

@@ -1,27 +1,21 @@
#include "fields/id.h"
#define FIELD_ID BN254
#ifdef ECNTT
#define CURVE_ID BN254
#include "curves/curve_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef curve_config::projective_t test_data;
#else
#include "fields/field_config.cuh"
typedef field_config::scalar_t test_scalar;
typedef field_config::scalar_t test_data;
#endif
#define CURVE_ID BLS12_381
#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "primitives/field.cuh"
#include "primitives/projective.cuh"
#include <chrono>
#include <iostream>
#include <vector>
#include "ntt.cu"
#include "kernel_ntt.cu"
#include "curves/curve_config.cuh"
#include "ntt/ntt.cu"
#include "ntt/ntt_impl.cuh"
#include <memory>
typedef curve_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_data;
#include "kernel_ntt.cu"
void random_samples(test_data* res, uint32_t count)
{
for (int i = 0; i < count; i++)
@@ -35,13 +29,6 @@ void incremental_values(test_scalar* res, uint32_t count)
}
}
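// Transposes a [column_size x row_size] row-major matrix into [row_size x column_size],
// one element per thread; used below to lay each batch out as a strided column for columns_batch runs.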
__global__ void transpose_batch(test_scalar* in, test_scalar* out, int row_size, int column_size)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= row_size * column_size) return;
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
}
int main(int argc, char** argv)
{
cudaEvent_t icicle_start, icicle_stop, new_start, new_stop;
@@ -50,12 +37,11 @@ int main(int argc, char** argv)
int NTT_LOG_SIZE = (argc > 1) ? atoi(argv[1]) : 19;
int NTT_SIZE = 1 << NTT_LOG_SIZE;
bool INPLACE = (argc > 2) ? atoi(argv[2]) : false;
int INV = (argc > 3) ? atoi(argv[3]) : false;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 150;
bool COLUMNS_BATCH = (argc > 5) ? atoi(argv[5]) : false;
int COSET_IDX = (argc > 6) ? atoi(argv[6]) : 2;
const ntt::Ordering ordering = (argc > 7) ? ntt::Ordering(atoi(argv[7])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 8) ? atoi(argv[8]) : true;
int INV = (argc > 3) ? atoi(argv[3]) : true;
int BATCH_SIZE = (argc > 4) ? atoi(argv[4]) : 1;
int COSET_IDX = (argc > 5) ? atoi(argv[5]) : 0;
const ntt::Ordering ordering = (argc > 6) ? ntt::Ordering(atoi(argv[6])) : ntt::Ordering::kNN;
bool FAST_TW = (argc > 7) ? atoi(argv[7]) : true;
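// Example invocation (hypothetical binary name; newer argument order above):
//   ./ntt_benchmark 19 0 0 150 1 2 0 1
//   -> 2^19 points, out-of-place, forward, batch=150, columns-batch, coset idx 2, ordering kNN, fast twiddles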
// Note: NM, MN are not expected to be equal when comparing mixed-radix and radix-2 NTTs
const char* ordering_str = ordering == ntt::Ordering::kNN ? "NN"
@@ -66,18 +52,17 @@ int main(int argc, char** argv)
: "MN";
printf(
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, columns_batch=%d coset-idx=%d, ordering=%s, fast_tw=%d\n",
NTT_LOG_SIZE, INPLACE, INV, BATCH_SIZE, COLUMNS_BATCH, COSET_IDX, ordering_str, FAST_TW);
"running ntt 2^%d, inplace=%d, inverse=%d, batch_size=%d, coset-idx=%d, ordering=%s, fast_tw=%d\n", NTT_LOG_SIZE,
INPLACE, INV, BATCH_SIZE, COSET_IDX, ordering_str, FAST_TW);
CHK_IF_RETURN(cudaFree(nullptr)); // init GPU context (warmup)
// init domain
auto ntt_config = ntt::default_ntt_config<test_scalar>();
auto ntt_config = ntt::DefaultNTTConfig<test_scalar>();
ntt_config.ordering = ordering;
ntt_config.are_inputs_on_device = true;
ntt_config.are_outputs_on_device = true;
ntt_config.batch_size = BATCH_SIZE;
ntt_config.columns_batch = COLUMNS_BATCH;
CHK_IF_RETURN(cudaEventCreate(&icicle_start));
CHK_IF_RETURN(cudaEventCreate(&icicle_stop));
@@ -85,8 +70,8 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaEventCreate(&new_stop));
auto start = std::chrono::high_resolution_clock::now();
const scalar_t basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::init_domain(basic_root, ntt_config.ctx, FAST_TW);
const test_scalar basic_root = test_scalar::omega(NTT_LOG_SIZE);
ntt::InitDomain(basic_root, ntt_config.ctx, FAST_TW);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << "initDomain took: " << duration / 1000 << " MS" << std::endl;
@@ -97,10 +82,8 @@ int main(int argc, char** argv)
auto CpuOutputNew = std::make_unique<test_data[]>(NTT_SIZE * BATCH_SIZE);
// gpu allocation
scalar_t *GpuScalars, *GpuOutputOld, *GpuOutputNew;
scalar_t* GpuScalarsTransposed;
test_data *GpuScalars, *GpuOutputOld, *GpuOutputNew;
CHK_IF_RETURN(cudaMalloc(&GpuScalars, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuScalarsTransposed, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputOld, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
CHK_IF_RETURN(cudaMalloc(&GpuOutputNew, sizeof(test_data) * NTT_SIZE * BATCH_SIZE));
@@ -110,16 +93,10 @@ int main(int argc, char** argv)
CHK_IF_RETURN(
cudaMemcpy(GpuScalars, CpuScalars.get(), NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyHostToDevice));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuScalars, GpuScalarsTransposed, NTT_SIZE, BATCH_SIZE);
}
// inplace
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
for (int coset_idx = 0; coset_idx < COSET_IDX; ++coset_idx) {
@@ -131,26 +108,26 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaEventRecord(new_start, ntt_config.ctx.stream));
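// NEW: time the mixed-radix implementation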
ntt_config.ntt_algorithm = ntt::NttAlgorithm::MixedRadix;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(ntt::ntt(
INPLACE ? GpuOutputNew
: COLUMNS_BATCH ? GpuScalarsTransposed
: GpuScalars,
NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputNew));
CHK_IF_RETURN(ntt::NTT(
INPLACE ? GpuOutputNew : GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config,
GpuOutputNew));
}
CHK_IF_RETURN(cudaEventRecord(new_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&new_time, new_start, new_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
// OLD
CHK_IF_RETURN(cudaEventRecord(icicle_start, ntt_config.ctx.stream));
ntt_config.ntt_algorithm = ntt::NttAlgorithm::Radix2;
for (size_t i = 0; i < iterations; i++) {
CHK_IF_RETURN(
ntt::ntt(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
ntt::NTT(GpuScalars, NTT_SIZE, INV ? ntt::NTTDir::kInverse : ntt::NTTDir::kForward, ntt_config, GpuOutputOld));
}
CHK_IF_RETURN(cudaEventRecord(icicle_stop, ntt_config.ctx.stream));
CHK_IF_RETURN(cudaStreamSynchronize(ntt_config.ctx.stream));
CHK_IF_RETURN(cudaEventElapsedTime(&icicle_time, icicle_start, icicle_stop));
if (is_print) { fprintf(stderr, "cuda err %d\n", cudaGetLastError()); }
if (is_print) {
printf("Old Runtime=%0.3f MS\n", icicle_time / iterations);
@@ -163,19 +140,11 @@ int main(int argc, char** argv)
CHK_IF_RETURN(benchmark(false /*=print*/, 1)); // warmup
int count = INPLACE ? 1 : 10;
if (INPLACE) {
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, COLUMNS_BATCH ? GpuScalarsTransposed : GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data),
cudaMemcpyDeviceToDevice));
CHK_IF_RETURN(
cudaMemcpy(GpuOutputNew, GpuScalars, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
CHK_IF_RETURN(benchmark(true /*=print*/, count));
if (COLUMNS_BATCH) {
transpose_batch<<<(NTT_SIZE * BATCH_SIZE + 256 - 1) / 256, 256>>>(
GpuOutputNew, GpuScalarsTransposed, BATCH_SIZE, NTT_SIZE);
CHK_IF_RETURN(cudaMemcpy(
GpuOutputNew, GpuScalarsTransposed, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToDevice));
}
// verify
CHK_IF_RETURN(
cudaMemcpy(CpuOutputNew.get(), GpuOutputNew, NTT_SIZE * BATCH_SIZE * sizeof(test_data), cudaMemcpyDeviceToHost));
@@ -184,11 +153,10 @@ int main(int argc, char** argv)
bool success = true;
for (int i = 0; i < NTT_SIZE * BATCH_SIZE; i++) {
// if (i%64==0) printf("\n");
if (CpuOutputNew[i] != CpuOutputOld[i]) {
success = false;
// std::cout << i << " ref " << CpuOutputOld[i] << " != " << CpuOutputNew[i] << std::endl;
// break;
break;
} else {
// std::cout << i << " ref " << CpuOutputOld[i] << " == " << CpuOutputNew[i] << std::endl;
// break;
@@ -201,7 +169,5 @@ int main(int argc, char** argv)
CHK_IF_RETURN(cudaFree(GpuOutputOld));
CHK_IF_RETURN(cudaFree(GpuOutputNew));
ntt::release_domain<test_scalar>(ntt_config.ctx);
return CHK_LAST();
}

View File

@@ -4,12 +4,11 @@
#include <stdio.h>
#include <stdint.h>
#include "gpu-utils/modifiers.cuh"
#include "curves/curve_config.cuh"
struct stage_metadata {
uint32_t th_stride;
uint32_t ntt_block_size;
uint32_t batch_id;
uint32_t ntt_block_id;
uint32_t ntt_inp_id;
};
@@ -51,113 +50,116 @@ public:
S WI[7];
S WE[8];
DEVICE_INLINE void loadBasicTwiddles(S* basic_twiddles)
__device__ __forceinline__ void loadBasicTwiddles(S* basic_twiddles)
{
UNROLL
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[i];
}
}
DEVICE_INLINE void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
__device__ __forceinline__ void loadBasicTwiddlesGeneric(S* basic_twiddles, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 3; i++) {
WB[i] = basic_twiddles[inv ? i + 3 : i];
}
}
DEVICE_INLINE void loadInternalTwiddles64(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles64(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles32(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles32(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddles16(S* data, bool stride)
__device__ __forceinline__ void loadInternalTwiddles16(S* data, bool stride)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
WI[i] = data[4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1)];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric64(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = ((stride ? (threadIdx.x >> 3) : (threadIdx.x)) & 0x7) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp]; // if exp = 0 we also take exp and not 64-exp
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric32(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 2 * ((stride ? (threadIdx.x >> 4) : (threadIdx.x)) & 0x3) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
__device__ __forceinline__ void loadInternalTwiddlesGeneric16(S* data, bool stride, bool inv)
{
UNROLL
#pragma unroll
for (int i = 0; i < 7; i++) {
uint32_t exp = 4 * ((stride ? (threadIdx.x >> 5) : (threadIdx.x)) & 0x1) * (i + 1);
WI[i] = data[(inv && exp) ? 64 - exp : exp];
}
}
DEVICE_INLINE void loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles64(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
WE[i] = data[8 * i * tw_order + (1 << tw_log_order + 6) - 1];
}
}
DEVICE_INLINE void loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles32(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 2 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
WE[4 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 5) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta)
__device__ __forceinline__ void
loadExternalTwiddles16(S* data, uint32_t tw_order, uint32_t tw_log_order, bool strided, stage_metadata s_meta)
{
data += tw_order * s_meta.ntt_inp_id * 4 + (s_meta.ntt_block_id & (tw_order - 1));
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
WE[2 * j + i] = data[(8 * i + j) * tw_order + (1 << tw_log_order + 4) - 1];
}
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric64(
__device__ __forceinline__ void loadExternalTwiddlesGeneric64(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
uint32_t exp = (s_meta.ntt_inp_id + 8 * i) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 6);
@@ -165,12 +167,12 @@ public:
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric32(
__device__ __forceinline__ void loadExternalTwiddlesGeneric32(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 2 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 5);
@@ -179,12 +181,12 @@ public:
}
}
DEVICE_INLINE void loadExternalTwiddlesGeneric16(
__device__ __forceinline__ void loadExternalTwiddlesGeneric16(
S* data, uint32_t tw_order, uint32_t tw_log_order, stage_metadata s_meta, uint32_t tw_log_size, bool inv)
{
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
uint32_t exp = (s_meta.ntt_inp_id * 4 + 8 * i + j) * (s_meta.ntt_block_id & (tw_order - 1))
<< (tw_log_size - tw_log_order - 4);
@@ -193,8 +195,8 @@ public:
}
}
DEVICE_INLINE void
loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
@@ -203,28 +205,14 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride];
}
}
DEVICE_INLINE void loadGlobalDataColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
}
}
DEVICE_INLINE void
storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
@@ -233,28 +221,14 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
}
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride] = X[i];
}
}
DEVICE_INLINE void storeGlobalDataColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t i = 0; i < 8; i++) {
data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
}
}
DEVICE_INLINE void
loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
@@ -263,34 +237,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData32ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData32(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
@@ -299,34 +256,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride] = X[4 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData32ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 2; j++) {
UNROLL
for (uint32_t i = 0; i < 4; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
}
}
}
DEVICE_INLINE void
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void loadGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
@@ -335,34 +275,17 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride];
}
}
}
DEVICE_INLINE void loadGlobalData16ColumnBatch(
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
}
}
}
DEVICE_INLINE void
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
__device__ __forceinline__ void storeGlobalData16(
E* data, uint32_t data_stride, uint32_t log_data_stride, uint32_t log_size, bool strided, stage_metadata s_meta)
{
if (strided) {
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
@@ -371,49 +294,32 @@ public:
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
}
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride] = X[2 * j + i];
}
}
}
DEVICE_INLINE void storeGlobalData16ColumnBatch(
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
__device__ __forceinline__ void ntt4_2()
{
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
batch_size +
s_meta.batch_id;
UNROLL
for (uint32_t j = 0; j < 4; j++) {
UNROLL
for (uint32_t i = 0; i < 2; i++) {
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
}
}
}
DEVICE_INLINE void ntt4_2()
{
UNROLL
#pragma unroll
for (int i = 0; i < 2; i++) {
ntt4(X[4 * i], X[4 * i + 1], X[4 * i + 2], X[4 * i + 3]);
}
}
DEVICE_INLINE void ntt2_4()
__device__ __forceinline__ void ntt2_4()
{
UNROLL
#pragma unroll
for (int i = 0; i < 4; i++) {
ntt2(X[2 * i], X[2 * i + 1]);
}
}
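// 2-point NTT butterfly: (X0, X1) -> (X0 + X1, X0 - X1)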
DEVICE_INLINE void ntt2(E& X0, E& X1)
__device__ __forceinline__ void ntt2(E& X0, E& X1)
{
E T;
@@ -422,7 +328,7 @@ public:
X0 = T;
}
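// 4-point NTT on (X0..X3); presumably two radix-2 stages plus an order-4 twiddle from WB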
DEVICE_INLINE void ntt4(E& X0, E& X1, E& X2, E& X3)
__device__ __forceinline__ void ntt4(E& X0, E& X1, E& X2, E& X3)
{
E T;
@@ -440,7 +346,7 @@ public:
}
// rbo (reversed-bit-order) version
DEVICE_INLINE void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
__device__ __forceinline__ void ntt4rbo(E& X0, E& X1, E& X2, E& X3)
{
E T;
@@ -457,7 +363,7 @@ public:
X3 = T - X3;
}
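// 8-point NTT on (X0..X7); presumably three radix-2 stages using the basic twiddles WB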
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
__device__ __forceinline__ void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
E T;
@@ -497,7 +403,7 @@ public:
X4 = X4 - T;
}
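// 8-point NTT over the register file X[8] with a Winograd-style ("win") schedule,
// presumably trading multiplications for additions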
DEVICE_INLINE void ntt8win()
__device__ __forceinline__ void ntt8win()
{
E T;
@@ -539,12 +445,12 @@ public:
X[4] = X[4] - T;
}
DEVICE_INLINE void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData64Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t column_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + i * 8 + column_id] = X[i];
@@ -554,12 +460,12 @@ public:
}
}
DEVICE_INLINE void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData64Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x7 : threadIdx.x >> 3;
uint32_t row_id = stride ? threadIdx.x >> 3 : threadIdx.x & 0x7;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 64 + row_id * 8 + i] = X[i];
@@ -569,12 +475,12 @@ public:
}
}
DEVICE_INLINE void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + i * 4 + column_id] = X[i];
@@ -584,12 +490,12 @@ public:
}
}
DEVICE_INLINE void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = stride ? threadIdx.x >> 4 : threadIdx.x & 0x3;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 8 + i] = X[i];
@@ -599,14 +505,14 @@ public:
}
}
DEVICE_INLINE void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Columns4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t column_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + i * 8 + column_id + j] = X[4 * j + i];
@@ -617,14 +523,14 @@ public:
}
}
DEVICE_INLINE void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData32Rows4_2(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0xf : threadIdx.x >> 2;
uint32_t row_id = (stride ? threadIdx.x >> 4 : threadIdx.x & 0x3) * 2;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 2; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 4; i++) {
if (store) {
shmem[ntt_id * 32 + row_id * 4 + 4 * j + i] = X[4 * j + i];
@@ -635,12 +541,12 @@ public:
}
}
DEVICE_INLINE void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Columns8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + i * 2 + column_id] = X[i];
@@ -650,12 +556,12 @@ public:
}
}
DEVICE_INLINE void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Rows8(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = stride ? threadIdx.x >> 5 : threadIdx.x & 0x1;
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 8; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 8 + i] = X[i];
@@ -665,14 +571,14 @@ public:
}
}
DEVICE_INLINE void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Columns2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t column_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + i * 8 + column_id + j] = X[2 * j + i];
@@ -683,14 +589,14 @@ public:
}
}
DEVICE_INLINE void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
__device__ __forceinline__ void SharedData16Rows2_4(E* shmem, bool store, bool high_bits, bool stride)
{
uint32_t ntt_id = stride ? threadIdx.x & 0x1f : threadIdx.x >> 1;
uint32_t row_id = (stride ? threadIdx.x >> 5 : threadIdx.x & 0x1) * 4;
UNROLL
#pragma unroll
for (uint32_t j = 0; j < 4; j++) {
UNROLL
#pragma unroll
for (uint32_t i = 0; i < 2; i++) {
if (store) {
shmem[ntt_id * 16 + row_id * 2 + 2 * j + i] = X[2 * j + i];
@@ -701,17 +607,17 @@ public:
}
}
DEVICE_INLINE void twiddlesInternal()
__device__ __forceinline__ void twiddlesInternal()
{
UNROLL
#pragma unroll
for (int i = 1; i < 8; i++) {
X[i] = X[i] * WI[i - 1];
}
}
DEVICE_INLINE void twiddlesExternal()
__device__ __forceinline__ void twiddlesExternal()
{
UNROLL
#pragma unroll
for (int i = 0; i < 8; i++) {
X[i] = X[i] * WE[i];
}

View File

@@ -1,3 +1,3 @@
test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
nvcc -o test_poseidon -I../../include -DFIELD_ID=2 -DCURVE_ID=2 test.cu
nvcc -o test_poseidon -I. -I../.. test.cu
./test_poseidon

View File

@@ -1,21 +1,20 @@
#include "poseidon/poseidon.cuh"
#include "poseidon.cuh"
/// These are pre-calculated constants for different curves
#include "fields/id.h"
#if FIELD_ID == BN254
#include "poseidon/constants/bn254_poseidon.h"
#if CURVE_ID == BN254
#include "appUtils/poseidon/constants/bn254_poseidon.h"
using namespace poseidon_constants_bn254;
#elif FIELD_ID == BLS12_381
#include "poseidon/constants/bls12_381_poseidon.h"
#elif CURVE_ID == BLS12_381
#include "appUtils/poseidon/constants/bls12_381_poseidon.h"
using namespace poseidon_constants_bls12_381;
#elif FIELD_ID == BLS12_377
#include "poseidon/constants/bls12_377_poseidon.h"
#elif CURVE_ID == BLS12_377
#include "appUtils/poseidon/constants/bls12_377_poseidon.h"
using namespace poseidon_constants_bls12_377;
#elif FIELD_ID == BW6_761
#include "poseidon/constants/bw6_761_poseidon.h"
#elif CURVE_ID == BW6_761
#include "appUtils/poseidon/constants/bw6_761_poseidon.h"
using namespace poseidon_constants_bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "poseidon/constants/grumpkin_poseidon.h"
#elif CURVE_ID == GRUMPKIN
#include "appUtils/poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#endif
@@ -99,21 +98,21 @@ namespace poseidon {
return CHK_LAST();
}
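// CONCAT_EXPAND splices the field/curve prefix into the exported C symbol, so for BN254 the
// new-style name presumably becomes bn254_create_optimized_poseidon_constants_cuda and the
// old-style one bn254CreateOptimizedPoseidonConstants.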
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(
extern "C" cudaError_t CONCAT_EXPAND(CURVE, CreateOptimizedPoseidonConstants)(
int arity,
int full_rounds_half,
int partial_rounds,
const scalar_t* constants,
const curve_config::scalar_t* constants,
device_context::DeviceContext& ctx,
PoseidonConstants<scalar_t>* poseidon_constants)
PoseidonConstants<curve_config::scalar_t>* poseidon_constants)
{
return create_optimized_poseidon_constants<scalar_t>(
return create_optimized_poseidon_constants<curve_config::scalar_t>(
arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon_constants_cuda)(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<scalar_t>* constants)
extern "C" cudaError_t CONCAT_EXPAND(CURVE, InitOptimizedPoseidonConstants)(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<curve_config::scalar_t>* constants)
{
return init_optimized_poseidon_constants<scalar_t>(arity, ctx, constants);
return init_optimized_poseidon_constants<curve_config::scalar_t>(arity, ctx, constants);
}
} // namespace poseidon
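A minimal sketch of initializing the optimized constants through the templated entry points above, assuming they are declared in poseidon.cuh; the arity value is illustrative and error handling is omitted:

device_context::DeviceContext ctx = device_context::get_default_device_context();
poseidon::PoseidonConstants<scalar_t> constants;
poseidon::init_optimized_poseidon_constants<scalar_t>(2 /*arity*/, ctx, &constants);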

Some files were not shown because too many files have changed in this diff.