refactor

2026-01-12 17:07:59 -05:00 · 2024-02-28 11:41:09 -04:00 · 2024-02-28 11:37:25 -04:00 · 2024-02-28 11:29:29 -04:00 · 2024-02-28 11:27:33 -04:00 · 2024-02-28 11:19:59 -04:00
116 changed files with 919 additions and 3014 deletions
--- a/.github/changed-files.yml
+++ b/.github/changed-files.yml
@@ -1,10 +1,10 @@
 golang:
-  - wrappers/golang/**/*.go
-  - wrappers/golang/**/*.h
-  - wrappers/golang/**/*.tmpl
+  - wrappers/golang/**/*.go'
+  - wrappers/golang/**/*.h'
+  - wrappers/golang/**/*.tmpl'
  - go.mod
 rust:
-  - wrappers/rust/**/*
+  - wrappers/rust
 cpp:
  - icicle/**/*.cu
  - icicle/**/*.cuh
--- a/.github/workflows/check-changed-files.yml
+++ b/.github/workflows/check-changed-files.yml
@@ -1,39 +0,0 @@
-name: Check Changed Files
-
-on:
-  workflow_call:
-    outputs:
-      golang:
-        description: "Flag for if GoLang files changed"
-        value: ${{ jobs.check-changed-files.outputs.golang }}
-      rust:
-        description: "Flag for if Rust files changed"
-        value: ${{ jobs.check-changed-files.outputs.rust }}
-      cpp_cuda:
-        description: "Flag for if C++/CUDA files changed"
-        value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
-
-jobs:
-  check-changed-files:
-    name: Check Changed Files
-    runs-on: ubuntu-22.04
-    outputs:
-      golang: ${{ steps.changed_files.outputs.golang }}
-      rust: ${{ steps.changed_files.outputs.rust }}
-      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Get all changed files
-      id: changed-files-yaml
-      uses: tj-actions/changed-files@v39
-      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
-      with:
-        files_yaml_from_source_file: .github/changed-files.yml
-    - name: Run Changed Files script
-      id: changed_files
-      # https://github.com/tj-actions/changed-files#outputs-
-      run: |
-        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
-        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -11,7 +11,7 @@ jobs:
    name: Check Spelling
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - uses: codespell-project/actions-codespell@v2
        with:
          # https://github.com/codespell-project/actions-codespell?tab=readme-ov-file#parameter-skip
--- a/.github/workflows/cpp_cuda.yml
+++ b/.github/workflows/cpp_cuda.yml
@@ -1,52 +0,0 @@
-name: C++/CUDA
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-  push:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  check-changed-files:
-    uses: ./.github/workflows/check-changed-files.yml
-
-  check-format:
-    name: Check Code Format
-    runs-on: ubuntu-22.04
-    needs: check-changed-files
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-    - name: Check clang-format
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
-
-  test-linux:
-    name: Test on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, check-format]
-    strategy:
-      matrix:
-        curve: [bn254, bls12_381, bls12_377, bw6_761]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Build
-      working-directory: ./icicle
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: |
-        mkdir -p build
-        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -DG2_DEFINED=ON -S . -B build
-        cmake --build build
-    - name: Run C++ Tests
-      working-directory: ./icicle/build
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: ctest
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -5,7 +5,7 @@ on:
    branches:
      - main
    paths:
-      - 'docs/**'
+      - 'docs/*'

 permissions:
  contents: write
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@@ -21,19 +21,14 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

-jobs:
-  check-changed-files:
-    uses: ./.github/workflows/check-changed-files.yml
-
-  run-examples:
+jobs:  
+  test-examples:
    runs-on: [self-hosted, Linux, X64, icicle, examples]
-    needs: check-changed-files
    steps:
    - name: Checkout
-      uses: actions/checkout@v4
+      uses: actions/checkout@v2
    - name: c++ examples
      working-directory: ./examples/c++
-      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
      run: |
        # loop over all directories in the current directory
        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +42,6 @@ jobs:
        done    
    - name: Rust examples
      working-directory: ./examples/rust
-      if: needs.check-changed-files.outputs.rust == 'true'
      run: |
        # loop over all directories in the current directory
        for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
--- a/.github/workflows/golang.yml
+++ b/.github/workflows/golang.yml
@@ -1,119 +0,0 @@
-name: GoLang
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-  push:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  check-changed-files:
-    uses: ./.github/workflows/check-changed-files.yml
-
-  check-format:
-    name: Check Code Format
-    runs-on: ubuntu-22.04
-    needs: check-changed-files
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-    - name: Setup go
-      uses: actions/setup-go@v5
-      with:
-        go-version: '1.20.0'
-    - name: Check gofmt
-      if: needs.check-changed-files.outputs.golang == 'true'
-      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
-
-  build-linux:
-    name: Build on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, check-format]
-    strategy:
-      matrix:
-        curve: [bn254, bls12_381, bls12_377, bw6_761]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Setup go
-      uses: actions/setup-go@v5
-      with:
-        go-version: '1.20.0'
-    - name: Build
-      working-directory: ./wrappers/golang
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      run: ./build.sh ${{ matrix.curve }} ON # builds a single curve with G2 enabled
-    - name: Upload ICICLE lib artifacts
-      uses: actions/upload-artifact@v4
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      with:
-        name: icicle-builds-${{ matrix.curve }}-${{ github.workflow }}-${{ github.sha }}
-        path: icicle/build/libingo_${{ matrix.curve }}.a
-        retention-days: 1
-  
-  test-linux:
-    name: Test on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, build-linux]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Setup go
-      uses: actions/setup-go@v5
-      with:
-        go-version: '1.20.0'
-    - name: Download ICICLE lib artifacts
-      uses: actions/download-artifact@v4
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      with:
-        path: ./icicle/build/
-        merge-multiple: true
-    - name: Run Tests
-      working-directory: ./wrappers/golang
-      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # -count ensures the test results are not cached
-      # -p controls the number of programs that can be run in parallel
-      run: |
-        export CPATH=$CPATH:/usr/local/cuda/include
-        go test --tags=g2 ./... -count=1 -failfast -p 2 -timeout 60m
-  
-  # TODO: bw6 on windows requires more memory than the standard runner has
-  # Add a large runner and then enable this job
-  # build-windows:
-  #   name: Build on Windows
-  #   runs-on: windows-2022
-  #   needs: [check-changed-files, check-format]
-  #   strategy:
-  #     matrix:
-  #       curve: [bn254, bls12_381, bls12_377, bw6_761]
-  #   steps:     
-  #   - name: Checkout Repo
-  #     uses: actions/checkout@v4
-  #   - name: Setup go
-  #     uses: actions/setup-go@v5
-  #     with:
-  #       go-version: '1.20.0'
-  #   - name: Download and Install Cuda
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     id: cuda-toolkit
-  #     uses: Jimver/cuda-toolkit@v0.2.11
-  #     with:
-  #       cuda: '12.0.0'
-  #       method: 'network'
-  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-  #       sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
-  #   - name: Build libs
-  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-  #     working-directory: ./wrappers/golang
-  #     env:
-  #       CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
-  #     shell: pwsh
-  #     run: ./build.ps1 ${{ matrix.curve }} ON # builds a single curve with G2 enabled
--- a/.github/workflows/main-build.yml
+++ b/.github/workflows/main-build.yml
@@ -0,0 +1,119 @@
+name: Build
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+  push:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  ARCH_TYPE: native
+
+jobs:
+  check-changed-files:
+    name: Check Changed Files
+    runs-on: ubuntu-22.04
+    outputs:
+      golang: ${{ steps.changed_files.outputs.golang }}
+      rust: ${{ steps.changed_files.outputs.rust }}
+      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Get all changed files
+      id: changed-files-yaml
+      uses: tj-actions/changed-files@v39
+      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
+      with:
+        files_yaml_from_source_file: .github/changed-files.yml
+    - name: Run Changed Files script
+      id: changed_files
+      # https://github.com/tj-actions/changed-files#outputs-
+      run: |
+        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+
+  build-rust-linux:
+    name: Build Rust on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build Rust
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
+
+  build-rust-windows:
+    name: Build Rust on Windows
+    runs-on: windows-2022
+    needs: check-changed-files
+    steps:     
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Download and Install Cuda
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      id: cuda-toolkit
+      uses: Jimver/cuda-toolkit@v0.2.11
+      with:
+        cuda: '12.0.0'
+        method: 'network'
+        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
+    - name: Build Rust Targets
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      env:
+        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+      # Building from the root workspace will build all members of the workspace by default
+      run: cargo build --release --verbose
+
+  build-golang-linux:
+    name: Build Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      working-directory: ./wrappers/golang
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh ${{ matrix.curve }} ON
+
+  # TODO: Add once Golang make file supports building for Windows
+  # build-golang-windows:
+  #   name: Build Golang on Windows
+  #   runs-on: windows-2022
+  #   needs: check-changed-files
+  #   steps:     
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v3
+  #   - name: Download and Install Cuda
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     uses: Jimver/cuda-toolkit@v0.2.11
+  #     with:
+  #       cuda: '12.0.0'
+  #       method: 'network'
+  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+  #       sub-packages: '["cudart", "nvcc", "thrust"]'
+  #   - name: Build cpp libs
+  #     if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     run: make all
+  #     working-directory: ./goicicle
--- a/.github/workflows/main-format.yml
+++ b/.github/workflows/main-format.yml
@@ -0,0 +1,47 @@
+name: Format
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  formatting-rust:
+    name: Check Rust Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check rustfmt
+      working-directory: ./wrappers/rust
+      # "-name tagret -prune" removes searching in any directory named "target"
+      # Formatting by single file is necessary due to generated files not being present
+      # before building the project.
+      # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
+      # causing rustfmt to fail.
+      run: if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
+    # - name: Check clippy
+    #   run: cargo clippy --no-deps --all-features --all-targets
+
+  formatting-golang:
+    name: Check Golang Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check gofmt
+      run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
+
+  formatting-cpp-cuda:
+    name: Check C++/CUDA Code Formatting
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Check clang-format
+      run: if [[ $(find ./ \( -path ./icicle/build -prune -o -path ./**/target -prune -o -path ./examples -prune \) -iname *.h -or -iname *.cuh -or -iname *.cu -or -iname *.c -or -iname *.cpp | xargs clang-format --dry-run -ferror-limit=1 -style=file 2>&1) ]]; then echo "Please run clang-format"; exit 1; fi
--- a/.github/workflows/main-test.yml
+++ b/.github/workflows/main-test.yml
@@ -0,0 +1,99 @@
+name: Test
+
+on:
+  pull_request:
+    branches:
+      - main
+      - dev
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  ARCH_TYPE: native
+
+jobs:
+  check-changed-files:
+    name: Check Changed Files
+    runs-on: ubuntu-22.04
+    outputs:
+      golang: ${{ steps.changed_files.outputs.golang }}
+      rust: ${{ steps.changed_files.outputs.rust }}
+      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Get all changed files
+      id: changed-files-yaml
+      uses: tj-actions/changed-files@v39
+      # https://github.com/tj-actions/changed-files#input_files_yaml_from_source_file
+      with:
+        files_yaml_from_source_file: .github/changed-files.yml
+    - name: Run Changed Files script
+      id: changed_files
+      # https://github.com/tj-actions/changed-files#outputs-
+      run: |
+        echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
+        echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+
+  test-rust-linux:
+    name: Test Rust on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Run Rust Tests
+      working-directory: ./wrappers/rust
+      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # Running tests from the root workspace will run all workspace members' tests by default
+      # We need to limit the number of threads to avoid running out of memory on weaker machines
+      run: cargo test --release --verbose --features=g2 -- --test-threads=2
+
+  test-cpp-linux:
+    name: Test C++ on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    strategy:
+      matrix:
+        curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build C++
+      working-directory: ./icicle
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        mkdir -p build
+        cmake -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release -DCURVE=${{ matrix.curve }} -S . -B build
+        cmake --build build
+    - name: Run C++ Tests
+      working-directory: ./icicle/build
+      if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: ctest
+  
+  test-golang-linux:
+    name: Test Golang on Linux
+    runs-on: [self-hosted, Linux, X64, icicle]
+    needs: check-changed-files
+    # strategy:
+    #   matrix:
+    #     curve: [bn254, bls12_381, bls12_377, bw6_761]
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Build CUDA libs
+      working-directory: ./wrappers/golang
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      # builds all curves with g2 ON
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        ./build.sh all ON
+    - name: Run Golang Tests
+      if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+      run: |
+        export CPATH=$CPATH:/usr/local/cuda/include
+        go test --tags=g2 ./... -count=1 -timeout 60m
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,34 +0,0 @@
-name: Release
-
-on:
-  workflow_dispatch:
-    inputs:
-      releaseType:
-        description: 'Release type'
-        required: true
-        default: 'minor'
-        type: choice
-        options:
-          - patch
-          - minor
-          - major
-
-jobs:
-  release:
-    name: Release
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Bump rust crate versions, commit, and tag
-        working-directory: wrappers/rust
-        # https://github.com/pksunkara/cargo-workspaces?tab=readme-ov-file#version
-        run: |
-          cargo install cargo-workspaces
-          cargo workspaces version ${{ inputs.releaseType }} -y --no-individual-tags -m "Bump rust crates' version"
-      - name: Create draft release
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          LATEST_TAG=$(git describe --tags --abbrev=0)
-          gh release create $LATEST_TAG --generate-notes -d --verify-tag -t "Release $LATEST_TAG"
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -1,87 +0,0 @@
-name: Rust
-
-on:
-  pull_request:
-    branches:
-      - main
-      - dev
-  push:
-    branches:
-      - main
-      - dev
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  check-changed-files:
-    uses: ./.github/workflows/check-changed-files.yml
-
-  check-format:
-    name: Check Code Format
-    runs-on: ubuntu-22.04
-    needs: check-changed-files
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-    - name: Check rustfmt
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      working-directory: ./wrappers/rust
-      # "-name target -prune" removes searching in any directory named "target"
-      # Formatting by single file is necessary due to generated files not being present
-      # before building the project.
-      # e.g. icicle-cuda-runtime/src/bindings.rs is generated and icicle-cuda-runtime/src/lib.rs includes that module
-      # causing rustfmt to fail.
-      run: if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]]; then echo "Please run cargo fmt"; exit 1; fi
-
-  build-linux:
-    name: Build on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, check-format]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Build
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
-  
-  test-linux:
-    name: Test on Linux
-    runs-on: [self-hosted, Linux, X64, icicle]
-    needs: [check-changed-files, build-linux]
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Run tests
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      # Running tests from the root workspace will run all workspace members' tests by default
-      # We need to limit the number of threads to avoid running out of memory on weaker machines
-      run: cargo test --release --verbose --features=g2 -- --test-threads=2
-
-  build-windows:
-    name: Build on Windows
-    runs-on: windows-2022
-    needs: check-changed-files
-    steps:     
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Download and Install Cuda
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.11
-      with:
-        cuda: '12.0.0'
-        method: 'network'
-        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
-    - name: Build targets
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      env:
-        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
--- a/.github/workflows/test-deploy-docs.yml
+++ b/.github/workflows/test-deploy-docs.yml
@@ -9,7 +9,7 @@ on:

 jobs:
  test-deploy:
-    name: Test deployment of docs website
+    name: Test deployment of docs webiste
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # ICICLE

-<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>
+**<div align="center">ICICLE is a library for ZK acceleration using CUDA-enabled GPUs.</div>**

 <p align="center">
  <img alt="ICICLE" width="300" height="300" src="https://user-images.githubusercontent.com/2446179/223707486-ed8eb5ab-0616-4601-8557-12050df8ccf7.png"/>
@@ -13,12 +13,8 @@
    <img src="https://img.shields.io/twitter/follow/Ingo_zk?style=social&logo=twitter" alt="Follow us on Twitter">
  </a>
  <img src="https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue" alt="Machines running ICICLE">
-  <a href="https://github.com/ingonyama-zk/icicle/releases">
-    <img src="https://img.shields.io/github/v/release/ingonyama-zk/icicle" alt="GitHub Release">
-  </a>
 </p>

-
 ## Background

 Zero Knowledge Proofs (ZKPs) are considered one of the greatest achievements of modern cryptography. Accordingly, ZKPs are expected to disrupt a number of industries and will usher in an era of trustless and privacy preserving services and infrastructure.
--- a/docs/docs/icicle/golang-bindings.md
+++ b/docs/docs/icicle/golang-bindings.md
@@ -1,105 +1,3 @@
 # Golang bindings

-Golang bindings allow you to use ICICLE as a golang library.
-The source code for all Golang libraries can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang).
-
-The Golang bindings are comprised of multiple packages.
-
-[`core`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/core) which defines all shared methods and structures, such as configuration structures, or memory slices.
-
-[`cuda-runtime`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/cuda_runtime) which defines abstractions for CUDA methods for allocating memory, initializing and managing streams, and `DeviceContext` which enables users to define and keep track of devices.
-
-Each curve has its own package which you can find [here](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves). If your project uses BN254 you only need to install that single package named [`bn254`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/curves/bn254).
-
-## Using ICICLE Golang bindings in your project
-
-To add ICICLE to your `go.mod` file.
-
-```bash
-go get github.com/ingonyama-zk/icicle
-```
-
-If you want to specify a specific branch
-
-```bash
-go get github.com/ingonyama-zk/icicle@<branch_name>
-```
-
-For a specific commit
-
-```bash
-go get github.com/ingonyama-zk/icicle@<commit_id>
-```
-
-To build the shared libraries you can run this script:
-
-```
-./build <curve> [G2_enabled]
-
-curve - The name of the curve to build or "all" to build all curves
-G2_enabled - Optional - To build with G2 enabled 
-```
-
-For example if you want to build all curves with G2 enabled you would run:
-
-```bash
-./build.sh all ON
-```
-
-If you are interested in building a specific curve you would run:
-
-```bash
-./build.sh bls12_381 ON
-```
-
-Now you can import ICICLE into your project
-
-```golang
-import (
-    "github.com/stretchr/testify/assert"
-    "testing"
-
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-...
-```
-
-## Running tests
-
-To run all tests, for all curves:
-
-```bash
-go test --tags=g2 ./... -count=1
-```
-
-If you dont want to include g2 tests then drop `--tags=g2`.
-
-If you wish to run test for a specific curve:
-
-```bash
-go test <path_to_curve> -count=1
-```
-
-## How do Golang bindings work?
-
-The libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code.
-
-1. These libraries (named `libingo_<curve>.a`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE.
-
-2. In your Go project, you can use `cgo` to link these libraries. Here's a basic example on how you can use `cgo` to link these libraries:
-
-```go
-/*
-#cgo LDFLAGS: -L/path/to/shared/libs -lingo_bn254
-#include "icicle.h" // make sure you use the correct header file(s)
-*/
-import "C"
-
-func main() {
-    // Now you can call the C functions from the ICICLE libraries.
-    // Note that C function calls are prefixed with 'C.' in Go code.
-}
-```
-
-Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system.
+Golang is WIP in v1, coming soon. Please checkout a previous [release v0.1.0](https://github.com/ingonyama-zk/icicle/releases/tag/v0.1.0) for golang bindings.
--- a/docs/docs/icicle/golang-bindings/msm.md
+++ b/docs/docs/icicle/golang-bindings/msm.md
@@ -1,200 +0,0 @@
-# MSM
-
-
-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
-
-## MSM Example
-
-```go
-package main
-
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func Main() {
-    // Obtain the default MSM configuration.
-    cfg := GetDefaultMSMConfig()
-    
-    // Define the size of the problem, here 2^18.
-    size := 1 << 18
-
-    // Generate scalars and points for the MSM operation.
-    scalars := GenerateScalars(size)
-    points := GenerateAffinePoints(size)
-
-    // Create a CUDA stream for asynchronous operations.
-    stream, _ := cr.CreateStream()
-    var p Projective
-    
-    // Allocate memory on the device for the result of the MSM operation.
-    var out core.DeviceSlice
-    _, e := out.MallocAsync(p.Size(), p.Size(), stream)
-
-    if e != cr.CudaSuccess {
-        panic(e)
-    }
-    
-    // Set the CUDA stream in the MSM configuration.
-    cfg.Ctx.Stream = &stream
-    cfg.IsAsync = true
-    
-    // Perform the MSM operation.
-    e = Msm(scalars, points, &cfg, out)
-    
-    if e != cr.CudaSuccess {
-        panic(e)
-    }
-    
-    // Allocate host memory for the results and copy the results from the device.
-    outHost := make(core.HostSlice[Projective], 1)
-    cr.SynchronizeStream(&stream)
-    outHost.CopyFromDevice(&out)
-    
-    // Free the device memory allocated for the results.
-    out.Free()
-}
-```
-
-## MSM Method
-
-```go
-func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *core.MSMConfig, results core.HostOrDeviceSlice) cr.CudaError
-```
-
-### Parameters
-
- **scalars**: A slice containing the scalars for multiplication. It can reside either in host memory or device memory.
- **points**: A slice containing the points to be multiplied with scalars. Like scalars, these can also be in host or device memory.
- **cfg**: A pointer to an `MSMConfig` object, which contains various configuration options for the MSM operation.
- **results**: A slice where the results of the MSM operation will be stored. This slice can be in host or device memory.
-
-### Return Value
-
- **CudaError**: Returns a CUDA error code indicating the success or failure of the MSM operation.
-
-## MSMConfig
-
-The `MSMConfig` structure holds configuration parameters for the MSM operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
-
-```go
-type MSMConfig struct {
-    Ctx cr.DeviceContext
-    PrecomputeFactor int32
-    C int32
-    Bitsize int32
-    LargeBucketFactor int32
-    batchSize int32
-    areScalarsOnDevice bool
-    AreScalarsMontgomeryForm bool
-    arePointsOnDevice bool
-    ArePointsMontgomeryForm bool
-    areResultsOnDevice bool
-    IsBigTriangle bool
-    IsAsync bool
-}
-```
-
-### Fields
-
- **Ctx**: Device context containing details like device id and stream.
- **PrecomputeFactor**: Controls the number of extra points to pre-compute.
- **C**: Window bitsize, a key parameter in the "bucket method" for MSM.
- **Bitsize**: Number of bits of the largest scalar.
- **LargeBucketFactor**: Sensitivity to frequently occurring buckets.
- **batchSize**: Number of results to compute in one batch.
- **areScalarsOnDevice**: Indicates if scalars are located on the device.
- **AreScalarsMontgomeryForm**: True if scalars are in Montgomery form.
- **arePointsOnDevice**: Indicates if points are located on the device.
- **ArePointsMontgomeryForm**: True if point coordinates are in Montgomery form.
- **areResultsOnDevice**: Indicates if results are stored on the device.
- **IsBigTriangle**: If `true` MSM will run in Large triangle accumulation if `false` Bucket accumulation will be chosen. Default value: false.
- **IsAsync**: If true, runs MSM asynchronously.
-
-### Default Configuration
-
-Use `GetDefaultMSMConfig` to obtain a default configuration, which can then be customized as needed.
-
-```go
-func GetDefaultMSMConfig() MSMConfig
-```
-
-
-## How do I toggle between the supported algorithms?
-
-When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.Ctx.IsBigTriangle = false` will activate Bucket accumulation.
-
-```go
-...
-
-// Obtain the default MSM configuration.
-cfg := GetDefaultMSMConfig()
-
-cfg.Ctx.IsBigTriangle = true
-
-...
-```
-
-## How do I toggle between MSM modes?
-
-Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `MSM` function.
-
-The number of results is interpreted from the size of `var out core.DeviceSlice`. Thus its important when allocating memory for `var out core.DeviceSlice` to make sure that you are allocating `<number of results> X <size of a single point>`.
-
-```go
-... 
-
-batchSize := 3
-var p G2Projective
-var out core.DeviceSlice
-out.Malloc(batchSize*p.Size(), p.Size())
-
-...
-```
-
-## Support for G2 group
-
-To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled.
-
-```bash
-./build.sh bls12_381 ON
-```
-
-Now when importing `icicle`, you should have access to G2 features.
-
-```go
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-)
-```
-
-These features include `G2Projective` and `G2Affine` points as well as a `G2Msm` method.
-
-```go
-...
-
-cfg := GetDefaultMSMConfig()
-size := 1 << 12
-batchSize := 3
-totalSize := size * batchSize
-scalars := GenerateScalars(totalSize)
-points := G2GenerateAffinePoints(totalSize)
-
-var p G2Projective
-var out core.DeviceSlice
-out.Malloc(batchSize*p.Size(), p.Size())
-G2Msm(scalars, points, &cfg, out)
-
-...
-```
-
-`G2Msm` works the same way as normal MSM, the difference is that it uses G2 Points.
-
-Additionally when you are building your application make sure to use the g2 feature flag
-
-```bash
-go build -tags=g2
-```
--- a/docs/docs/icicle/golang-bindings/ntt.md
+++ b/docs/docs/icicle/golang-bindings/ntt.md
@@ -1,100 +0,0 @@
-# NTT
-
-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`
-
-## NTT Example
-
-```go
-package main
-
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func Main() {
-    // Obtain the default NTT configuration with a predefined coset generator.
-    cfg := GetDefaultNttConfig()
-    
-    // Define the size of the input scalars.
-    size := 1 << 18
-
-    // Generate scalars for the NTT operation.
-    scalars := GenerateScalars(size)
-
-    // Set the direction of the NTT (forward or inverse).
-    dir := core.KForward
-
-    // Allocate memory for the results of the NTT operation.
-    results := make(core.HostSlice[ScalarField], size)
-
-    // Perform the NTT operation.
-    err := Ntt(scalars, dir, &cfg, results)
-    if err != cr.CudaSuccess {
-        panic("NTT operation failed")
-    }
-}
-```
-
-## NTT Method
-
-```go
-func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError
-```
-
-### Parameters
-
- **scalars**: A slice containing the input scalars for the transform. It can reside either in host memory or device memory.
- **dir**: The direction of the NTT operation (`KForward` or `KInverse`).
- **cfg**: A pointer to an `NTTConfig` object, containing configuration options for the NTT operation.
- **results**: A slice where the results of the NTT operation will be stored. This slice can be in host or device memory.
-
-### Return Value
-
- **CudaError**: Returns a CUDA error code indicating the success or failure of the NTT operation.
-
-## NTT Configuration (NTTConfig)
-
-The `NTTConfig` structure holds configuration parameters for the NTT operation, allowing customization of its behavior to optimize performance based on the specifics of your protocol.
-
-```go
-type NTTConfig[T any] struct {
-    Ctx cr.DeviceContext
-    CosetGen T
-    BatchSize int32
-    Ordering Ordering
-    areInputsOnDevice  bool
-    areOutputsOnDevice bool
-    IsAsync bool
-}
-```
-
-### Fields
-
- **Ctx**: Device context containing details like device ID and stream ID.
- **CosetGen**: Coset generator used for coset (i)NTTs, defaulting to no coset being used.
- **BatchSize**: The number of NTTs to compute in one operation, defaulting to 1.
- **Ordering**: Ordering of inputs and outputs (`KNN`, `KNR`, `KRN`, `KRR`, `KMN`, `KNM`), affecting how data is arranged.
- **areInputsOnDevice**: Indicates if input scalars are located on the device.
- **areOutputsOnDevice**: Indicates if results are stored on the device.
- **IsAsync**: Controls whether the NTT operation runs asynchronously.
-
-### Default Configuration
-
-Use `GetDefaultNTTConfig` to obtain a default configuration, customizable as needed.
-
-```go
-func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T]
-```
-
-### Initializing the NTT Domain
-
-Before performing NTT operations, it's necessary to initialize the NTT domain; it only needs to be called once per GPU since the twiddles are cached.
-
-```go
-func InitDomain(primitiveRoot ScalarField, ctx cr.DeviceContext, fastTwiddles bool) core.IcicleError
-```
-
-This function initializes the domain with a given primitive root, optionally using fast twiddle factors to optimize the computation.
--- a/docs/docs/icicle/golang-bindings/vec-ops.md
+++ b/docs/docs/icicle/golang-bindings/vec-ops.md
@@ -1,132 +0,0 @@
-# Vector Operations
-
-## Overview
-
-The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
-
-## Example
-
-### Vector addition
-
-```go
-package main
-
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
-
-    // Perform vector addition
-    err := VecOp(a, b, out, cfg, core.Add)
-    if err != cr.CudaSuccess {
-        panic("Vector addition failed")
-    }
-}
-```
-
-### Vector Subtraction
-
-```go
-package main
-
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
-
-    // Perform vector subtraction
-    err := VecOp(a, b, out, cfg, core.Sub)
-    if err != cr.CudaSuccess {
-        panic("Vector subtraction failed")
-    }
-}
-```
-
-### Vector Multiplication
-
-```go
-package main
-
-import (
-    "github.com/ingonyama-zk/icicle/wrappers/golang/core"
-    cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func main() {
-    testSize := 1 << 12
-    a := GenerateScalars(testSize)
-    b := GenerateScalars(testSize)
-    out := make(core.HostSlice[ScalarField], testSize)
-    cfg := core.DefaultVecOpsConfig()
-
-    // Perform vector multiplication
-    err := VecOp(a, b, out, cfg, core.Mul)
-    if err != cr.CudaSuccess {
-        panic("Vector multiplication failed")
-    }
-}
-```
-
-## VecOps Method
-
-```go
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError)
-```
-
-### Parameters
-
- **a**: The first input vector.
- **b**: The second input vector.
- **out**: The output vector where the result of the operation will be stored.
- **config**: A `VecOpsConfig` object containing various configuration options for the vector operations.
- **op**: The operation to perform, specified as one of the constants (`Sub`, `Add`, `Mul`) from the `VecOps` type.
-
-### Return Value
-
- **CudaError**: Returns a CUDA error code indicating the success or failure of the vector operation.
-
-## VecOpsConfig
-
-The `VecOpsConfig` structure holds configuration parameters for the vector operations, allowing customization of its behavior.
-
-```go
-type VecOpsConfig struct {
-    Ctx cr.DeviceContext
-    isAOnDevice bool
-    isBOnDevice bool
-    isResultOnDevice bool
-    IsResultMontgomeryForm bool
-    IsAsync bool
-}
-```
-
-### Fields
-
- **Ctx**: Device context containing details like device ID and stream ID.
- **isAOnDevice**: Indicates if vector `a` is located on the device.
- **isBOnDevice**: Indicates if vector `b` is located on the device.
- **isResultOnDevice**: Specifies where the result vector should be stored (device or host memory).
- **IsResultMontgomeryForm**: Determines if the result vector should be in Montgomery form.
- **IsAsync**: Controls whether the vector operation runs asynchronously.
-
-### Default Configuration
-
-Use `DefaultVecOpsConfig` to obtain a default configuration, customizable as needed.
-
-```go
-func DefaultVecOpsConfig() VecOpsConfig
-```
--- a/docs/docs/icicle/overview.md
+++ b/docs/docs/icicle/overview.md
@@ -1,8 +1,8 @@
 # What is ICICLE?

-[![GitHub Release](https://img.shields.io/github/v/release/ingonyama-zk/icicle)](https://github.com/ingonyama-zk/icicle/releases)
+[![Static Badge](https://img.shields.io/badge/Latest-v1.4.0-8a2be2)](https://github.com/ingonyama-zk/icicle/releases)

-![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-blue)
+![Static Badge](https://img.shields.io/badge/Machines%20running%20ICICLE-544-lightblue)



--- a/docs/docs/icicle/primitives/msm.md
+++ b/docs/docs/icicle/primitives/msm.md
@@ -49,17 +49,13 @@ Accelerating MSM is crucial to a ZK protocol's performance due to the [large per

 You can learn more about how MSMs work from this [video](https://www.youtube.com/watch?v=Bl5mQA7UL2I) and from our resource list on [Ingopedia](https://www.ingonyama.com/ingopedia/msm).

+# Using MSM
+
 ## Supported curves

 MSM supports the following curves:

-`bls12-377`, `bls12-381`, `bn254`, `bw6-761`, `grumpkin`
-
-
-## Supported Bindings
-
- [Golang](../golang-bindings/msm.md)
- [Rust](../rust-bindings//msm.md)
+`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`

 ## Supported algorithms

@@ -83,6 +79,25 @@ Large Triangle Accumulation is a method for optimizing MSM which focuses on redu

 The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a <b>large batch of smaller MSM computations</b>.

+
+### How do I toggle between the supported algorithms?
+
+When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
+
+```rust
+...
+
+let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
+
+// is_big_triangle will determine which algorithm to use 
+cfg_bls12377.is_big_triangle = true;
+
+msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+...
+```
+
+You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
+
 ## MSM Modes

 ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
@@ -94,3 +109,54 @@ Batch MSM allows you to run many MSMs with a single API call, Single MSM will la
 This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs leveraging the parallel processing capabilities of GPUs.

 Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
+
+### How do I toggle between MSM modes?
+
+Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
+
+```rust
+...
+
+let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
+
+...
+```
+
+In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
+
+
+In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
+
+```rust
+...
+
+let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
+msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
+
+...
+```
+
+Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
+
+
+## Support for G2 group
+
+MSM also supports G2 group. 
+
+Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
+
+```rust
+... 
+
+let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
+let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
+let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
+let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
+
+msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
+
+...
+```
+
+Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.
--- a/docs/docs/icicle/primitives/ntt.md
+++ b/docs/docs/icicle/primitives/ntt.md
@@ -28,10 +28,6 @@ NTT supports the following curves:

 `bls12-377`, `bls12-381`, `bn-254`, `bw6-761`

-## Supported Bindings
-
- [Golang](../golang-bindings/ntt.md)
- [Rust](../rust-bindings/ntt.md)

 ### Examples

@@ -39,6 +35,87 @@ NTT supports the following curves:

 - [C++ API examples](https://github.com/ingonyama-zk/icicle/blob/d84ffd2679a4cb8f8d1ac2ad2897bc0b95f4eeeb/examples/c%2B%2B/ntt/example.cu#L1)

+## NTT API overview
+
+```rust
+pub fn ntt<F>(
+    input: &HostOrDeviceSlice<F>,
+    dir: NTTDir,
+    cfg: &NTTConfig<F>,
+    output: &mut HostOrDeviceSlice<F>,
+) -> IcicleResult<()>
+```
+
+`ntt:ntt` expects:
+
+`input` - buffer to read the inputs of the NTT from. <br/>
+`dir` - whether to compute forward or inverse NTT. <br/>
+`cfg` - config used to specify extra arguments of the NTT. <br/>
+`output` - buffer to write the NTT outputs into. Must be of the same  size as input.
+
+The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
+
+### NTT Config
+
+```rust
+pub struct NTTConfig<'a, S> {
+    pub ctx: DeviceContext<'a>,
+    pub coset_gen: S,
+    pub batch_size: i32,
+    pub ordering: Ordering,
+    are_inputs_on_device: bool,    
+    are_outputs_on_device: bool,
+    pub is_async: bool,
+    pub ntt_algorithm: NttAlgorithm,
+}
+```
+
+The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
+
+#### Fields
+
+- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
+
+- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
+
+- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
+
+- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
+
+- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
+
+- **`are_outputs_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. If the inputs and outputs are the same pointer NTT will be computed in place.
+
+- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
+
+- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
+`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
+`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to 
+
+
+#### Usage
+
+Example initialization with default settings:
+
+```rust
+let default_config = NTTConfig::default();
+```
+
+Customizing the configuration:
+
+```rust
+let custom_config = NTTConfig {
+    ctx: custom_device_context,
+    coset_gen: my_coset_generator,
+    batch_size: 10,
+    ordering: Ordering::kRN,
+    are_inputs_on_device: true,
+    are_outputs_on_device: true,
+    is_async: false,
+    ntt_algorithm: NttAlgorithm::MixedRadix,
+};
+```
+
 ### Ordering

 The `Ordering` enum defines how inputs and outputs are arranged for the NTT operation, offering flexibility in handling data according to different algorithmic needs or compatibility requirements. It primarily affects the sequencing of data points for the transform, which can influence both performance and the compatibility with certain algorithmic approaches. The available ordering options are:
@@ -63,6 +140,15 @@ NTT also supports two different modes `Batch NTT` and `Single NTT`

 Batch NTT allows you to run many NTTs with a single API call, Single MSM will launch a single MSM computation.

+You may toggle between single and batch NTT by simply configure `batch_size` to be larger then 1 in your `NTTConfig`.
+
+```rust
+let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
+cfg.batch_size = 10 // your ntt using this config will run in batch mode.
+```
+
+`batch_size=1` would keep our NTT in single NTT mode.
+
 Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.

 **Single NTT Mode**
@@ -146,11 +232,9 @@ Mixed Radix can reduce the number of stages required to compute for large inputs

 ### Which algorithm should I choose ?

-Both work only on inputs of power of 2 (e.g., 256, 512, 1024).
+Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Its also more suited for inputs which are power of 2 (e.g., 256, 512, 1024). Radix 2 won't necessarily perform better for smaller `logn` with larger batches.

-Radix 2 is faster for small NTTs. A small NTT would be around logN = 16 and batch size 1. Radix 2 won't necessarily perform better for smaller `logn` with larger batches.
-
-Mixed radix on the other hand works better for larger NTTs with larger input sizes.
+Mixed radix on the other hand better for larger NTTs with larger input sizes which are not necessarily power of 2.

 Performance really depends on logn size, batch size, ordering, inverse, coset, coeff-field and which GPU you are using.

--- a/docs/docs/icicle/primitives/overview.md
+++ b/docs/docs/icicle/primitives/overview.md
@@ -6,6 +6,5 @@ This section of the documentation is dedicated to the ICICLE primitives, we will
 ## Supported primitives


- [MSM](./msm.md)
- [NTT](./ntt.md)
+- [MSM](./msm)
 - [Poseidon Hash](./poseidon.md)
--- a/docs/docs/icicle/rust-bindings/msm.md
+++ b/docs/docs/icicle/rust-bindings/msm.md
@@ -1,172 +0,0 @@
-# MSM
-
-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
-
-## Example
-
-```rust
-use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarCfg};
-use icicle_core::{curve::Curve, msm, traits::GenerateRandom};
-use icicle_cuda_runtime::{memory::HostOrDeviceSlice, stream::CudaStream};
-
-fn main() {
-    let size: usize = 1 << 10; // Define the number of points and scalars
-
-    // Generate random points and scalars
-    println!("Generating random G1 points and scalars for BN254...");
-    let points = CurveCfg::generate_random_affine_points(size);
-    let scalars = ScalarCfg::generate_random(size);
-
-    // Wrap points and scalars in HostOrDeviceSlice for MSM
-    let points_host = HostOrDeviceSlice::Host(points);
-    let scalars_host = HostOrDeviceSlice::Host(scalars);
-
-    // Allocate memory on the CUDA device for MSM results
-    let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).expect("Failed to allocate CUDA memory for MSM results");
-
-    // Create a CUDA stream for asynchronous execution
-    let stream = CudaStream::create().expect("Failed to create CUDA stream");
-    let mut cfg = msm::MSMConfig::default();
-    cfg.ctx.stream = &stream;
-    cfg.is_async = true; // Enable asynchronous execution
-
-    // Execute MSM on the device
-    println!("Executing MSM on device...");
-    msm::msm(&scalars_host, &points_host, &cfg, &mut msm_results).expect("Failed to execute MSM");
-
-    // Synchronize CUDA stream to ensure MSM execution is complete
-    stream.synchronize().expect("Failed to synchronize CUDA stream");
-
-    // Optionally, move results to host for further processing or printing
-    println!("MSM execution complete.");
-}
-```
-
-## MSM API Overview
-
-```rust
-pub fn msm<C: Curve>(
-    scalars: &HostOrDeviceSlice<C::ScalarField>,
-    points: &HostOrDeviceSlice<Affine<C>>,
-    cfg: &MSMConfig,
-    results: &mut HostOrDeviceSlice<Projective<C>>,
-) -> IcicleResult<()>
-```
-
-### Parameters
-
- **`scalars`**: A buffer containing the scalar values to be multiplied with corresponding points.
- **`points`**: A buffer containing the points to be multiplied by the scalars.
- **`cfg`**: MSM configuration specifying additional parameters for the operation.
- **`results`**: A buffer where the results of the MSM operations will be stored.
-
-### MSM Config
-
-```rust
-pub struct MSMConfig<'a> {
-    pub ctx: DeviceContext<'a>,
-    points_size: i32,
-    pub precompute_factor: i32,
-    pub c: i32,
-    pub bitsize: i32,
-    pub large_bucket_factor: i32,
-    batch_size: i32,
-    are_scalars_on_device: bool,
-    pub are_scalars_montgomery_form: bool,
-    are_points_on_device: bool,
-    pub are_points_montgomery_form: bool,
-    are_results_on_device: bool,
-    pub is_big_triangle: bool,
-    pub is_async: bool,
-}
-```
-
- **`ctx: DeviceContext`**: Specifies the device context, device id and the CUDA stream for asynchronous execution.
- **`point_size: i32`**: 
- **`precompute_factor: i32`**: Determines the number of extra points to pre-compute for each point, affecting memory footprint and performance.
- **`c: i32`**: The "window bitsize," a parameter controlling the computational complexity and memory footprint of the MSM operation.
- **`bitsize: i32`**: The number of bits of the largest scalar, typically equal to the bit size of the scalar field.
- **`large_bucket_factor: i32`**: Adjusts the algorithm's sensitivity to frequently occurring buckets, useful for non-uniform scalar distributions.
- **`batch_size: i32`**: The number of MSMs to compute in a single batch, for leveraging parallelism.
- **`are_scalars_montgomery_form`**: Set to `true` if scalars are in montgomery form.
- **`are_points_montgomery_form`**: Set to `true` if points are in montgomery form.
- **`are_scalars_on_device: bool`**, **`are_points_on_device: bool`**, **`are_results_on_device: bool`**: Indicate whether the corresponding buffers are on the device memory.
- **`is_big_triangle`**: If `true` MSM will run in Large triangle accumulation if `false` Bucket accumulation will be chosen. Default value: false.
- **`is_async: bool`**: Whether to perform the MSM operation asynchronously.
-
-### Usage
-
-The `msm` function is designed to compute the sum of multiple scalar-point multiplications efficiently. It supports both single MSM operations and batched operations for increased performance. The configuration allows for detailed control over the execution environment and performance characteristics of the MSM operation.
-
-When performing MSM operations, it's crucial to match the size of the `scalars` and `points` arrays correctly and ensure that the `results` buffer is appropriately sized to hold the output. The `MSMConfig` should be set up to reflect the specifics of the operation, including whether the operation should be asynchronous and any device-specific settings.
-
-## How do I toggle between the supported algorithms?
-
-When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
-
-```rust
-...
-
-let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
-
-// is_big_triangle will determine which algorithm to use 
-cfg_bls12377.is_big_triangle = true;
-
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
-...
-```
-
-You may reference the rust code [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L54).
-
-
-## How do I toggle between MSM modes?
-
-Toggling between MSM modes occurs automatically based on the number of results you are expecting from the `msm::msm` function. If you are expecting an array of `msm_results`, ICICLE will automatically split `scalars` and `points` into equal parts and run them as multiple MSMs in parallel.
-
-```rust
-...
-
-let mut msm_result: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_result).unwrap();
-
-...
-```
-
-In the example above we allocate a single expected result which the MSM method will interpret as `batch_size=1` and run a single MSM.
-
-
-In the next example, we are expecting 10 results which sets `batch_size=10` and runs 10 MSMs in batch mode.
-
-```rust
-...
-
-let mut msm_results: HostOrDeviceSlice<'_, G1Projective> = HostOrDeviceSlice::cuda_malloc(10).unwrap();
-msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();
-
-...
-```
-
-Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).
-
-## Support for G2 group
-
-MSM also supports G2 group. 
-
-Using MSM in G2 requires a G2 config, and of course your Points should also be G2 Points.
-
-```rust
-... 
-
-let scalars = HostOrDeviceSlice::Host(upper_scalars[..size].to_vec());
-let g2_points = HostOrDeviceSlice::Host(g2_upper_points[..size].to_vec());
-let mut g2_msm_results: HostOrDeviceSlice<'_, G2Projective> = HostOrDeviceSlice::cuda_malloc(1).unwrap();
-let mut g2_cfg = msm::get_default_msm_config::<G2CurveCfg>();
-
-msm::msm(&scalars, &g2_points, &g2_cfg, &mut g2_msm_results).unwrap();
-
-...
-```
-
-Here you can [find an example](https://github.com/ingonyama-zk/icicle/blob/5a96f9937d0a7176d88c766bd3ef2062b0c26c37/examples/rust/msm/src/main.rs#L114) of MSM on G2 Points.
--- a/docs/docs/icicle/rust-bindings/multi-gpu.md
+++ b/docs/docs/icicle/rust-bindings/multi-gpu.md
@@ -4,54 +4,6 @@ To learn more about the theory of Multi GPU programming refer to [this part](../

 Here we will cover the core multi GPU apis and a [example](#a-multi-gpu-example)

-
-## A Multi GPU example
-
-In this example we will display how you can
-
-1. Fetch the number of devices installed on a machine
-2. For every GPU launch a thread and set an active device per thread.
-3. Execute a MSM on each GPU
-
-
-
-```rust
-
-...
-
-let device_count = get_device_count().unwrap();
-
-(0..device_count)
-        .into_par_iter()
-        .for_each(move |device_id| {
-          set_device(device_id).unwrap();
-
-          // you can allocate points and scalars_d here
-
-          let mut cfg = MSMConfig::default_for_device(device_id);
-          cfg.ctx.stream = &stream;
-          cfg.is_async = true;
-          cfg.are_scalars_montgomery_form = true;
-          msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
-
-          // collect and process results
-        })
-
-...
-```
-
-
-We use `get_device_count` to fetch the number of connected devices, device IDs will be `0, 1, 2, ..., device_count - 1`
-
-[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
-
-We then call `set_device(device_id).unwrap();` it should set the context of that thread to the selected `device_id`.
-
-Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`, behind the scene this will create for us a `DeviceContext` configured for that specific GPU. 
-
-We finally call our `msm` method.
-
-
 ## Device management API

 To streamline device management we offer as part of `icicle-cuda-runtime` package methods for dealing with devices.
@@ -200,3 +152,50 @@ let device_id: i32 = 0; // Example device ID
 check_device(device_id);
 // Ensures that the current context is correctly set for the specified device ID.
 ```
+
+
+## A Multi GPU example
+
+In this example we will display how you can
+
+1. Fetch the number of devices installed on a machine
+2. For every GPU launch a thread and set a active device per thread.
+3. Execute a MSM on each GPU
+
+
+
+```rust
+
+...
+
+let device_count = get_device_count().unwrap();
+
+(0..device_count)
+        .into_par_iter()
+        .for_each(move |device_id| {
+          set_device(device_id).unwrap();
+
+          // you can allocate points and scalars_d here
+
+          let mut cfg = MSMConfig::default_for_device(device_id);
+          cfg.ctx.stream = &stream;
+          cfg.is_async = true;
+          cfg.are_scalars_montgomery_form = true;
+          msm(&scalars_d, &HostOrDeviceSlice::on_host(points), &cfg, &mut msm_results).unwrap();
+
+          // collect and process results
+        })
+
+...
+```
+
+
+We use `get_device_count` to fetch the number of connected devices, device IDs will be `0...device_count-1`
+
+[`into_par_iter`](https://docs.rs/rayon/latest/rayon/iter/trait.IntoParallelIterator.html#tymethod.into_par_iter) is a parallel iterator, you should expect it to launch a thread for every iteration.
+
+We then call `set_device(device_id).unwrap();` it should set the context of that thread to the selected `device_id`.
+
+Any data you now allocate from the context of this thread will be linked to the `device_id`. We create our `MSMConfig` with the selected device ID `let mut cfg = MSMConfig::default_for_device(device_id);`, behind the scene this will create for us a `DeviceContext` configured for that specific GPU. 
+
+We finally call our `msm` method.
--- a/docs/docs/icicle/rust-bindings/ntt.md
+++ b/docs/docs/icicle/rust-bindings/ntt.md
@@ -1,195 +0,0 @@
-# NTT
-
-### Supported curves
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`
-
-## Example 
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::{ntt::{self, NTT}, traits::GenerateRandom};
-use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice, stream::CudaStream};
-
-fn main() {
-    let size = 1 << 12; // Define the size of your input, e.g., 2^10
-
-    let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(
-        size.try_into()
-            .unwrap(),
-    )
-
-    // Generate random inputs
-    println!("Generating random inputs...");
-    let scalars = HostOrDeviceSlice::Host(ScalarCfg::generate_random(size));
-
-    // Allocate memory on CUDA device for NTT results
-    let mut ntt_results: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::cuda_malloc(size).expect("Failed to allocate CUDA memory");
-
-    // Create a CUDA stream
-    let stream = CudaStream::create().expect("Failed to create CUDA stream");
-    let ctx = DeviceContext::default(); // Assuming default device context
-    ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
-
-    // Configure NTT
-    let mut cfg = ntt::NTTConfig::default();
-    cfg.ctx.stream = &stream;
-    cfg.is_async = true; // Set to true for asynchronous execution
-
-    // Execute NTT on device
-    println!("Executing NTT on device...");
-    ntt::ntt(&scalars, ntt::NTTDir::kForward, &cfg, &mut ntt_results).expect("Failed to execute NTT");
-
-    // Synchronize CUDA stream to ensure completion
-    stream.synchronize().expect("Failed to synchronize CUDA stream");
-
-    // Optionally, move results to host for further processing or verification
-    println!("NTT execution complete.");
-}
-```
-
-## NTT API overview
-
-```rust
-pub fn ntt<F>(
-    input: &HostOrDeviceSlice<F>,
-    dir: NTTDir,
-    cfg: &NTTConfig<F>,
-    output: &mut HostOrDeviceSlice<F>,
-) -> IcicleResult<()>
-```
-
-`ntt:ntt` expects:
-
-`input` - buffer to read the inputs of the NTT from. <br/>
-`dir` - whether to compute forward or inverse NTT. <br/>
-`cfg` - config used to specify extra arguments of the NTT. <br/>
-`output` - buffer to write the NTT outputs into. Must be of the same  size as input.
-
-The `input` and `output` buffers can be on device or on host. Being on host means that they will be transferred to device during runtime.
-
-
-### NTT Config
-
-```rust
-pub struct NTTConfig<'a, S> {
-    pub ctx: DeviceContext<'a>,
-    pub coset_gen: S,
-    pub batch_size: i32,
-    pub ordering: Ordering,
-    are_inputs_on_device: bool,    
-    are_outputs_on_device: bool,
-    pub is_async: bool,
-    pub ntt_algorithm: NttAlgorithm,
-}
-```
-
-The `NTTConfig` struct is a configuration object used to specify parameters for an NTT instance.
-
-#### Fields
-
- **`ctx: DeviceContext<'a>`**: Specifies the device context, including the device ID and the stream ID.
-
- **`coset_gen: S`**: Defines the coset generator used for coset (i)NTTs. By default, this is set to `S::one()`, indicating that no coset is being used.
-
- **`batch_size: i32`**: Determines the number of NTTs to compute in a single batch. The default value is 1, meaning that operations are performed on individual inputs without batching. Batch processing can significantly improve performance by leveraging parallelism in GPU computations.
-
- **`ordering: Ordering`**: Controls the ordering of inputs and outputs for the NTT operation. This field can be used to specify decimation strategies (in time or in frequency) and the type of butterfly algorithm (Cooley-Tukey or Gentleman-Sande). The ordering is crucial for compatibility with various algorithmic approaches and can impact the efficiency of the NTT.
-
- **`are_inputs_on_device: bool`**: Indicates whether the input data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
-
- **`are_outputs_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. If the inputs and outputs are the same pointer NTT will be computed in place.
-
- **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. When set to `true`, the NTT function will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity and correctness.
-
- **`ntt_algorithm: NttAlgorithm`**: Can be one of `Auto`, `Radix2`, `MixedRadix`.
-`Auto` will select `Radix 2` or `Mixed Radix` algorithm based on heuristics.
-`Radix2` and `MixedRadix` will force the use of an algorithm regardless of the input size or other considerations. You should use one of these options when you know for sure that you want to 
-
-
-#### Usage
-
-Example initialization with default settings:
-
-```rust
-let default_config = NTTConfig::default();
-```
-
-Customizing the configuration:
-
-```rust
-let custom_config = NTTConfig {
-    ctx: custom_device_context,
-    coset_gen: my_coset_generator,
-    batch_size: 10,
-    ordering: Ordering::kRN,
-    are_inputs_on_device: true,
-    are_outputs_on_device: true,
-    is_async: false,
-    ntt_algorithm: NttAlgorithm::MixedRadix,
-};
-```
-
-
-### Modes
-
-NTT supports two different modes `Batch NTT` and `Single NTT`
-
-You may toggle between single and batch NTT by simply configure `batch_size` to be larger then 1 in your `NTTConfig`.
-
-```rust
-let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
-cfg.batch_size = 10 // your ntt using this config will run in batch mode.
-```
-
-`batch_size=1` would keep our NTT in single NTT mode.
-
-Deciding weather to use `batch NTT` vs `single NTT` is highly dependent on your application and use case.
-
-### Initializing the NTT Domain
-
-Before performing NTT operations, its necessary to initialize the NTT domain, It only needs to be called once per GPU since the twiddles are cached.
-
-```rust
-ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
-```
-
-### `initialize_domain`
-
-```rust
-pub fn initialize_domain<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: NTT<F>;
-```
-
-#### Parameters
-
- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
-
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
-
-#### Returns
-
- **`IcicleResult<()>`**: Will return an error if the operation fails.
-
-### `initialize_domain_fast_twiddles_mode`
-
-Similar to `initialize_domain`, `initialize_domain_fast_twiddles_mode` is a faster implementation and can be used for larger NTTs.
-
-```rust
-pub fn initialize_domain_fast_twiddles_mode<F>(primitive_root: F, ctx: &DeviceContext) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: NTT<F>;
-```
-
-#### Parameters
-
- **`primitive_root`**: The primitive root of unity, chosen based on the maximum NTT size required for the computations. It must be of an order that is a power of two. This root is used to generate twiddle factors that are essential for the NTT operations.
-
- **`ctx`**: A reference to a `DeviceContext` specifying which device and stream the computation should be executed on.
-
-#### Returns
-
- **`IcicleResult<()>`**: Will return an error if the operation fails.
--- a/docs/docs/icicle/rust-bindings/vec-ops.md
+++ b/docs/docs/icicle/rust-bindings/vec-ops.md
@@ -1,159 +0,0 @@
-# Vector Operations API
-
-Our vector operations API which is part of `icicle-cuda-runtime` package, includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. 
-
-
-## Supported curves
-
-Vector operations are supported on the following curves:
-
-`bls12-377`, `bls12-381`, `bn-254`, `bw6-761`, `grumpkin`
-
-## Examples
-
-### Addition of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{add_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-add_scalars(&a, &b, &mut result, &cfg).unwrap();
-```
-
-### Subtraction of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{sub_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let b: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-sub_scalars(&a, &b, &mut result, &cfg).unwrap();
-```
-
-### Multiplication of Scalars
-
-```rust
-use icicle_bn254::curve::{ScalarCfg, ScalarField};
-use icicle_core::vec_ops::{mul_scalars};
-
-let test_size = 1 << 18;
-
-let a: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(F::Config::generate_random(test_size));
-let ones: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::one(); test_size]);
-let mut result: HostOrDeviceSlice<'_, ScalarField> = HostOrDeviceSlice::on_host(vec![F::zero(); test_size]);
-
-let cfg = VecOpsConfig::default();
-mul_scalars(&a, &ones, &mut result, &cfg).unwrap();
-```
-
-
-## Vector Operations Configuration
-
-The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
-
-### `VecOpsConfig`
-
-Defines configuration parameters for vector operations.
-
-```rust
-pub struct VecOpsConfig<'a> {
-    pub ctx: DeviceContext<'a>,
-    is_a_on_device: bool,
-    is_b_on_device: bool,
-    is_result_on_device: bool,
-    is_result_montgomery_form: bool,
-    pub is_async: bool,
-}
-```
-
-#### Fields
-
- **`ctx: DeviceContext<'a>`**: Specifies the device context for the operation, including the device ID and memory pool.
- **`is_a_on_device`**: Indicates if the first operand vector resides in device memory.
- **`is_b_on_device`**: Indicates if the second operand vector resides in device memory.
- **`is_result_on_device`**: Specifies if the result vector should be stored in device memory.
- **`is_result_montgomery_form`**: Determines if the result should be in Montgomery form.
- **`is_async`**: Enables asynchronous operation. If `true`, operations are non-blocking; otherwise, they block the current thread.
-
-### Default Configuration
-
-`VecOpsConfig` can be initialized with default settings tailored for a specific device:
-
-```
-let cfg = VecOpsConfig::default();
-```
-
-These are the default settings.
-
-```rust
-impl<'a> Default for VecOpsConfig<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
-    }
-}
-
-impl<'a> VecOpsConfig<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        VecOpsConfig {
-            ctx: DeviceContext::default_for_device(device_id),
-            is_a_on_device: false,
-            is_b_on_device: false,
-            is_result_on_device: false,
-            is_result_montgomery_form: false,
-            is_async: false,
-        }
-    }
-}
-```
-
-## Vector Operations
-
-Vector operations are implemented through the `VecOps` trait, these traits are implemented for all [supported curves](#supported-curves) providing methods for addition, subtraction, and multiplication of vectors.
-
-### `VecOps` Trait
-
-```rust
-pub trait VecOps<F> {
-    fn add(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-
-    fn sub(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-
-    fn mul(
-        a: &HostOrDeviceSlice<F>,
-        b: &HostOrDeviceSlice<F>,
-        result: &mut HostOrDeviceSlice<F>,
-        cfg: &VecOpsConfig,
-    ) -> IcicleResult<()>;
-}
-```
-
-#### Methods
-
-All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place.
-
- **`add`**: Computes the element-wise sum of two vectors.
- **`sub`**: Computes the element-wise difference between two vectors.
- **`mul`**: Performs element-wise multiplication of two vectors.
--- a/docs/docs/icicle/supporting-additional-curves.md
+++ b/docs/docs/icicle/supporting-additional-curves.md
@@ -6,56 +6,7 @@ We understand the need for ZK developers to use different curves, some common so

 ICICLE core is very generic by design so all algorithms and primitives are designed to work based of configuration files [selected during compile](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh) time. This is why we compile ICICLE Core per curve.

-To add support for a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.
-
-### Adding curve_name_params.cuh
-
-Start by copying `bn254_params.cuh` contents in your params file. Params should include:
- - **fq_config** - parameters of the Base field.
-    - **limbs_count** - `ceil(field_byte_size / 4)`.
-    - **modulus_bit_count** - bit-size of the modulus.
-    - **num_of_reductions** - the number of times to reduce in reduce function. Use 2 if not sure.
-    - **modulus** - modulus of the field.
-    - **modulus_2** - modulus * 2.
-    - **modulus_4** - modulus * 4. 
-    - **neg_modulus** - negated modulus. 
-    - **modulus_wide** - modulus represented as a double-sized integer.
-    - **modulus_squared** - modulus**2 represented as a double-sized integer.
-    - **modulus_squared_2** - 2 * modulus**2 represented as a double-sized integer.
-    - **modulus_squared_4** - 4 * modulus**2 represented as a double-sized integer.
-    - **m** - value used in multiplication. Can be computed as `2**(2*modulus_bit_count) // modulus`. 
-    - **one** - multiplicative identity. 
-    - **zero** - additive identity. 
-    - **montgomery_r** - `2 ** M % modulus` where M is a closest (larger than) bitsize multiple of 32. E.g. 384 or 768 for bls and bw curves respectively
-    - **montgomery_r_inv** - `2 ** (-M) % modulus`
- - **fp_config** - parameters of the Scalar field.
-    Same as fq_config, but with additional arguments:
-    - **omegas_count** - [two-adicity](https://cryptologie.net/article/559/whats-two-adicity/) of the field. And thus the maximum size of NTT.
-    - **omegas** - an array of omegas for NTTs. An array of size `omegas_count`. The ith element is equal to `1.nth_root(2**(2**(omegas_count-i)))`.
-    - **inv** - an array of inverses of powers of two in a field. Ith element is equal to `(2 ** (i+1)) ** -1`.
- - **G1 generators points** - affine coordinates of the generator point.
- - **G2 generators points** - affine coordinates of the extension generator. Remove these if `G2` is not supported.
- - **Weierstrass b value** - base field element equal to value of `b` in the curve equation.
- - **Weierstrass b value G2** - base field element equal to value of `b` for the extension. Remove this if `G2` is not supported.
- 
- :::note
-
- All the params are not in Montgomery form.
- 
- :::
- 
- :::note
-
- To convert number values into `storage` type you can use the following python function
-
-```python
-import struct
-
-def unpack(x, field_size):
-    return ', '.join(["0x" + format(x, '08x') for x in struct.unpack('I' * (field_size) // 4, int(x).to_bytes(field_size, 'little'))])
-```
-
-:::
+To add support a new curve you must create a new file under [`icicle/curves`](https://github.com/ingonyama-zk/icicle/tree/main/icicle/curves). The file should be named `<curve_name>_params.cuh`.

 We also require some changes to [`curve_config.cuh`](https://github.com/ingonyama-zk/icicle/blob/main/icicle/curves/curve_config.cuh#L16-L29), we need to add a new curve id.

@@ -77,40 +28,58 @@ Make sure to modify the [rest of the file](https://github.com/ingonyama-zk/icicl
 Finally we must modify the [`make` file](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L64) to make sure we can compile our new curve.

 ```
-set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;grumpkin;<curve_name>)
+set(SUPPORTED_CURVES bn254;bls12_381;bls12_377;bw6_761;<curve_name>)
 ```

-### Adding Poseidon support
-
-If you want your curve to implement a Poseidon hash function or a tree builder, you will need to pre-calculate its optimized parameters.  
-Copy [constants_template.h](https://github.com/ingonyama-zk/icicle/blob/main/icicle/appUtils/poseidon/constants/constants_template.h) into `icicle/appUtils/poseidon/constants/<CURVE>_poseidon.h`. Run the [constants generation script](https://dev.ingonyama.com/icicle/primitives/poseidon#constants). The script will print the number of partial rounds and generate a `constants.bin` file. Use `xxd -i constants.bin` to parse the file into C declarations. Copy the `unsigned char constants_bin[]` contents inside your new file. Repeat this process for arities 2, 4, 8 and 11.
-
-After you've generated the constants, add your curve in this [SUPPORTED_CURVES_WITH_POSEIDON](https://github.com/ingonyama-zk/icicle/blob/main/icicle/CMakeLists.txt#L72) in the `CMakeLists.txt`.
-
 ## Bindings

-In order to support a new curve in the binding libraries you first must support it in ICICLE core.
+In order to support a new curves in the binding libraries you first must support it in ICICLE core.

 ### Rust

-Go to [rust curves folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves) and copy `icicle-curve-template` to a new folder named `icicle-<curve_name>`.
+Create a new folder named `icicle-<curve_name>` under the [rust wrappers folder](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-curves). Your new directory should look like this.

-Find all the occurrences of `<CURVE>` placeholder inside the crate. (You can use `Ctrl+Shift+F` in VS Code or `grep -nr "<CURVE>"` in bash). You will then need to replace each occurrence with your new curve name.
+```
+└── rust
+    ├── icicle-curves
+        ├── icicle-<curve_name>
+    │   │   ├── Cargo.toml
+    │   │   ├── build.rs
+    │   │   └── src/
+    │   │       ├── curve.rs
+    │   │       ├── lib.rs
+    │   │       ├── msm/
+    │   │       │   └── mod.rs
+    │   │       └── ntt/
+    │   │           └── mod.rs
+```

-#### Limbs
+Lets look at [`ntt/mod.rs`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/src/ntt/mod.rs) for example.

-Go to your curve's `curve.rs` file and set `SCALAR_LIMBS`, `BASE_LIMBS` and `G2_BASE_LIMBS` (if G2 is needed) to a minimum number of `u64` required to store a single scalar field / base field element respectively.  
-e.g. for bn254, scalar field is 254 bit so `SCALAR_LIMBS` is set to 4.
+```
+...

-#### Primitives
+extern "C" {
+    #[link_name = "bn254NTTCuda"]
+    fn ntt_cuda<'a>(
+        input: *const ScalarField,
+        size: usize,
+        is_inverse: bool,
+        config: &NTTConfig<'a, ScalarField>,
+        output: *mut ScalarField,
+    ) -> CudaError;

-If your curve doesn't support some of the primitives (ntt/msm/poseidon/merkle tree/), or you simply don't want to include it, just remove a corresponding module from `src` and then from `lib.rs`
+    #[link_name = "bn254DefaultNTTConfig"]
+    fn default_ntt_config() -> NTTConfig<'static, ScalarField>;

-#### G2
+    #[link_name = "bn254InitializeDomain"]
+    fn initialize_ntt_domain(primitive_root: ScalarField, ctx: &DeviceContext) -> CudaError;
+}

-If your curve doesn't support G2 - remove all the code under `#[cfg(feature = "g2")]` and remove the feature from [Cargo.toml](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml#L29) and [build.rs](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-curves/icicle-bn254/build.rs#L15).
+...
+```

-After this is done, add your new crate in the [global Cargo.toml](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/Cargo.toml).
+Here you would need to replace `bn254NTTCuda` with `<curve_name>NTTCuda`. Most of these changes are pretty straight forward. One thing you should pay attention to is limb sizes as these change for different curves. For example `BN254` [has limb size of 8](https://github.com/ingonyama-zk/icicle/blob/4beda3a900eda961f39af3a496f8184c52bf3b41/wrappers/rust/icicle-curves/icicle-bn254/src/curve.rs#L15) but for your curve this may be different.

 ### Golang

--- a/docs/docusaurus.config.js
+++ b/docs/docusaurus.config.js
@@ -9,7 +9,7 @@ const config = {
  title: 'Ingonyama Developer Documentation',
  tagline: 'Ingonyama is a next-generation semiconductor company, focusing on Zero-Knowledge Proof hardware acceleration. We build accelerators for advanced cryptography, unlocking real-time applications.',
  url: 'https://dev.ingonyama.com/',
-  baseUrl: '/',
+  baseUrl: '/icicle/',
  onBrokenLinks: 'throw',
  onBrokenMarkdownLinks: 'warn',
  favicon: 'img/logo.png',
@@ -29,13 +29,13 @@ const config = {
          remarkPlugins: [math, require('mdx-mermaid')],
          rehypePlugins: [katex],
          sidebarPath: require.resolve('./sidebars.js'),
-          editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
+          editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
        },
        blog: {
          remarkPlugins: [math, require('mdx-mermaid')],
          rehypePlugins: [katex],
          showReadingTime: true,
-          editUrl: 'https://github.com/ingonyama-zk/icicle/tree/main',
+          editUrl: 'https://github.com/ingonyama-zk/developer-docs/tree/main',
        },
        pages: {},
        theme: {
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -25,30 +25,9 @@ module.exports = {
          id: "icicle/integrations"
        },
        {
-          type: "category",
+          type: "doc",
          label: "Golang bindings",
-          link: {
-            type: `doc`,
-            id: "icicle/golang-bindings",
-          },
-          collapsed: true,
-          items: [
-            {
-              type: "doc",
-              label: "MSM",
-              id: "icicle/golang-bindings/msm",
-            },
-            {
-              type: "doc",
-              label: "NTT",
-              id: "icicle/golang-bindings/ntt",
-            },
-            {
-              type: "doc",
-              label: "Vector operations",
-              id: "icicle/golang-bindings/vec-ops",
-            },
-          ]
+          id: "icicle/golang-bindings",
        },
        {
          type: "category",
@@ -59,27 +38,12 @@ module.exports = {
          },
          collapsed: true,
          items: [
-            {
-              type: "doc",
-              label: "MSM",
-              id: "icicle/rust-bindings/msm",
-            },
-            {
-              type: "doc",
-              label: "NTT",
-              id: "icicle/rust-bindings/ntt",
-            },
-            {
-              type: "doc",
-              label: "Vector operations",
-              id: "icicle/rust-bindings/vec-ops",
-            },
            {
              type: "doc",
              label: "Multi GPU Support",
              id: "icicle/rust-bindings/multi-gpu",
-            },
-          ],
+            }
+          ]
        },
        {
          type: "category",
@@ -95,16 +59,16 @@ module.exports = {
              label: "MSM",
              id: "icicle/primitives/msm",
            },
-            {
-              type: "doc",
-              label: "NTT",
-              id: "icicle/primitives/ntt",
-            },
            {
              type: "doc",
              label: "Poseidon Hash",
              id: "icicle/primitives/poseidon",
            },
+            {
+              type: "doc",
+              label: "NTT",
+              id: "icicle/primitives/ntt",
+            }
          ],
        },
        {
--- a/examples/c++/pedersen-commitment/CMakeLists.txt
+++ b/examples/c++/pedersen-commitment/CMakeLists.txt
@@ -1,25 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
-    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(icicle LANGUAGES CUDA CXX)
-
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-# change the path to your Icicle location
-include_directories("../../../icicle")
-add_executable(
-  example
-  example.cu
-)
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
--- a/examples/c++/pedersen-commitment/README.md
+++ b/examples/c++/pedersen-commitment/README.md
@@ -1,33 +0,0 @@
-# ICICLE example: Pedersen Commitment
-
-## Best-Practices
-
-We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
-
-## Key-Takeaway
-
-A Pedersen Commitment is a cryptographic primitive to commit to a value or a vector of values while keeping it hidden, yet enabling the committer to reveal the value later. It provides both hiding (the commitment does not reveal any information about the value) and binding properties (once a value is committed, it cannot be changed without detection).
-
-Pedersen commitment is based on Multi-Scalar Multiplication [MSM](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
-`ICICLE` provides CUDA C++ support for [MSM](https://dev.ingonyama.com/icicle/primitives/msm). 
-An example of MSM is [here](../msm/README.md).
-
-## Running the example
-
- `cd` to your example directory
- compile with  `./compile.sh`
- run with `./run.sh`
-
-## Concise Explanation
-
-We recommend this simple [explanation](https://www.rareskills.io/post/pedersen-commitment).
-
-The original paper: T. P. Pedersen, "Non-Interactive and Information-Theoretic Secure Verifiable Secret Sharing," in Advances in Cryptology — CRYPTO ’91, Lecture Notes in Computer Science, vol 576. Springer, Berlin, Heidelberg.
-
-## What's in the example
-
-1. Define the curve and the size of commitment vector
-2. Use public random seed to transparently generate points on the elliptic curve without known discrete logarithm
-3. Generate (random) commitment vector and salt (a.k.a blinding factor)
-4. Configure and execute MSM using on-host data
-5. Output commitment as elliptic point
--- a/examples/c++/pedersen-commitment/compile.sh
+++ b/examples/c++/pedersen-commitment/compile.sh
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# Exit immediately on error
-set -e
-
-rm -rf build
-mkdir -p build
-cmake -S . -B build
-cmake --build build
--- a/examples/c++/pedersen-commitment/example.cu
+++ b/examples/c++/pedersen-commitment/example.cu
@@ -1,159 +0,0 @@
-#include <iostream>
-#include <iomanip>
-#include <chrono>
-#include <cassert>
-#include <nvml.h>
-
-#define CURVE_ID BN254
-#include "appUtils/msm/msm.cu"
-using namespace curve_config;
-
-typedef point_field_t T;
-
-// modular power
-T modPow(T base, T exp) {
-  T r = T::one();
-  T b = base;
-  T e = exp;
-  while (e != T::zero()) {
-      // If exp is odd, multiply the base with result
-      if (T::is_odd(e)) {
-          r = r * b;
-      }
-      // Now exp must be even, divide it by 2
-      e =T::div2(e);
-      b = b * b;
-  }
-  return r;
-}
-
-// Check if y2 is a quadratic residue using Euler's Criterion
-bool quadratic_residue(T y2) {
-  return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
-}
-
-// modular square root adapted from:
-// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
-bool mySQRT(T a, T *result) {
-  if (a == T::zero()) {
-    *result = T::zero();
-    return true;
-  }
-  if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
-    return false; // solution does not exist
-  }
-  // TODO: consider special cases
-  // if (p % 4 == 3) return power(a, (p + 1) / 4, p); 
-  T s = T::zero() - T::one(); // p - 1, 
-  T n = T::one() + T::one(); //2;
-  T r = T::zero(); 
-  T m;
-  while (T::is_even(s)) {
-    r = r + T::one();
-    s = T::div2(s); //s /= 2;
-  }
-  // find a non-square mod p
-  while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
-    n = n + T::one();
-  }
-  T x = modPow(a, T::div2(s + T::one()));
-  T b = modPow(a, s);
-  T g = modPow(n, s);
-  for (;; r = m) {
-    T t = b;
-    for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t =  t * t;
-    if (m == T::zero() ) {
-      *result = x;
-      return true;
-    }
-    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
-    g = gs * gs ;
-    x = x * gs ;
-    b =  b * g ;
-  }
-}
-
-void point_near_x(T x, affine_t *point) {
-  const T wb = T { weierstrass_b };
-  T y2;
-  while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
-  {
-    x = x + T::one();
-  };
-  T y;
-  bool found = mySQRT(y2, &y);
-  assert(y*y == y2);
-  point->x = x;
-  point->y = y;
-}
-
-static int seed = 0;
-static HOST_INLINE T rand_host_seed()
-  {
-    std::mt19937_64 generator(seed++);
-    std::uniform_int_distribution<unsigned> distribution;
-    
-    T value;
-    for (unsigned i = 0; i <  T::TLC-1 ; i++)
-    // TODO: use the full range of limbs: for (unsigned i = 0; i <  T::TLC ; i++)
-      value.limbs_storage.limbs[i] = distribution(generator);
-    // while (lt(Field{get_modulus()}, value))
-    //   value = value - Field{get_modulus()};
-    return value;
-  }
-
-using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
-#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
-#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
-
-int main(int argc, char** argv)
-{
-  const unsigned N = pow(2, 10);
-  std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
-  T* xs = new T[N+1];
-  
-  std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
-  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
-  seed = 1234;
-  std::cout << "Using seed: " << seed << std::endl;
-  std::cout << "Generating random field values" << std::endl;
-  START_TIMER(gen);
-  
-  for (unsigned i = 0; i < N; i++) {
-    xs[i] = rand_host_seed();
-  }
-  END_TIMER(gen, "Time to generate field values");
-  std::cout << "xs[0]: " << xs[0]  << std::endl;
-  std::cout << "xs[1]: " << xs[1]  << std::endl;
-  
-  // affine_t points[N];
-  affine_t* points = new affine_t[N+1];
-  std::cout << "Generating point about random field values" << std::endl;
-  START_TIMER(points);
-  for (unsigned i = 0; i < N+1; i++) {
-    point_near_x(xs[i], &points[i]);
-  }
-  END_TIMER(points, "Time to generate points");
-  
-  std::cout << "Generating commitment vector" << std::endl;
-  projective_t result;
-  scalar_t* scalars = new scalar_t[N+1];
-  scalar_t::RandHostMany(scalars, N);
-
-  std::cout << "Generating salt" << std::endl;
-  scalars[N] = scalar_t::rand_host();
-
-  std::cout << "Executing MSM" << std::endl;
-  auto config = msm::DefaultMSMConfig<scalar_t>();
-  START_TIMER(msm);
-  msm::MSM<scalar_t, affine_t, projective_t>(scalars, points, N+1, config, &result);
-  END_TIMER(msm, "Time to execute MSM");
-
-  std::cout << "Computed commitment: " << result << std::endl;
-
-  std::cout << "Cleaning up..." << std::endl;
-  delete[] xs;
-  delete[] scalars;
-  delete[] points;
-  return 0;
-}
--- a/examples/c++/pedersen-commitment/run.sh
+++ b/examples/c++/pedersen-commitment/run.sh
@@ -1,2 +0,0 @@
-#!/bin/bash
-./build/example
--- a/icicle/appUtils/msm/msm.cuh
+++ b/icicle/appUtils/msm/msm.cuh
@@ -4,12 +4,12 @@

 #include <cuda_runtime.h>

-#include "curves/curve_config.cuh"
-#include "primitives/affine.cuh"
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include "utils/device_context.cuh"
-#include "utils/error_handler.cuh"
+#include "../../curves/curve_config.cuh"
+#include "../../primitives/affine.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/device_context.cuh"
+#include "../../utils/error_handler.cuh"

 /**
 * @namespace msm
--- a/icicle/appUtils/msm/tests/msm_test.cu
+++ b/icicle/appUtils/msm/tests/msm_test.cu
@@ -6,10 +6,11 @@
 #include <iostream>
 #include <vector>

-#include "curves/curve_config.cuh"
-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
-#include "utils/device_context.cuh"
+#include "../../curves/curve_config.cuh"
+#include "../../primitives/field.cuh"
+#include "../../primitives/projective.cuh"
+#include "../../utils/cuda_utils.cuh"
+#include "../../utils/device_context.cuh"

 class Dummy_Scalar
 {
--- a/icicle/appUtils/poseidon/constants/constants_template.h
+++ b/icicle/appUtils/poseidon/constants/constants_template.h
@@ -1,39 +0,0 @@
-#pragma once
-#ifndef CURVE_POSEIDON_H
-#define CURVE_POSEIDON_H
-
-namespace poseidon_constants_curve {
-  /**
-   * This inner namespace contains optimized constants for running Poseidon.
-   * These constants were generated using an algorithm defined at
-   * https://spec.filecoin.io/algorithms/crypto/poseidon/
-   * The number in the name corresponds to the arity of hash function
-   * Each array contains:
-   * RoundConstants | MDSMatrix | Non-sparse matrix | Sparse matrices
-  */
-
-  int partial_rounds_2 = 0;
-
-  int partial_rounds_4 = 0;
-
-  int partial_rounds_8 = 0;
-
-  int partial_rounds_11 = 0;
-
-    unsigned char poseidon_constants_2[] = {
-        0x00
-    };
-
-    unsigned char poseidon_constants_4[] = {
-        0x00
-    };
-
-    unsigned char poseidon_constants_8[] = {
-        0x00
-    };
-
-    unsigned char poseidon_constants_11[] = {
-        0x00
-    };
-} // namespace poseidon_constants
-#endif
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -20,7 +20,7 @@ namespace poseidon {

  /**
   * For most of the Poseidon configurations this is the case
-   * TODO: Add support for different full rounds numbers
+   * To-do: Add support for different full rounds numbers
   */
  const int FULL_ROUNDS_DEFAULT = 4;

--- a/icicle/appUtils/poseidon/test.cu
+++ b/icicle/appUtils/poseidon/test.cu
@@ -1,8 +1,8 @@
 // #define DEBUG

 #define CURVE_ID 2
-#include "curves/curve_config.cuh"
-#include "utils/device_context.cuh"
+#include "../../curves/curve_config.cuh"
+#include "../../utils/device_context.cuh"
 #include "poseidon.cu"

 #ifndef __CUDA_ARCH__
--- a/icicle/appUtils/tree/test.cu
+++ b/icicle/appUtils/tree/test.cu
@@ -2,8 +2,8 @@
 #define MERKLE_DEBUG

 #define CURVE_ID 2
-#include "curves/curve_config.cuh"
-#include "appUtils/poseidon/poseidon.cu"
+#include "../../curves/curve_config.cuh"
+#include "../poseidon/poseidon.cu"
 #include "merkle.cu"

 #ifndef __CUDA_ARCH__
--- a/icicle/curves/bls12_377_params.cuh
+++ b/icicle/curves/bls12_377_params.cuh
@@ -2,7 +2,7 @@
 #ifndef BLS12_377_PARAMS_H
 #define BLS12_377_PARAMS_H

-#include "utils/storage.cuh"
+#include "../utils/storage.cuh"

 namespace bls12_377 {
  struct fp_config {
--- a/icicle/curves/bls12_381_params.cuh
+++ b/icicle/curves/bls12_381_params.cuh
@@ -2,7 +2,7 @@
 #ifndef BLS12_381_PARAMS_H
 #define BLS12_381_PARAMS_H

-#include "utils/storage.cuh"
+#include "../utils/storage.cuh"

 namespace bls12_381 {
  struct fp_config {
--- a/icicle/curves/bn254_params.cuh
+++ b/icicle/curves/bn254_params.cuh
@@ -2,7 +2,7 @@
 #ifndef BN254_PARAMS_H
 #define BN254_PARAMS_H

-#include "utils/storage.cuh"
+#include "../utils/storage.cuh"

 namespace bn254 {
  struct fp_config {
--- a/icicle/curves/bw6_761_params.cuh
+++ b/icicle/curves/bw6_761_params.cuh
@@ -2,7 +2,7 @@
 #ifndef BW6_761_PARAMS_H
 #define BW6_761_PARAMS_H

-#include "utils/storage.cuh"
+#include "../utils/storage.cuh"

 namespace bw6_761 {
  struct fq_config {
--- a/icicle/curves/curve_config.cuh
+++ b/icicle/curves/curve_config.cuh
@@ -8,10 +8,10 @@
 #define BW6_761   4
 #define GRUMPKIN  5

-#include "primitives/field.cuh"
-#include "primitives/projective.cuh"
+#include "../primitives/field.cuh"
+#include "../primitives/projective.cuh"
 #if defined(G2_DEFINED)
-#include "primitives/extension_field.cuh"
+#include "../primitives/extension_field.cuh"
 #endif

 #if CURVE_ID == BN254
--- a/icicle/curves/grumpkin_params.cuh
+++ b/icicle/curves/grumpkin_params.cuh
@@ -2,7 +2,7 @@
 #ifndef GRUMPKIN_PARAMS_H
 #define GRUMPKIN_PARAMS_H

-#include "utils/storage.cuh"
+#include "../utils/storage.cuh"
 #include "bn254_params.cuh"

 namespace grumpkin {
--- a/icicle/primitives/field.cuh
+++ b/icicle/primitives/field.cuh
@@ -18,10 +18,10 @@

 #pragma once

-#include "utils/error_handler.cuh"
-#include "utils/host_math.cuh"
-#include "utils/ptx.cuh"
-#include "utils/storage.cuh"
+#include "../utils/error_handler.cuh"
+#include "../utils/host_math.cuh"
+#include "../utils/ptx.cuh"
+#include "../utils/storage.cuh"
 #include <iomanip>
 #include <iostream>
 #include <random>
--- a/icicle/primitives/projective.cu
+++ b/icicle/primitives/projective.cu
@@ -3,7 +3,9 @@
 #include <cuda.h>
 #include "utils/utils.h"

-using namespace curve_config;
+#define projective_t  curve_config::projective_t // TODO: global to avoid lengthy texts
+#define affine_t      curve_config::affine_t
+#define point_field_t curve_config::point_field_t

 extern "C" bool CONCAT_EXPAND(CURVE, Eq)(projective_t* point1, projective_t* point2)
 {
@@ -31,7 +33,9 @@ extern "C" void CONCAT_EXPAND(CURVE, GenerateAffinePoints)(affine_t* points, int

 #if defined(G2_DEFINED)

-using namespace curve_config;
+#define g2_projective_t  curve_config::g2_projective_t
+#define g2_affine_t      curve_config::g2_affine_t
+#define g2_point_field_t curve_config::g2_point_field_t

 extern "C" bool CONCAT_EXPAND(CURVE, G2Eq)(g2_projective_t* point1, g2_projective_t* point2)
 {
--- a/icicle/primitives/test_kernels.cuh
+++ b/icicle/primitives/test_kernels.cuh
@@ -1,6 +1,13 @@
 #pragma once

-#include "curves/curve_config.cuh"
+#ifndef G2_DEFINED
+#define G2_DEFINED
+
+#include "../curves/curve_config.cuh"
+#include "extension_field.cuh"
+#include "projective.cuh"
+
+#endif

 using namespace curve_config;

--- a/icicle/tests/device_error_test.cu
+++ b/icicle/tests/device_error_test.cu
@@ -1,4 +1,4 @@
-#include "utils/error_handler.cuh" // Include your error handling header file
+#include "../utils/error_handler.cuh" // Include your error handling header file
 #include <gtest/gtest.h>

 __global__ void a_kernel_with_conditional_sticky_error(bool is_failing)
--- a/icicle/tests/error_handler_test.cu
+++ b/icicle/tests/error_handler_test.cu
@@ -1,4 +1,4 @@
-#include "utils/error_handler.cuh" // Include your error handling header file
+#include "../utils/error_handler.cuh" // Include your error handling header file
 #include <gtest/gtest.h>

 class IcicleErrorTest : public ::testing::Test
--- a/icicle/tests/primitives_test.cu
+++ b/icicle/tests/primitives_test.cu
@@ -1,4 +1,4 @@
-#include "primitives/test_kernels.cuh"
+#include "../primitives/test_kernels.cuh"
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
 #include <iostream>
--- a/icicle/utils/objects.cuh
+++ b/icicle/utils/objects.cuh
@@ -0,0 +1,63 @@
+#pragma once
+template <class F>
+class Element
+{
+public:
+  int v;
+  __device__ __host__ Element<F>() { v = 0; }
+  __device__ __host__ Element<F>(int r)
+  {
+    v = r % F::q;
+    if (r == F::q) v = F::q;
+  }
+  __device__ __host__ Element<F> operator+(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v + obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Element<F> operator-(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v - obj.v) % F::q;
+    if (res.v < 0) { res.v = F::q + res.v; }
+    return res;
+  }
+};
+
+template <class F>
+class Scalar
+{
+public:
+  int v;
+  __device__ __host__ Scalar<F>() { v = 0; }
+  __device__ __host__ Scalar<F>(int r) { v = r % F::q; }
+  __device__ __host__ Scalar<F> operator+(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v + obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Scalar<F> operator*(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v * obj.v) % F::q;
+    return res;
+  }
+  __device__ __host__ Element<F> operator*(Element<F> const& obj)
+  {
+    Element<F> res;
+    res.v = (v * obj.v) % F::q;
+    return res;
+  }
+  Scalar<F> operator-(Scalar<F> const& obj)
+  {
+    Scalar<F> res;
+    res.v = (v - obj.v) % F::q;
+    if (res.v < 0) { res.v = F::q + res.v; }
+    return res;
+  }
+  bool operator<(Scalar<F> const& obj) { return v < obj.v; }
+  static Scalar<F> one() { return Scalar<F>(1); }
+  static Scalar<F> zero() { return Scalar<F>(0); }
+};
--- a/icicle/utils/sharedmem.cuh
+++ b/icicle/utils/sharedmem.cuh
@@ -1,3 +1,4 @@
+// TODO: remove this file, seems working without it
 // based on https://leimao.github.io/blog/CUDA-Shared-Memory-Templated-Kernel/
 // may be outdated, but only worked like that

@@ -58,7 +59,7 @@
 #ifndef _SHAREDMEM_H_
 #define _SHAREDMEM_H_

-#include "curves/curve_config.cuh"
+#include "../curves/curve_config.cuh"

 /** @brief Wrapper class for templatized dynamic shared memory arrays.
 *
--- a/icicle/utils/utils_kernels.cuh
+++ b/icicle/utils/utils_kernels.cuh
@@ -2,7 +2,17 @@
 #ifndef UTILS_KERNELS_H
 #define UTILS_KERNELS_H

+#include "utils_kernels.cuh"
+
 namespace utils_internal {
+  // TODO: weird linking issue - only works in headers
+  // template <typename E, typename S>
+  // __global__ void NormalizeKernel(E* arr, S scalar, unsigned n)
+  // {
+  //   int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  //   if (tid < n) { arr[tid] = scalar * arr[tid]; }
+  // }
+
  template <typename E, typename S>
  __global__ void NormalizeKernel(E* arr, S scalar, int n)
  {
--- a/icicle/utils/vec_ops.cu
+++ b/icicle/utils/vec_ops.cu
@@ -2,7 +2,7 @@
 #include <stdexcept>

 #include "vec_ops.cuh"
-#include "curves/curve_config.cuh"
+#include "../curves/curve_config.cuh"
 #include "device_context.cuh"
 #include "mont.cuh"
 #include "utils/utils.h"
--- a/scripts/hooks/pre-push
+++ b/scripts/hooks/pre-push
@@ -36,11 +36,11 @@ fi

 # Run cargo fmt on Rust files
 cd wrappers/rust
-if [[ $(find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]];
+if [[ $(find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --) ]];
 then
    echo "🚨 There are Rust files that need formatting."
-    echo "Please go to wrappers/rust and format the Rust files using the following command:"
-    echo "find . -path ./icicle-curves/icicle-curve-template -prune -o -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --"
+    echo "Please format the Rust files using the following command:"
+    echo "find . -name target -prune -o -iname *.rs -print | xargs cargo fmt --check --"
    status=1
 fi

--- a/wrappers/golang/build.ps1
+++ b/wrappers/golang/build.ps1
@@ -1,23 +0,0 @@
-$G2_DEFINED = "OFF"
-
-if ($args.Count -gt 1) {
-  $G2_DEFINED = "ON"
-}
-
-$BUILD_DIR = (Get-Location).Path + "\..\icicle\build"
-$SUPPORTED_CURVES = @("bn254", "bls12_377", "bls12_381", "bw6_761")
-
-if ($args[0] -eq "all") {
-  $BUILD_CURVES = $SUPPORTED_CURVES
-} else {
-  $BUILD_CURVES = @($args[0])
-}
-
-Set-Location "../../icicle"
-
-New-Item -ItemType Directory -Path "build" -Force
-
-foreach ($CURVE in $BUILD_CURVES) {
-  cmake -DCURVE:STRING=$CURVE -DG2_DEFINED:STRING=$G2_DEFINED -DCMAKE_BUILD_TYPE:STRING=Release -S . -B build
-  cmake --build build
-}
--- a/wrappers/golang/core/error.go
+++ b/wrappers/golang/core/error.go
@@ -1,7 +1,7 @@
 package core

 import (
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 )

 type IcicleErrorCode int
@@ -16,13 +16,13 @@ const (

 type IcicleError struct {
 	IcicleErrorCode IcicleErrorCode
-	CudaErrorCode   cr.CudaError
+	CudaErrorCode   cuda_runtime.CudaError
 	reason          string
 }

-func FromCudaError(error cr.CudaError) (err IcicleError) {
+func FromCudaError(error cuda_runtime.CudaError) (err IcicleError) {
 	switch error {
-	case cr.CudaSuccess:
+	case cuda_runtime.CudaSuccess:
 		err.IcicleErrorCode = IcicleSuccess
 	default:
 		err.IcicleErrorCode = InternalCudaError
@@ -38,6 +38,6 @@ func FromCodeAndReason(code IcicleErrorCode, reason string) IcicleError {
 	return IcicleError{
 		IcicleErrorCode: code,
 		reason:          reason,
-		CudaErrorCode:   cr.CudaErrorUnknown,
+		CudaErrorCode:   cuda_runtime.CudaErrorUnknown,
 	}
 }
--- a/wrappers/golang/core/msm.go
+++ b/wrappers/golang/core/msm.go
@@ -3,12 +3,12 @@ package core
 import (
 	"fmt"

-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 )

 type MSMConfig struct {
 	/// Details related to the device such as its id and stream.
-	Ctx cr.DeviceContext
+	Ctx cuda_runtime.DeviceContext

 	pointsSize int32

@@ -55,8 +55,13 @@ type MSMConfig struct {
 	IsAsync bool
 }

+// type MSM interface {
+// 	Msm(scalars, points *cuda_runtime.HostOrDeviceSlice, cfg *MSMConfig, results *cuda_runtime.HostOrDeviceSlice) cuda_runtime.CudaError
+// 	GetDefaultMSMConfig() MSMConfig
+// }
+
 func GetDefaultMSMConfig() MSMConfig {
-	ctx, _ := cr.GetDefaultDeviceContext()
+	ctx, _ := cuda_runtime.GetDefaultDeviceContext()
 	return MSMConfig{
 		ctx,   // Ctx
 		0,     // pointsSize
--- a/wrappers/golang/core/msm_test.go
+++ b/wrappers/golang/core/msm_test.go
@@ -4,13 +4,13 @@ import (
 	"testing"

 	"github.com/ingonyama-zk/icicle/wrappers/golang/core/internal"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"

 	"github.com/stretchr/testify/assert"
 )

 func TestMSMDefaultConfig(t *testing.T) {
-	ctx, _ := cr.GetDefaultDeviceContext()
+	ctx, _ := cuda_runtime.GetDefaultDeviceContext()
 	expected := MSMConfig{
 		ctx,   // Ctx
 		0,     // pointsSize
--- a/wrappers/golang/core/ntt.go
+++ b/wrappers/golang/core/ntt.go
@@ -3,7 +3,7 @@ package core
 import (
 	"fmt"

-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 )

 type NTTDir int8
@@ -26,7 +26,7 @@ const (

 type NTTConfig[T any] struct {
 	/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-	Ctx cr.DeviceContext
+	Ctx cuda_runtime.DeviceContext
 	/// Coset generator. Used to perform coset (i)NTTs. Default value: `S::one()` (corresponding to no coset being used).
 	CosetGen T
 	/// The number of NTTs to compute. Default value: 1.
@@ -41,7 +41,7 @@ type NTTConfig[T any] struct {
 }

 func GetDefaultNTTConfig[T any](cosetGen T) NTTConfig[T] {
-	ctx, _ := cr.GetDefaultDeviceContext()
+	ctx, _ := cuda_runtime.GetDefaultDeviceContext()
 	return NTTConfig[T]{
 		ctx,      // Ctx
 		cosetGen, // CosetGen
--- a/wrappers/golang/core/ntt_test.go
+++ b/wrappers/golang/core/ntt_test.go
@@ -5,7 +5,7 @@ import (
 	"testing"

 	"github.com/ingonyama-zk/icicle/wrappers/golang/core/internal"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"github.com/stretchr/testify/assert"
 )

@@ -14,7 +14,7 @@ func TestNTTDefaultConfig(t *testing.T) {
 	cosetGenField.One()
 	var cosetGen [1]uint32
 	copy(cosetGen[:], cosetGenField.GetLimbs())
-	ctx, _ := cr.GetDefaultDeviceContext()
+	ctx, _ := cuda_runtime.GetDefaultDeviceContext()
 	expected := NTTConfig[[1]uint32]{
 		ctx,      // Ctx
 		cosetGen, // CosetGen
--- a/wrappers/golang/core/slice.go
+++ b/wrappers/golang/core/slice.go
@@ -3,7 +3,7 @@ package core
 import (
 	"unsafe"

-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
+	"github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 )

 type HostOrDeviceSlice interface {
@@ -45,25 +45,25 @@ func (d DeviceSlice) IsOnDevice() bool {

 // TODO: change signature to be Malloc(element, numElements)
 // calc size internally
-func (d *DeviceSlice) Malloc(size, sizeOfElement int) (DeviceSlice, cr.CudaError) {
-	dp, err := cr.Malloc(uint(size))
+func (d *DeviceSlice) Malloc(size, sizeOfElement int) (DeviceSlice, cuda_runtime.CudaError) {
+	dp, err := cuda_runtime.Malloc(uint(size))
 	d.inner = dp
 	d.capacity = size
 	d.length = size / sizeOfElement
 	return *d, err
 }

-func (d *DeviceSlice) MallocAsync(size, sizeOfElement int, stream cr.CudaStream) (DeviceSlice, cr.CudaError) {
-	dp, err := cr.MallocAsync(uint(size), stream)
+func (d *DeviceSlice) MallocAsync(size, sizeOfElement int, stream cuda_runtime.CudaStream) (DeviceSlice, cuda_runtime.CudaError) {
+	dp, err := cuda_runtime.MallocAsync(uint(size), stream)
 	d.inner = dp
 	d.capacity = size
 	d.length = size / sizeOfElement
 	return *d, err
 }

-func (d *DeviceSlice) Free() cr.CudaError {
-	err := cr.Free(d.inner)
-	if err == cr.CudaSuccess {
+func (d *DeviceSlice) Free() cuda_runtime.CudaError {
+	err := cuda_runtime.Free(d.inner)
+	if err == cuda_runtime.CudaSuccess {
 		d.length, d.capacity = 0, 0
 		d.inner = nil
 	}
@@ -123,12 +123,12 @@ func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *Devic

 	// hostSrc := unsafe.Pointer(h.AsPointer())
 	hostSrc := unsafe.Pointer(&h[0])
-	cr.CopyToDevice(dst.inner, hostSrc, uint(size))
+	cuda_runtime.CopyToDevice(dst.inner, hostSrc, uint(size))
 	dst.length = h.Len()
 	return dst
 }

-func (h HostSlice[T]) CopyToDeviceAsync(dst *DeviceSlice, stream cr.CudaStream, shouldAllocate bool) *DeviceSlice {
+func (h HostSlice[T]) CopyToDeviceAsync(dst *DeviceSlice, stream cuda_runtime.CudaStream, shouldAllocate bool) *DeviceSlice {
 	size := h.Len() * h.SizeOfElement()
 	if shouldAllocate {
 		dst.MallocAsync(size, h.SizeOfElement(), stream)
@@ -138,7 +138,7 @@ func (h HostSlice[T]) CopyToDeviceAsync(dst *DeviceSlice, stream cr.CudaStream,
 	}

 	hostSrc := unsafe.Pointer(&h[0])
-	cr.CopyToDeviceAsync(dst.inner, hostSrc, uint(size), stream)
+	cuda_runtime.CopyToDeviceAsync(dst.inner, hostSrc, uint(size), stream)
 	dst.length = h.Len()
 	return dst
 }
@@ -148,13 +148,13 @@ func (h HostSlice[T]) CopyFromDevice(src *DeviceSlice) {
 		panic("destination and source slices have different lengths")
 	}
 	bytesSize := src.Len() * h.SizeOfElement()
-	cr.CopyFromDevice(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize))
+	cuda_runtime.CopyFromDevice(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize))
 }

-func (h HostSlice[T]) CopyFromDeviceAsync(src *DeviceSlice, stream cr.Stream) {
+func (h HostSlice[T]) CopyFromDeviceAsync(src *DeviceSlice, stream cuda_runtime.Stream) {
 	if h.Len() != src.Len() {
 		panic("destination and source slices have different lengths")
 	}
 	bytesSize := src.Len() * h.SizeOfElement()
-	cr.CopyFromDeviceAsync(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize), stream)
+	cuda_runtime.CopyFromDeviceAsync(unsafe.Pointer(&h[0]), src.inner, uint(bytesSize), stream)
 }
--- a/wrappers/golang/core/vec_ops.go
+++ b/wrappers/golang/core/vec_ops.go
@@ -1,74 +0,0 @@
-package core
-
-import (
-	"fmt"
-
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-type VecOps int
-
-const (
-	Sub VecOps = iota
-	Add
-	Mul
-)
-
-type VecOpsConfig struct {
-	/*Details related to the device such as its id and stream. */
-	Ctx cr.DeviceContext
-	/* True if `a` is on device and false if it is not. Default value: false. */
-	isAOnDevice bool
-	/* True if `b` is on device and false if it is not. Default value: false. */
-	isBOnDevice bool
-	/* If true, output is preserved on device, otherwise on host. Default value: false. */
-	isResultOnDevice bool
-	/* True if `result` vector should be in Montgomery form and false otherwise. Default value: false. */
-	IsResultMontgomeryForm bool
-	/* Whether to run the vector operations asynchronously. If set to `true`, the function will be
-	*  non-blocking and you'll need to synchronize it explicitly by calling
-	*  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
-	IsAsync bool
-}
-
-/**
- * A function that returns the default value of [VecOpsConfig](@ref VecOpsConfig).
- * @return Default value of [VecOpsConfig](@ref VecOpsConfig).
- */
-func DefaultVecOpsConfig() VecOpsConfig {
-	ctx, _ := cr.GetDefaultDeviceContext()
-	config := VecOpsConfig{
-		ctx,   // ctx
-		false, // isAOnDevice
-		false, // isBOnDevice
-		false, // isResultOnDevice
-		false, // IsResultMontgomeryForm
-		false, // IsAsync
-	}
-
-	return config
-}
-
-func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) {
-	aLen, bLen, outLen := a.Len(), b.Len(), out.Len()
-	if aLen != bLen {
-		errorString := fmt.Sprintf(
-			"a and b vector lengths %d; %d are not equal",
-			aLen,
-			bLen,
-		)
-		panic(errorString)
-	}
-	if aLen != outLen {
-		errorString := fmt.Sprintf(
-			"a and out vector lengths %d; %d are not equal",
-			aLen,
-			outLen,
-		)
-		panic(errorString)
-	}
-
-	cfg.isAOnDevice = a.IsOnDevice()
-	cfg.isBOnDevice = b.IsOnDevice()
-	cfg.isResultOnDevice = out.IsOnDevice()
-}
--- a/wrappers/golang/core/vec_ops_test.go
+++ b/wrappers/golang/core/vec_ops_test.go
@@ -1,23 +0,0 @@
-package core
-
-import (
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-	"github.com/stretchr/testify/assert"
-	"testing"
-)
-
-func TestVecOpsDefaultConfig(t *testing.T) {
-	ctx, _ := cr.GetDefaultDeviceContext()
-	expected := VecOpsConfig{
-		ctx,   // Ctx
-		false, // isAOnDevice
-		false, // isBOnDevice
-		false, // isResultOnDevice
-		false, // IsResultMontgomeryForm
-		false, // IsAsync
-	}
-
-	actual := DefaultVecOpsConfig()
-
-	assert.Equal(t, expected, actual)
-}
--- a/wrappers/golang/curves/bls12377/include/vec_ops.h
+++ b/wrappers/golang/curves/bls12377/include/vec_ops.h
@@ -1,39 +0,0 @@
-#include <cuda_runtime.h>
-#include "../../include/types.h"
-
-#ifndef _BLS12_377_VEC_OPS_H
-#define _BLS12_377_VEC_OPS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-cudaError_t bls12_377MulCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bls12_377AddCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bls12_377SubCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/wrappers/golang/curves/bls12377/scalar_field.go
+++ b/wrappers/golang/curves/bls12377/scalar_field.go
@@ -6,7 +6,7 @@ import "C"
 import (
 	"encoding/binary"
 	"fmt"
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
+	core "github.com/ingonyama-zk/icicle/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"unsafe"
 )
@@ -85,7 +85,12 @@ func (f ScalarField) ToBytesLittleEndian() []byte {
 }

 func GenerateScalars(size int) core.HostSlice[ScalarField] {
-	scalarSlice := make(core.HostSlice[ScalarField], size)
+	scalars := make([]ScalarField, size)
+	for i := range scalars {
+		scalars[i] = ScalarField{}
+	}
+
+	scalarSlice := core.HostSliceFromElements[ScalarField](scalars)

 	cScalars := (*C.scalar_t)(unsafe.Pointer(&scalarSlice[0]))
 	cSize := (C.int)(size)
--- a/wrappers/golang/curves/bls12377/vec_ops.go
+++ b/wrappers/golang/curves/bls12377/vec_ops.go
@@ -1,49 +0,0 @@
-package bls12377
-
-// #cgo CFLAGS: -I./include/
-// #include "vec_ops.h"
-import "C"
-
-import (
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError) {
-	core.VecOpCheck(a, b, out, &config)
-	var cA, cB, cOut *C.scalar_t
-
-	if a.IsOnDevice() {
-		cA = (*C.scalar_t)(a.(core.DeviceSlice).AsPointer())
-	} else {
-		cA = (*C.scalar_t)(unsafe.Pointer(&a.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if b.IsOnDevice() {
-		cB = (*C.scalar_t)(b.(core.DeviceSlice).AsPointer())
-	} else {
-		cB = (*C.scalar_t)(unsafe.Pointer(&b.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if out.IsOnDevice() {
-		cOut = (*C.scalar_t)(out.(core.DeviceSlice).AsPointer())
-	} else {
-		cOut = (*C.scalar_t)(unsafe.Pointer(&out.(core.HostSlice[ScalarField])[0]))
-	}
-
-	cConfig := (*C.VecOpsConfig)(unsafe.Pointer(&config))
-	cSize := (C.int)(a.Len())
-
-	switch op {
-	case core.Sub:
-		ret = (cr.CudaError)(C.bls12_377SubCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Add:
-		ret = (cr.CudaError)(C.bls12_377AddCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Mul:
-		ret = (cr.CudaError)(C.bls12_377MulCuda(cA, cB, cSize, cConfig, cOut))
-	}
-
-	return ret
-}
--- a/wrappers/golang/curves/bls12377/vec_ops_test.go
+++ b/wrappers/golang/curves/bls12377/vec_ops_test.go
@@ -1,33 +0,0 @@
-package bls12377
-
-import (
-	"testing"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestVecOps(t *testing.T) {
-	testSize := 1 << 14
-
-	a := GenerateScalars(testSize)
-	b := GenerateScalars(testSize)
-	var scalar ScalarField
-	scalar.One()
-	ones := core.HostSliceWithValue(scalar, testSize)
-
-	out := make(core.HostSlice[ScalarField], testSize)
-	out2 := make(core.HostSlice[ScalarField], testSize)
-	out3 := make(core.HostSlice[ScalarField], testSize)
-
-	cfg := core.DefaultVecOpsConfig()
-
-	VecOp(a, b, out, cfg, core.Add)
-	VecOp(out, b, out2, cfg, core.Sub)
-
-	assert.Equal(t, a, out2)
-
-	VecOp(a, ones, out3, cfg, core.Mul)
-
-	assert.Equal(t, a, out3)
-}
--- a/wrappers/golang/curves/bls12381/include/vec_ops.h
+++ b/wrappers/golang/curves/bls12381/include/vec_ops.h
@@ -1,39 +0,0 @@
-#include <cuda_runtime.h>
-#include "../../include/types.h"
-
-#ifndef _BLS12_381_VEC_OPS_H
-#define _BLS12_381_VEC_OPS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-cudaError_t bls12_381MulCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bls12_381AddCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bls12_381SubCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/wrappers/golang/curves/bls12381/scalar_field.go
+++ b/wrappers/golang/curves/bls12381/scalar_field.go
@@ -6,7 +6,7 @@ import "C"
 import (
 	"encoding/binary"
 	"fmt"
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
+	core "github.com/ingonyama-zk/icicle/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"unsafe"
 )
@@ -85,7 +85,12 @@ func (f ScalarField) ToBytesLittleEndian() []byte {
 }

 func GenerateScalars(size int) core.HostSlice[ScalarField] {
-	scalarSlice := make(core.HostSlice[ScalarField], size)
+	scalars := make([]ScalarField, size)
+	for i := range scalars {
+		scalars[i] = ScalarField{}
+	}
+
+	scalarSlice := core.HostSliceFromElements[ScalarField](scalars)

 	cScalars := (*C.scalar_t)(unsafe.Pointer(&scalarSlice[0]))
 	cSize := (C.int)(size)
--- a/wrappers/golang/curves/bls12381/vec_ops.go
+++ b/wrappers/golang/curves/bls12381/vec_ops.go
@@ -1,49 +0,0 @@
-package bls12381
-
-// #cgo CFLAGS: -I./include/
-// #include "vec_ops.h"
-import "C"
-
-import (
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError) {
-	core.VecOpCheck(a, b, out, &config)
-	var cA, cB, cOut *C.scalar_t
-
-	if a.IsOnDevice() {
-		cA = (*C.scalar_t)(a.(core.DeviceSlice).AsPointer())
-	} else {
-		cA = (*C.scalar_t)(unsafe.Pointer(&a.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if b.IsOnDevice() {
-		cB = (*C.scalar_t)(b.(core.DeviceSlice).AsPointer())
-	} else {
-		cB = (*C.scalar_t)(unsafe.Pointer(&b.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if out.IsOnDevice() {
-		cOut = (*C.scalar_t)(out.(core.DeviceSlice).AsPointer())
-	} else {
-		cOut = (*C.scalar_t)(unsafe.Pointer(&out.(core.HostSlice[ScalarField])[0]))
-	}
-
-	cConfig := (*C.VecOpsConfig)(unsafe.Pointer(&config))
-	cSize := (C.int)(a.Len())
-
-	switch op {
-	case core.Sub:
-		ret = (cr.CudaError)(C.bls12_381SubCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Add:
-		ret = (cr.CudaError)(C.bls12_381AddCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Mul:
-		ret = (cr.CudaError)(C.bls12_381MulCuda(cA, cB, cSize, cConfig, cOut))
-	}
-
-	return ret
-}
--- a/wrappers/golang/curves/bls12381/vec_ops_test.go
+++ b/wrappers/golang/curves/bls12381/vec_ops_test.go
@@ -1,33 +0,0 @@
-package bls12381
-
-import (
-	"testing"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestVecOps(t *testing.T) {
-	testSize := 1 << 14
-
-	a := GenerateScalars(testSize)
-	b := GenerateScalars(testSize)
-	var scalar ScalarField
-	scalar.One()
-	ones := core.HostSliceWithValue(scalar, testSize)
-
-	out := make(core.HostSlice[ScalarField], testSize)
-	out2 := make(core.HostSlice[ScalarField], testSize)
-	out3 := make(core.HostSlice[ScalarField], testSize)
-
-	cfg := core.DefaultVecOpsConfig()
-
-	VecOp(a, b, out, cfg, core.Add)
-	VecOp(out, b, out2, cfg, core.Sub)
-
-	assert.Equal(t, a, out2)
-
-	VecOp(a, ones, out3, cfg, core.Mul)
-
-	assert.Equal(t, a, out3)
-}
--- a/wrappers/golang/curves/bn254/include/ve_mod_mult.h
+++ b/wrappers/golang/curves/bn254/include/ve_mod_mult.h
@@ -0,0 +1,70 @@
+
+// Copyright 2023 Ingonyama
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by Ingonyama DO NOT EDIT
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdbool.h>
+// ve_mod_mult.h
+
+#ifndef _BN254_VEC_MULT_H
+#define _BN254_VEC_MULT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  cudaStream_t stream;   /**< Stream to use. Default value: 0. */
+  int device_id;         /**< Index of the currently used GPU. Default value: 0. */
+  cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
+} DeviceContext;
+
+typedef struct BN254_scalar_t BN254_scalar_t;
+
+int bn254AddCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
+  int n,
+  bool is_on_device,
+  DeviceContext ctx,
+  BN254_scalar_t* result
+);
+
+int bn254SubCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
+  int n,
+  bool is_on_device,
+  DeviceContext ctx,
+  BN254_scalar_t* result
+);
+
+int bn254MulCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
+  int n,
+  bool is_on_device,
+  bool is_montgomery,
+  DeviceContext ctx,
+  BN254_scalar_t* result
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BN254_VEC_MULT_H */
--- a/wrappers/golang/curves/bn254/include/vec_ops.h
+++ b/wrappers/golang/curves/bn254/include/vec_ops.h
@@ -1,39 +0,0 @@
-#include <cuda_runtime.h>
-#include "../../include/types.h"
-
-#ifndef _BN254_VEC_OPS_H
-#define _BN254_VEC_OPS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-cudaError_t bn254MulCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bn254AddCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bn254SubCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/wrappers/golang/curves/bn254/scalar_field.go
+++ b/wrappers/golang/curves/bn254/scalar_field.go
@@ -6,7 +6,7 @@ import "C"
 import (
 	"encoding/binary"
 	"fmt"
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
+	core "github.com/ingonyama-zk/icicle/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"unsafe"
 )
@@ -85,7 +85,12 @@ func (f ScalarField) ToBytesLittleEndian() []byte {
 }

 func GenerateScalars(size int) core.HostSlice[ScalarField] {
-	scalarSlice := make(core.HostSlice[ScalarField], size)
+	scalars := make([]ScalarField, size)
+	for i := range scalars {
+		scalars[i] = ScalarField{}
+	}
+
+	scalarSlice := core.HostSliceFromElements[ScalarField](scalars)

 	cScalars := (*C.scalar_t)(unsafe.Pointer(&scalarSlice[0]))
 	cSize := (C.int)(size)
--- a/wrappers/golang/curves/bn254/vec_ops.go
+++ b/wrappers/golang/curves/bn254/vec_ops.go
@@ -1,49 +0,0 @@
-package bn254
-
-// #cgo CFLAGS: -I./include/
-// #include "vec_ops.h"
-import "C"
-
-import (
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError) {
-	core.VecOpCheck(a, b, out, &config)
-	var cA, cB, cOut *C.scalar_t
-
-	if a.IsOnDevice() {
-		cA = (*C.scalar_t)(a.(core.DeviceSlice).AsPointer())
-	} else {
-		cA = (*C.scalar_t)(unsafe.Pointer(&a.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if b.IsOnDevice() {
-		cB = (*C.scalar_t)(b.(core.DeviceSlice).AsPointer())
-	} else {
-		cB = (*C.scalar_t)(unsafe.Pointer(&b.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if out.IsOnDevice() {
-		cOut = (*C.scalar_t)(out.(core.DeviceSlice).AsPointer())
-	} else {
-		cOut = (*C.scalar_t)(unsafe.Pointer(&out.(core.HostSlice[ScalarField])[0]))
-	}
-
-	cConfig := (*C.VecOpsConfig)(unsafe.Pointer(&config))
-	cSize := (C.int)(a.Len())
-
-	switch op {
-	case core.Sub:
-		ret = (cr.CudaError)(C.bn254SubCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Add:
-		ret = (cr.CudaError)(C.bn254AddCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Mul:
-		ret = (cr.CudaError)(C.bn254MulCuda(cA, cB, cSize, cConfig, cOut))
-	}
-
-	return ret
-}
--- a/wrappers/golang/curves/bn254/vec_ops_test.go
+++ b/wrappers/golang/curves/bn254/vec_ops_test.go
@@ -1,33 +0,0 @@
-package bn254
-
-import (
-	"testing"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestVecOps(t *testing.T) {
-	testSize := 1 << 14
-
-	a := GenerateScalars(testSize)
-	b := GenerateScalars(testSize)
-	var scalar ScalarField
-	scalar.One()
-	ones := core.HostSliceWithValue(scalar, testSize)
-
-	out := make(core.HostSlice[ScalarField], testSize)
-	out2 := make(core.HostSlice[ScalarField], testSize)
-	out3 := make(core.HostSlice[ScalarField], testSize)
-
-	cfg := core.DefaultVecOpsConfig()
-
-	VecOp(a, b, out, cfg, core.Add)
-	VecOp(out, b, out2, cfg, core.Sub)
-
-	assert.Equal(t, a, out2)
-
-	VecOp(a, ones, out3, cfg, core.Mul)
-
-	assert.Equal(t, a, out3)
-}
--- a/wrappers/golang/curves/bw6761/include/vec_ops.h
+++ b/wrappers/golang/curves/bw6761/include/vec_ops.h
@@ -1,39 +0,0 @@
-#include <cuda_runtime.h>
-#include "../../include/types.h"
-
-#ifndef _BW6_761_VEC_OPS_H
-#define _BW6_761_VEC_OPS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-cudaError_t bw6_761MulCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bw6_761AddCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-cudaError_t bw6_761SubCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
-  int n,
-  VecOpsConfig* config,
-  scalar_t* result
-);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/wrappers/golang/curves/bw6761/scalar_field.go
+++ b/wrappers/golang/curves/bw6761/scalar_field.go
@@ -6,7 +6,7 @@ import "C"
 import (
 	"encoding/binary"
 	"fmt"
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
+	core "github.com/ingonyama-zk/icicle/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"unsafe"
 )
@@ -85,7 +85,12 @@ func (f ScalarField) ToBytesLittleEndian() []byte {
 }

 func GenerateScalars(size int) core.HostSlice[ScalarField] {
-	scalarSlice := make(core.HostSlice[ScalarField], size)
+	scalars := make([]ScalarField, size)
+	for i := range scalars {
+		scalars[i] = ScalarField{}
+	}
+
+	scalarSlice := core.HostSliceFromElements[ScalarField](scalars)

 	cScalars := (*C.scalar_t)(unsafe.Pointer(&scalarSlice[0]))
 	cSize := (C.int)(size)
--- a/wrappers/golang/curves/bw6761/vec_ops.go
+++ b/wrappers/golang/curves/bw6761/vec_ops.go
@@ -1,49 +0,0 @@
-package bw6761
-
-// #cgo CFLAGS: -I./include/
-// #include "vec_ops.h"
-import "C"
-
-import (
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError) {
-	core.VecOpCheck(a, b, out, &config)
-	var cA, cB, cOut *C.scalar_t
-
-	if a.IsOnDevice() {
-		cA = (*C.scalar_t)(a.(core.DeviceSlice).AsPointer())
-	} else {
-		cA = (*C.scalar_t)(unsafe.Pointer(&a.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if b.IsOnDevice() {
-		cB = (*C.scalar_t)(b.(core.DeviceSlice).AsPointer())
-	} else {
-		cB = (*C.scalar_t)(unsafe.Pointer(&b.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if out.IsOnDevice() {
-		cOut = (*C.scalar_t)(out.(core.DeviceSlice).AsPointer())
-	} else {
-		cOut = (*C.scalar_t)(unsafe.Pointer(&out.(core.HostSlice[ScalarField])[0]))
-	}
-
-	cConfig := (*C.VecOpsConfig)(unsafe.Pointer(&config))
-	cSize := (C.int)(a.Len())
-
-	switch op {
-	case core.Sub:
-		ret = (cr.CudaError)(C.bw6_761SubCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Add:
-		ret = (cr.CudaError)(C.bw6_761AddCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Mul:
-		ret = (cr.CudaError)(C.bw6_761MulCuda(cA, cB, cSize, cConfig, cOut))
-	}
-
-	return ret
-}
--- a/wrappers/golang/curves/bw6761/vec_ops_test.go
+++ b/wrappers/golang/curves/bw6761/vec_ops_test.go
@@ -1,33 +0,0 @@
-package bw6761
-
-import (
-	"testing"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestVecOps(t *testing.T) {
-	testSize := 1 << 14
-
-	a := GenerateScalars(testSize)
-	b := GenerateScalars(testSize)
-	var scalar ScalarField
-	scalar.One()
-	ones := core.HostSliceWithValue(scalar, testSize)
-
-	out := make(core.HostSlice[ScalarField], testSize)
-	out2 := make(core.HostSlice[ScalarField], testSize)
-	out3 := make(core.HostSlice[ScalarField], testSize)
-
-	cfg := core.DefaultVecOpsConfig()
-
-	VecOp(a, b, out, cfg, core.Add)
-	VecOp(out, b, out2, cfg, core.Sub)
-
-	assert.Equal(t, a, out2)
-
-	VecOp(a, ones, out3, cfg, core.Mul)
-
-	assert.Equal(t, a, out3)
-}
--- a/wrappers/golang/curves/include/types.h
+++ b/wrappers/golang/curves/include/types.h
@@ -1,5 +1,8 @@
 #include <cuda_runtime.h>

+// #define G2_DEFINED
+// #include "../../../../../icicle/curves/curve_config.cuh"
+
 #ifndef _TYPES_H
 #define _TYPES_H

@@ -7,6 +10,13 @@
 extern "C" {
 #endif

+// typedef curve_config::scalar_t scalar_t;
+// typedef curve_config::projective_t projective_t;
+// typedef curve_config::g2_projective_t g2_projective_t;
+// typedef curve_config::affine_t affine_t;
+// typedef curve_config::g2_affine_t g2_affine_t;
+
+// typedef struct uint32 unsigned long int;
 typedef struct scalar_t scalar_t;
 typedef struct projective_t projective_t;
 typedef struct g2_projective_t g2_projective_t;
@@ -15,7 +25,6 @@ typedef struct g2_affine_t g2_affine_t;

 typedef struct MSMConfig MSMConfig;
 typedef struct NTTConfig NTTConfig;
-typedef struct VecOpsConfig VecOpsConfig;
 typedef struct DeviceContext DeviceContext;

 typedef cudaError_t cudaError_t;
--- a/wrappers/golang/internal/generator/main.go
+++ b/wrappers/golang/internal/generator/main.go
@@ -104,8 +104,7 @@ func generateFiles() {
 		"ntt_test.go.tmpl",
 		"curve_test.go.tmpl",
 		"curve.go.tmpl",
-		"vec_ops_test.go.tmpl",
-		"vec_ops.go.tmpl",
+		/* "vec_ops.h.tmpl,"*/
 		"helpers_test.go.tmpl",
 	}

@@ -172,7 +171,7 @@ func generateFiles() {
 		"msm.h.tmpl",
 		"g2_msm.h.tmpl",
 		"ntt.h.tmpl",
-		"vec_ops.h.tmpl",
+		/*"vec_ops.h.tmpl",*/
 	}

 	for _, includeFile := range templateIncludeFiles {
--- a/wrappers/golang/internal/generator/templates/include/vec_ops.h.tmpl
+++ b/wrappers/golang/internal/generator/templates/include/vec_ops.h.tmpl
@@ -1,35 +1,49 @@
+#include <cuda.h>
 #include <cuda_runtime.h>
-#include "../../include/types.h"
+#include <stdbool.h>
+// ve_mod_mult.h

-#ifndef _{{toUpper .Curve}}_VEC_OPS_H
-#define _{{toUpper .Curve}}_VEC_OPS_H
+#ifndef _BN254_VEC_MULT_H
+#define _BN254_VEC_MULT_H

 #ifdef __cplusplus
 extern "C" {
 #endif

-cudaError_t {{.Curve}}MulCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
+typedef struct {
+  cudaStream_t stream;   /**< Stream to use. Default value: 0. */
+  int device_id;         /**< Index of the currently used GPU. Default value: 0. */
+  cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
+} DeviceContext;
+
+typedef struct BN254_scalar_t BN254_scalar_t;
+
+int bn254AddCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
  int n,
-  VecOpsConfig* config,
-  scalar_t* result
+  bool is_on_device,
+  DeviceContext ctx,
+  BN254_scalar_t* result
 );

-cudaError_t {{.Curve}}AddCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
+int bn254SubCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
  int n,
-  VecOpsConfig* config,
-  scalar_t* result
+  bool is_on_device,
+  DeviceContext ctx,
+  BN254_scalar_t* result
 );

-cudaError_t {{.Curve}}SubCuda(
-  scalar_t* vec_a,
-  scalar_t* vec_b,
+int bn254MulCuda(
+  BN254_scalar_t* vec_a,
+  BN254_scalar_t* vec_b,
  int n,
-  VecOpsConfig* config,
-  scalar_t* result
+  bool is_on_device,
+  bool is_montgomery,
+  DeviceContext ctx,
+  BN254_scalar_t* result
 );

 #ifdef __cplusplus
--- a/wrappers/golang/internal/generator/templates/scalar_field.go.tmpl
+++ b/wrappers/golang/internal/generator/templates/scalar_field.go.tmpl
@@ -5,14 +5,19 @@ import "C"
 {{- end }}

 {{- define "scalar_field_go_imports" }}
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
+	core "github.com/ingonyama-zk/icicle/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
 	"unsafe"
 {{- end }}

 {{- define "scalar_field_funcs" }}
 func GenerateScalars(size int) core.HostSlice[ScalarField] {
-	scalarSlice := make(core.HostSlice[ScalarField], size)
+	scalars := make([]ScalarField, size)
+	for i := range scalars {
+		scalars[i] = ScalarField{}
+	}
+
+	scalarSlice := core.HostSliceFromElements[ScalarField](scalars)

 	cScalars := (*C.scalar_t)(unsafe.Pointer(&scalarSlice[0]))
 	cSize := (C.int)(size)
--- a/wrappers/golang/internal/generator/templates/vec_ops.go.tmpl
+++ b/wrappers/golang/internal/generator/templates/vec_ops.go.tmpl
@@ -1,49 +0,0 @@
-package {{.PackageName}}
-
-// #cgo CFLAGS: -I./include/
-// #include "vec_ops.h"
-import "C"
-
-import (
-	"unsafe"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	cr "github.com/ingonyama-zk/icicle/wrappers/golang/cuda_runtime"
-)
-
-func VecOp(a, b, out core.HostOrDeviceSlice, config core.VecOpsConfig, op core.VecOps) (ret cr.CudaError) {
-	core.VecOpCheck(a, b, out, &config)
-	var cA, cB, cOut *C.scalar_t
-
-	if a.IsOnDevice() {
-		cA = (*C.scalar_t)(a.(core.DeviceSlice).AsPointer())
-	} else {
-		cA = (*C.scalar_t)(unsafe.Pointer(&a.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if b.IsOnDevice() {
-		cB = (*C.scalar_t)(b.(core.DeviceSlice).AsPointer())
-	} else {
-		cB = (*C.scalar_t)(unsafe.Pointer(&b.(core.HostSlice[ScalarField])[0]))
-	}
-
-	if out.IsOnDevice() {
-		cOut = (*C.scalar_t)(out.(core.DeviceSlice).AsPointer())
-	} else {
-		cOut = (*C.scalar_t)(unsafe.Pointer(&out.(core.HostSlice[ScalarField])[0]))
-	}
-
-	cConfig := (*C.VecOpsConfig)(unsafe.Pointer(&config))
-	cSize := (C.int)(a.Len())
-
-	switch op {
-	case core.Sub:
-		ret = (cr.CudaError)(C.{{.Curve}}SubCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Add:
-		ret = (cr.CudaError)(C.{{.Curve}}AddCuda(cA, cB, cSize, cConfig, cOut))
-	case core.Mul:
-		ret = (cr.CudaError)(C.{{.Curve}}MulCuda(cA, cB, cSize, cConfig, cOut))
-	}
-
-	return ret
-}
--- a/wrappers/golang/internal/generator/templates/vec_ops_test.go.tmpl
+++ b/wrappers/golang/internal/generator/templates/vec_ops_test.go.tmpl
@@ -1,33 +0,0 @@
-package {{.PackageName}}
-
-import (
-	"testing"
-
-	"github.com/ingonyama-zk/icicle/wrappers/golang/core"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestVecOps(t *testing.T) {
-	testSize := 1 << 14
-
-	a := GenerateScalars(testSize)
-	b := GenerateScalars(testSize)
-	var scalar ScalarField
-	scalar.One()
-	ones := core.HostSliceWithValue(scalar, testSize)
-
-	out := make(core.HostSlice[ScalarField], testSize)
-	out2 := make(core.HostSlice[ScalarField], testSize)
-	out3 := make(core.HostSlice[ScalarField], testSize)
-
-	cfg := core.DefaultVecOpsConfig()
-
-	VecOp(a, b, out, cfg, core.Add)
-	VecOp(out, b, out2, cfg, core.Sub)
-
-	assert.Equal(t, a, out2)
-
-	VecOp(a, ones, out3, cfg, core.Mul)
-
-	assert.Equal(t, a, out3)
-}
--- a/wrappers/rust/Cargo.toml
+++ b/wrappers/rust/Cargo.toml
@@ -9,17 +9,3 @@ members = [
  "icicle-curves/icicle-bn254",
  "icicle-curves/icicle-grumpkin",
 ]
-exclude = [
-  "icicle-curves/icicle-curve-template",
-]
-
-[workspace.package]
-version = "1.6.0"
-edition = "2021"
-authors = [ "Ingonyama" ]
-homepage = "https://www.ingonyama.com"
-repository = "https://github.com/ingonyama-zk/icicle"
-
-[workspace.dependencies]
-icicle-core = { path = "icicle-core" }
-icicle-cuda-runtime = { path = "icicle-cuda-runtime" }
--- a/wrappers/rust/icicle-core/Cargo.toml
+++ b/wrappers/rust/icicle-core/Cargo.toml
@@ -1,16 +1,16 @@
 [package]
 name = "icicle-core"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
+version = "1.4.0"
+edition = "2021"
+authors = ["Ingonyama"]
 description = "A library for GPU ZK acceleration by Ingonyama"
-homepage.workspace = true
-repository.workspace = true
+homepage = "https://www.ingonyama.com"
+repository = "https://github.com/ingonyama-zk/icicle"


 [dependencies]

-icicle-cuda-runtime = { workspace = true }
+icicle-cuda-runtime = { path = "../icicle-cuda-runtime" }

 ark-ff = { version = "0.4.0", optional = true }
 ark-ec = { version = "0.4.0", optional = true, features = ["parallel"] }
--- a/wrappers/rust/icicle-core/src/msm/tests.rs
+++ b/wrappers/rust/icicle-core/src/msm/tests.rs
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
+use icicle_cuda_runtime::device::{get_device_count, set_device};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,8 +108,6 @@ where
 {
    let test_sizes = [1000, 1 << 16];
    let batch_sizes = [1, 3, 1 << 4];
-    let stream = CudaStream::create().unwrap();
-    warmup(&stream).unwrap();
    for test_size in test_sizes {
        for batch_size in batch_sizes {
            let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -125,6 +123,7 @@ where
            let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
            let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
            let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
+            let stream = CudaStream::create().unwrap();
            points_d
                .copy_from_host_async(&points_cloned, &stream)
                .unwrap();
@@ -148,6 +147,9 @@ where
            stream
                .synchronize()
                .unwrap();
+            stream
+                .destroy()
+                .unwrap();

            let points_ark: Vec<_> = points_h
                .as_slice()
@@ -170,9 +172,6 @@ where
            }
        }
    }
-    stream
-        .destroy()
-        .unwrap();
 }

 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
--- a/wrappers/rust/icicle-core/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon/mod.rs
@@ -293,24 +293,17 @@ macro_rules! impl_poseidon {

 #[macro_export]
 macro_rules! impl_poseidon_tests {
-    (
-      $field:ident
-    ) => {
-        #[test]
-        fn test_poseidon_hash_many() {
-            check_poseidon_hash_many::<$field>()
-        }
-    };
-}
-
-#[macro_export]
-macro_rules! impl_poseidon_custom_config_test {
    (
      $field:ident,
      $field_bytes:literal,
      $field_prefix:literal,
      $partial_rounds:literal
    ) => {
+        #[test]
+        fn test_poseidon_hash_many() {
+            check_poseidon_hash_many::<$field>()
+        }
+
        #[test]
        fn test_poseidon_custom_config() {
            check_poseidon_custom_config::<$field>($field_bytes, $field_prefix, $partial_rounds)
--- a/wrappers/rust/icicle-cuda-runtime/Cargo.toml
+++ b/wrappers/rust/icicle-cuda-runtime/Cargo.toml
@@ -1,15 +1,15 @@
 [package]
 name = "icicle-cuda-runtime"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
+version = "1.4.0"
+edition = "2021"
+authors = [ "Ingonyama" ]
 description = "Ingonyama's Rust wrapper of CUDA runtime"
-homepage.workspace = true
-repository.workspace = true
+homepage = "https://www.ingonyama.com"
+repository = "https://github.com/ingonyama-zk/icicle"
 rust-version = "1.70.0"

 [dependencies]
 bitflags = "1.3"

 [build-dependencies]
-bindgen = "0.69.4"
+bindgen = "*"
--- a/wrappers/rust/icicle-cuda-runtime/build.rs
+++ b/wrappers/rust/icicle-cuda-runtime/build.rs
@@ -27,11 +27,6 @@ fn cuda_lib_path() -> &'static str {
 }

 fn main() {
-    #[cfg(not(any(target_os = "windows", target_os = "linux")))]
-    {
-        panic!("Currently, ICICLE can only be built for Windows or Linux")
-    }
-
    let cuda_runtime_api_path = PathBuf::from(cuda_include_path())
        .join("cuda_runtime_api.h")
        .to_string_lossy()
@@ -77,7 +72,6 @@ fn main() {
        .allowlist_function("cudaMemset")
        .allowlist_function("cudaMemsetAsync")
        .allowlist_function("cudaDeviceGetDefaultMemPool")
-        .allowlist_function("cudaMemGetInfo")
        .rustified_enum("cudaMemcpyKind")
        // Stream Ordered Memory Allocator
        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
--- a/wrappers/rust/icicle-cuda-runtime/src/device.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/device.rs
@@ -1,9 +1,7 @@
 use crate::{
-    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
+    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
    error::{CudaResult, CudaResultWrap},
-    stream::CudaStream,
 };
-use std::mem::MaybeUninit;

 pub fn set_device(device_id: usize) -> CudaResult<()> {
    unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -18,16 +16,3 @@ pub fn get_device() -> CudaResult<usize> {
    let mut device_id = 0;
    unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
-
-// This function pre-allocates default memory pool and warms the GPU up
-// so that subsequent memory allocations and other calls are not slowed down
-pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
-    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
-    let mut free_memory: usize = 0;
-    let mut _total_memory: usize = 0;
-    unsafe {
-        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
-        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
-        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
-    }
-}
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -47,18 +47,14 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {

    pub fn as_mut_slice(&mut self) -> &mut [T] {
        match self {
-            Self::Device(_, _) => {
-                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
-            }
+            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
            Self::Host(v) => v.as_mut_slice(),
        }
    }

    pub fn as_slice(&self) -> &[T] {
        match self {
-            Self::Device(_, _) => {
-                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
-            }
+            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
            Self::Host(v) => v.as_slice(),
        }
    }
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml
@@ -1,15 +1,15 @@
 [package]
 name = "icicle-bls12-377"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
+version = "1.4.0"
+edition = "2021"
+authors = [ "Ingonyama" ]
 description = "Rust wrapper for the CUDA implementation of BLS12-377 pairing friendly elliptic curve by Ingonyama"
-homepage.workspace = true
-repository.workspace = true
+homepage = "https://www.ingonyama.com"
+repository = "https://github.com/ingonyama-zk/icicle"

 [dependencies]
-icicle-core = { workspace = true }
-icicle-cuda-runtime = { workspace = true }
+icicle-core = { path = "../../icicle-core" }
+icicle-cuda-runtime = { path = "../../icicle-cuda-runtime" }
 ark-bls12-377 = { version = "0.4.0", optional = true }

 [build-dependencies]
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/poseidon/mod.rs
@@ -20,9 +20,8 @@ impl_poseidon!("bw6_761", bw6_761, BaseField, BaseCfg);
 #[cfg(test)]
 pub(crate) mod tests {
    use crate::curve::ScalarField;
+    use icicle_core::impl_poseidon_tests;
    use icicle_core::poseidon::tests::*;
-    use icicle_core::{impl_poseidon_custom_config_test, impl_poseidon_tests};

-    impl_poseidon_tests!(ScalarField);
-    impl_poseidon_custom_config_test!(ScalarField, 32, "bls12_377", 56);
+    impl_poseidon_tests!(ScalarField, 32, "bls12_377", 56);
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml
@@ -1,15 +1,15 @@
 [package]
 name = "icicle-bls12-381"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
+version = "1.4.0"
+edition = "2021"
+authors = [ "Ingonyama" ]
 description = "Rust wrapper for the CUDA implementation of BLS12-381 pairing friendly elliptic curve by Ingonyama"
-homepage.workspace = true
-repository.workspace = true
+homepage = "https://www.ingonyama.com"
+repository = "https://github.com/ingonyama-zk/icicle"

 [dependencies]
-icicle-core = { workspace = true }
-icicle-cuda-runtime = { workspace = true }
+icicle-core = { path = "../../icicle-core" }
+icicle-cuda-runtime = { path = "../../icicle-cuda-runtime" }
 ark-bls12-381 = { version = "0.4.0", optional = true }

 [build-dependencies]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ImmanuelSegol	b5364c24dd	refactor	2024-02-28 11:41:09 -04:00
ImmanuelSegol	c2b73aee8d	refactor	2024-02-28 11:37:25 -04:00
ImmanuelSegol	49663d89d3	refactor	2024-02-28 11:29:29 -04:00
ImmanuelSegol	dd509f095b	refactor	2024-02-28 11:27:33 -04:00
ImmanuelSegol	9449ffd7cb	refactor	2024-02-28 11:19:59 -04:00