Mirror of https://github.com/pseXperiments/icicle.git (synced 2026-05-01 03:00:14 -04:00)

Compare commits: v2.2.0 ... mini-cours (50 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 6b9732e67e | |
| | 3d8a6fbca2 | |
| | dadc5fcc24 | |
| | 8550aeddd3 | |
| | 1e44f59b37 | |
| | c4105aa8d5 | |
| | b754e66153 | |
| | a0fa0c66b6 | |
| | 0fe27bd480 | |
| | 0c9ae9f4b4 | |
| | 714ea4a213 | |
| | c6a4c2a6a7 | |
| | e1ac80e8ce | |
| | 7831f7bd0f | |
| | de25b6e203 | |
| | 69383e6c73 | |
| | c305aade5d | |
| | 87bdf04a19 | |
| | e152977843 | |
| | 3d01c09c82 | |
| | 8936d9c800 | |
| | af9ec76506 | |
| | cdd99d2a46 | |
| | 3e551762c0 | |
| | 37c22e81e7 | |
| | 69e73ffa3e | |
| | 512e1ca372 | |
| | e19a869691 | |
| | 9c55d888ae | |
| | 18f51de56c | |
| | 33b1f3c794 | |
| | 3a276ef23c | |
| | 8e62bde16d | |
| | 417ca77f61 | |
| | 8911a32135 | |
| | c6f6e61d60 | |
| | 4e3aa63d2f | |
| | db298aefc1 | |
| | 19a9b76d64 | |
| | 1e343f17a3 | |
| | cfea6ebb3b | |
| | 76a82bf88e | |
| | b8310d577e | |
| | 49c7fa4b28 | |
| | 02059fcfaa | |
| | 4496520a10 | |
| | 88a6966a4b | |
| | 9c1afe8a44 | |
| | 972b924bc0 | |
| | 230a1da512 | |
12 .github/changed-files.yml (vendored)
@@ -3,8 +3,11 @@ golang:
  - wrappers/golang/**/*.h
  - wrappers/golang/**/*.tmpl
  - go.mod
  - .github/workflows/golang.yml
rust:
  - wrappers/rust/**/*
  - '!wrappers/rust/README.md'
  - .github/workflows/rust.yml
cpp:
  - icicle/**/*.cu
  - icicle/**/*.cuh
@@ -12,4 +15,11 @@ cpp:
  - icicle/**/*.hpp
  - icicle/**/*.c
  - icicle/**/*.h
-  - icicle/CMakeLists.txt
+  - icicle/CMakeLists.txt
+  - .github/workflows/cpp_cuda.yml
+  - icicle/cmake/Common.cmake
+  - icicle/cmake/CurvesCommon.cmake
+  - icicle/cmake/FieldsCommon.cmake
+examples:
+  - examples/**/*
+  - .github/workflows/examples.yml
5 .github/workflows/check-changed-files.yml (vendored)
@@ -12,6 +12,9 @@ on:
      cpp_cuda:
        description: "Flag for if C++/CUDA files changed"
        value: ${{ jobs.check-changed-files.outputs.cpp_cuda }}
+     examples:
+       description: "Flag for if example files changed"
+       value: ${{ jobs.check-changed-files.outputs.examples }}

jobs:
  check-changed-files:
@@ -21,6 +24,7 @@ jobs:
      golang: ${{ steps.changed_files.outputs.golang }}
      rust: ${{ steps.changed_files.outputs.rust }}
      cpp_cuda: ${{ steps.changed_files.outputs.cpp_cuda }}
+     examples: ${{ steps.changed_files.outputs.examples }}
    steps:
      - name: Checkout Repo
        uses: actions/checkout@v4
@@ -37,3 +41,4 @@ jobs:
          echo "golang=${{ steps.changed-files-yaml.outputs.golang_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "rust=${{ steps.changed-files-yaml.outputs.rust_any_modified }}" >> "$GITHUB_OUTPUT"
          echo "cpp_cuda=${{ steps.changed-files-yaml.outputs.cpp_any_modified }}" >> "$GITHUB_OUTPUT"
+         echo "examples=${{ steps.changed-files-yaml.outputs.examples_any_modified }}" >> "$GITHUB_OUTPUT"
2 .github/workflows/cpp_cuda.yml (vendored)
@@ -55,7 +55,7 @@ jobs:
        if: needs.check-changed-files.outputs.cpp_cuda == 'true'
        run: |
          mkdir -p build && rm -rf build/*
-         cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.field.build_args }} -S . -B build
+         cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} -S . -B build
          cmake --build build -j
      - name: Run C++ curve Tests
        working-directory: ./icicle/build/tests
4 .github/workflows/examples.yml (vendored)
@@ -33,7 +33,7 @@ jobs:
        uses: actions/checkout@v4
      - name: c++ examples
        working-directory: ./examples/c++
-       if: needs.check-changed-files.outputs.cpp_cuda == 'true'
+       if: needs.check-changed-files.outputs.cpp_cuda == 'true' || needs.check-changed-files.outputs.examples == 'true'
        run: |
          # loop over all directories in the current directory
          for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
@@ -47,7 +47,7 @@ jobs:
          done
      - name: Rust examples
        working-directory: ./examples/rust
-       if: needs.check-changed-files.outputs.rust == 'true'
+       if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.examples == 'true'
        run: |
          # loop over all directories in the current directory
          for dir in $(find . -mindepth 1 -maxdepth 1 -type d); do
47 .github/workflows/rust.yml (vendored)
@@ -79,26 +79,27 @@ jobs:
-          cargo test --release --verbose -- --ignored
+          cargo test --release --verbose

-  build-windows:
-    name: Build on Windows
-    runs-on: windows-2022
-    needs: check-changed-files
-    steps:
-    - name: Checkout Repo
-      uses: actions/checkout@v4
-    - name: Download and Install Cuda
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.11
-      with:
-        cuda: '12.0.0'
-        method: 'network'
-        # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
-        sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
-    - name: Build targets
-      working-directory: ./wrappers/rust
-      if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
-      env:
-        CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
-      # Building from the root workspace will build all members of the workspace by default
-      run: cargo build --release --verbose
+  # build-windows:
+  #   name: Build on Windows
+  #   runs-on: windows-2022
+  #   needs: check-changed-files
+  #   steps:
+  #   - name: Checkout Repo
+  #     uses: actions/checkout@v4
+  #   - name: Download and Install Cuda
+  #     if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     id: cuda-toolkit
+  #     uses: Jimver/cuda-toolkit@v0.2.11
+  #     with:
+  #       cuda: '12.0.0'
+  #       method: 'network'
+  #       # https://docs.nvidia.com/cuda/archive/12.0.0/cuda-installation-guide-microsoft-windows/index.html
+  #       sub-packages: '["cudart", "nvcc", "thrust", "visual_studio_integration"]'
+  #   - name: Build targets
+  #     working-directory: ./wrappers/rust
+  #     if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
+  #     env:
+  #       CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+  #       CUDA_ARCH: 50 # Using CUDA_ARCH=50 env variable since the CI machines have no GPUs
+  #     # Building from the root workspace will build all members of the workspace by default
+  #     run: cargo build --release --verbose
2 .github/workflows/test-deploy-docs.yml (vendored)
@@ -5,7 +5,7 @@ on:
    branches:
      - main
    paths:
-     - 'docs/*'
+     - 'docs/**'

jobs:
  test-deploy:
@@ -4,9 +4,9 @@ To understand the theory behind MSM pre computation technique refer to Niall Emm

## Core package

-### MSM PrecomputeBases
+### MSM PrecomputePoints

-`PrecomputeBases` and `G2PrecomputeBases` exists for all supported curves.
+`PrecomputePoints` and `G2PrecomputePoints` exist for all supported curves.

#### Description

@@ -14,21 +14,20 @@ This function extends each provided base point $(P)$ with its multiples $(2^lP,

The precomputation process is crucial for optimizing MSM operations, especially when dealing with large sets of points and scalars. By precomputing and storing multiples of the base points, the MSM function can more efficiently compute the scalar-point multiplications.

-#### `PrecomputeBases`
+#### `PrecomputePoints`

-Precomputes bases for MSM by extending each base point with its multiples.
+Precomputes points for MSM by extending each base point with its multiples.

```go
-func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
```

##### Parameters

- **`points`**: A slice of the original affine points to be extended with their multiples.
-- **`precomputeFactor`**: Determines the total number of points to precompute for each base point.
-- **`c`**: Currently unused; reserved for future compatibility.
-- **`ctx`**: CUDA device context specifying the execution environment.
-- **`outputBases`**: The device slice allocated for storing the extended bases.
+- **`msmSize`**: The size of a single MSM, used to determine optimal parameters.
+- **`cfg`**: The MSM configuration parameters.
+- **`outputBases`**: The device slice allocated for storing the extended points.

##### Example

@@ -50,28 +49,27 @@ func main() {
	var precomputeOut core.DeviceSlice
	precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())

-	err := bn254.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+	err := bn254.PrecomputePoints(points, 1024, &cfg, precomputeOut)
	if err != cr.CudaSuccess {
		log.Fatalf("PrecomputeBases failed: %v", err)
	}
}
```

-#### `G2PrecomputeBases`
+#### `G2PrecomputePoints`

-This method is the same as `PrecomputeBases` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.
+This method is the same as `PrecomputePoints` but for G2 points. Extends each G2 curve base point with its multiples for optimized MSM computations.

```go
-func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError
+func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError
```

##### Parameters

-- **`points`**: A slice of G2 curve points to be extended.
-- **`precomputeFactor`**: The total number of points to precompute for each base.
-- **`c`**: Reserved for future use to ensure compatibility with MSM operations.
-- **`ctx`**: Specifies the CUDA device context for execution.
-- **`outputBases`**: Allocated device slice for the extended bases.
+- **`points`**: A slice of the original affine points to be extended with their multiples.
+- **`msmSize`**: The size of a single MSM, used to determine optimal parameters.
+- **`cfg`**: The MSM configuration parameters.
+- **`outputBases`**: The device slice allocated for storing the extended points.

##### Example

@@ -93,20 +91,9 @@ func main() {
	var precomputeOut core.DeviceSlice
	precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())

-	err := g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
+	err := g2.G2PrecomputePoints(points, 1024, 0, &cfg, precomputeOut)
	if err != cr.CudaSuccess {
		log.Fatalf("PrecomputeBases failed: %v", err)
	}
}
```

-### Benchmarks
-
-Benchmarks where performed on a Nvidia RTX 3090Ti.
-
-| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
-| ------------- | ------------- | ------------- | ------------- | ------------- |
-| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
-| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
-| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
-| 8 | 10.6 | 73.7 | 17.2 | 116.0 |
@@ -6,52 +6,53 @@
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	// Obtain the default MSM configuration.
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()

	// Define the size of the problem, here 2^18.
	size := 1 << 18

	// Generate scalars and points for the MSM operation.
	scalars := bn254.GenerateScalars(size)
	points := bn254.GenerateAffinePoints(size)

	// Create a CUDA stream for asynchronous operations.
	stream, _ := cr.CreateStream()
	var p bn254.Projective

	// Allocate memory on the device for the result of the MSM operation.
	var out core.DeviceSlice
	_, e := out.MallocAsync(p.Size(), p.Size(), stream)

	if e != cr.CudaSuccess {
		panic(e)
	}

	// Set the CUDA stream in the MSM configuration.
	cfg.Ctx.Stream = &stream
	cfg.IsAsync = true

	// Perform the MSM operation.
-	e = bn254.Msm(scalars, points, &cfg, out)
+	e = bn254_msm.Msm(scalars, points, &cfg, out)

	if e != cr.CudaSuccess {
		panic(e)
	}

	// Allocate host memory for the results and copy the results from the device.
	outHost := make(core.HostSlice[bn254.Projective], 1)
	cr.SynchronizeStream(&stream)
	outHost.CopyFromDevice(&out)

	// Free the device memory allocated for the results.
	out.Free()
}
```
@@ -121,7 +122,7 @@ func GetDefaultMSMConfig() MSMConfig

## How do I toggle between the supported algorithms?

-When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle accumulation and `cfg.Ctx.IsBigTriangle = false` will activate Bucket accumulation.
+When creating your MSM Config you may state which algorithm you wish to use. `cfg.Ctx.IsBigTriangle = true` will activate Large triangle reduction and `cfg.Ctx.IsBigTriangle = false` will activate iterative reduction.

```go
...
@@ -151,6 +152,10 @@ out.Malloc(batchSize*p.Size(), p.Size())
...
```

+## Parameters for optimal performance
+
+Please refer to the [primitive description](../primitives/msm#choosing-optimal-parameters)
+
## Support for G2 group

To activate G2 support first you must make sure you are building the static libraries with G2 feature enabled as described in the [Golang building instructions](../golang-bindings.md#using-icicle-golang-bindings-in-your-project).
@@ -169,23 +174,23 @@ This package include `G2Projective` and `G2Affine` points as well as a `G2Msm` m
package main

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	g2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
)

func main() {
-	cfg := bn254.GetDefaultMSMConfig()
+	cfg := core.GetDefaultMSMConfig()
	size := 1 << 12
	batchSize := 3
	totalSize := size * batchSize
	scalars := bn254.GenerateScalars(totalSize)
	points := g2.G2GenerateAffinePoints(totalSize)

	var p g2.G2Projective
	var out core.DeviceSlice
	out.Malloc(batchSize*p.Size(), p.Size())
	g2.G2Msm(scalars, points, &cfg, out)
}

```
@@ -1,5 +1,9 @@
# Polynomial API Overview

+:::note
+Read our paper on the Polynomials API in ICICLE v2 by clicking [here](https://eprint.iacr.org/2024/973).
+:::
+
## Introduction

The Polynomial API offers a robust framework for polynomial operations within a computational environment. It's designed for flexibility and efficiency, supporting a broad range of operations like arithmetic, evaluation, and manipulation, all while abstracting from the computation and storage specifics. This enables adaptability to various backend technologies, employing modern C++ practices.
@@ -128,12 +132,13 @@ auto H = (A*B-C).divide_by_vanishing_polynomial(N);

### Evaluation

-Evaluate polynomials at arbitrary domain points or across a domain.
+Evaluate polynomials at arbitrary domain points, across a domain or on a roots-of-unity domain.

```cpp
Image operator()(const Domain& x) const; // evaluate f(x)
void evaluate(const Domain* x, Image* evals /*OUT*/) const;
void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const; // caller allocates memory
```

Example:
@@ -147,18 +152,13 @@ uint64_t domain_size = ...;
auto domain = /*build domain*/; // host or device memory
auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
f.evaluate_on_domain(domain, domain_size, evaluations);
+
+// evaluate f(x) on roots of unity domain
+uint64_t domain_log_size = ...;
+auto evaluations_rou_domain = std::make_unique<scalar_t[]>(1 << domain_log_size); // can be device memory too
+f.evaluate_on_rou_domain(domain_log_size, evaluations_rou_domain);
```

-:::note
-For special domains such as roots of unity, this method is not the most efficient for two reasons:
-
-- Need to build the domain of size N.
-- The implementation is not trying to identify this special domain.
-
-Therefore the computation is typically $O(n^2)$ rather than $O(n \log n)$.
-See the 'device views' section for more details.
-:::

### Manipulations

Beyond arithmetic, the API supports efficient polynomial manipulations:
@@ -255,7 +255,7 @@ auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);

#### Views

-The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state, such as coefficients or evaluations without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
+The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.

##### What is a Memory View?

@@ -265,7 +265,7 @@ A memory view is essentially a pointer to data stored in device memory. By provi

Memory views are extremely versatile and can be employed in various computational contexts such as:

-- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM), or for constructing Merkle trees without duplicating the underlying data.
+- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM).
- **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.

##### Obtaining and Using Views

@@ -275,9 +275,6 @@ To create and use views within the Polynomial API, functions are provided to obt

```cpp
// Obtain a view of the polynomial's coefficients
std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
-// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
-std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
-  get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
```

Example usage:
@@ -328,22 +325,7 @@ if (coeff_view.isValid()) {
}
```

-#### Evaluations View: Accessing Polynomial Evaluations Efficiently
-
-The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
-This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
-
-:::warning
-Invalid request: requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
-:::
-
-```cpp
-// Assume a polynomial `p` of degree N
-auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
-
-// Use the evaluations view to perform further computations or visualizations
-process_polynomial_evaluations(evals_view.get(), size, device_id);
-```

## Multi-GPU Support with CUDA Backend
@@ -54,36 +54,142 @@ You can learn more about how MSMs work from this [video](https://www.youtube.com
- [Golang](../golang-bindings/msm.md)
- [Rust](../rust-bindings//msm.md)

-## Supported algorithms
+## Algorithm description

-Our MSM implementation supports two algorithms `Bucket accumulation` and `Large triangle accumulation`.
+We follow the bucket method algorithm. The GPU implementation consists of four phases:

-### Bucket accumulation
+1. Preparation phase - The scalars are split into smaller scalars of `c` bits each. These are the bucket indices. The points are grouped according to their corresponding bucket index and the buckets are sorted by size.
+2. Accumulation phase - Each bucket accumulates all of its points using a single thread. More than one thread is assigned to large buckets, in proportion to their size. A bucket is considered large if its size is above the large bucket threshold that is determined by the `large_bucket_factor` parameter. The large bucket threshold is the expected average bucket size times the `large_bucket_factor` parameter.
+3. Buckets Reduction phase - Bucket results are multiplied by their corresponding bucket number and each bucket module is reduced to a small number of final results. By default, this is done by an iterative algorithm which is highly parallel. Setting `is_big_triangle` to `true` will switch this phase to the running sum algorithm described in the above YouTube talk, which is much less parallel.
+4. Final accumulation phase - The final results from the last phase are accumulated using the double-and-add algorithm.
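To make the preparation phase concrete, here is a minimal host-side sketch (an illustration only, not an ICICLE API; the name `split_scalar` is hypothetical) of how a little-endian scalar is split into `c`-bit window values, which serve as the bucket indices:

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: split a little-endian byte-encoded scalar into c-bit
// windows. Each window value is a bucket index used in the accumulation phase.
std::vector<uint32_t> split_scalar(const uint8_t* scalar, int bitsize, int c)
{
  std::vector<uint32_t> windows;
  for (int bit = 0; bit < bitsize; bit += c) {
    uint32_t w = 0;
    for (int j = 0; j < c && bit + j < bitsize; j++) {
      w |= ((scalar[(bit + j) / 8] >> ((bit + j) % 8)) & 1u) << j;
    }
    windows.push_back(w);
  }
  return windows;
}
```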
-The Bucket Accumulation algorithm is a method of dividing the overall MSM task into smaller, more manageable sub-tasks. It involves partitioning scalars and their corresponding points into different "buckets" based on the scalar values.
+## Batched MSM

-Bucket Accumulation can be more parallel-friendly because it involves dividing the computation into smaller, independent tasks, distributing scalar-point pairs into buckets and summing points within each bucket. This division makes it well suited for parallel processing on GPUs.
+The MSM supports batch mode - running multiple MSMs in parallel. It's always better to use batch mode instead of running single MSMs serially, as long as there is enough memory available. We support running a batch of MSMs that share the same points as well as a batch of MSMs that use different points.
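As a small sketch of the bookkeeping this implies (a hypothetical helper, not part of the ICICLE API), the number of distinct points a batch needs depends on whether the MSMs share points:

```cpp
// Sketch: how many distinct points an MSM batch needs (see MSMConfig below).
// If all MSMs in the batch reuse one set of points, points_size equals the
// size of a single MSM; otherwise it is msm_size * batch_size.
int points_size_for_batch(int msm_size, int batch_size, bool shared_points)
{
  return shared_points ? msm_size : msm_size * batch_size;
}
```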
-#### When should I use Bucket accumulation?
+## MSM configuration

-In scenarios involving large MSM computations with many scalar-point pairs, the ability to parallelize operations makes Bucket Accumulation more efficient. The larger the MSM task, the more significant the potential gains from parallelization.
+```cpp
+/**
+ * @struct MSMConfig
+ * Struct that encodes MSM parameters to be passed into the [MSM](@ref MSM) function. The intended use of this struct
+ * is to create it using [default_msm_config](@ref default_msm_config) function and then you'll hopefully only need to
+ * change a small number of default values for each of your MSMs.
+ */
+struct MSMConfig {
+  device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
+  int points_size;         /**< Number of points in the MSM. If a batch of MSMs needs to be computed, this should be
+                            *   a number of different points. So, if each MSM re-uses the same set of points, this
+                            *   variable is set equal to the MSM size. And if every MSM uses a distinct set of
+                            *   points, it should be set to the product of MSM size and [batch_size](@ref
+                            *   batch_size). Default value: 0 (meaning it's equal to the MSM size). */
+  int precompute_factor;   /**< The number of extra points to pre-compute for each point. See the
+                            *   [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
+                            *   there needs to be equal to the one used here. Larger values decrease the
+                            *   number of computations to make, on-line memory footprint, but increase the static
+                            *   memory footprint. Default value: 1 (i.e. don't pre-compute). */
+  int c;                   /**< \f$ c \f$ value, or "window bitsize" which is the main parameter of the "bucket
+                            *   method" that we use to solve the MSM problem. As a rule of thumb, larger value
+                            *   means more on-line memory footprint but also more parallelism and less computational
+                            *   complexity (up to a certain point). Currently pre-computation is independent of
+                            *   \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
+                            *   [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
+                            *   Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
+  int bitsize;             /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
+                            *   but if a different (better) upper bound is known, it should be reflected in this
+                            *   variable. Default value: 0 (set to the bitsize of scalar field). */
+  int large_bucket_factor; /**< Variable that controls how sensitive the algorithm is to the buckets that occur
+                            *   very frequently. Useful for efficient treatment of non-uniform distributions of
+                            *   scalars and "top windows" with few bits. Can be set to 0 to disable separate
+                            *   treatment of large buckets altogether. Default value: 10. */
+  int batch_size;          /**< The number of MSMs to compute. Default value: 1. */
+  bool are_scalars_on_device;       /**< True if scalars are on device and false if they're on host. Default value:
+                                     *   false. */
+  bool are_scalars_montgomery_form; /**< True if scalars are in Montgomery form and false otherwise. Default value:
+                                     *   true. */
+  bool are_points_on_device; /**< True if points are on device and false if they're on host. Default value: false. */
+  bool are_points_montgomery_form; /**< True if coordinates of points are in Montgomery form and false otherwise.
+                                    *   Default value: true. */
+  bool are_results_on_device; /**< True if the results should be on device and false if they should be on host. If set
+                               *   to false, `is_async` won't take effect because a synchronization is needed to
+                               *   transfer results to the host. Default value: false. */
+  bool is_big_triangle;       /**< Whether to do "bucket accumulation" serially. Decreases computational complexity
+                               *   but also greatly decreases parallelism, so only suitable for large batches of MSMs.
+                               *   Default value: false. */
+  bool is_async;              /**< Whether to run the MSM asynchronously. If set to true, the MSM function will be
+                               *   non-blocking and you'd need to synchronize it explicitly by running
+                               *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the MSM
+                               *   function will block the current CPU thread. */
+};
+```

-### Large triangle accumulation
+## Choosing optimal parameters

-Large Triangle Accumulation is a method for optimizing MSM which focuses on reducing the number of point doublings in the computation. This algorithm is based on the observation that the number of point doublings can be minimized by structuring the computation in a specific manner.
+`is_big_triangle` should be `false` in almost all cases. It might provide better results only for very small MSMs (smaller than `2^8`) with a large batch (larger than 100), but this should be tested per scenario.
+Large buckets exist in two cases:
+1. When the scalar distribution isn't uniform.
+2. When `c` does not divide the scalar bit-size.

-#### When should I use Large triangle accumulation?
+A `large_bucket_factor` equal to 10 yields good results for most cases, but it's best to fine-tune this parameter per `c` and per scalar distribution.
+The two most important parameters for performance are `c` and the `precompute_factor`. They affect the number of EC additions as well as the memory size. When the points are not known in advance we cannot use precomputation. In this case the best `c` value is usually around $log_2(msmSize) - 4$. However, in most protocols the points are known in advance and precomputation can be used unless limited by memory. Usually it's best to use maximum precomputation (such that we end up with only a single bucket module) combined with a `c` value around $log_2(msmSize) - 1$.
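A minimal sketch of the rule of thumb above (a heuristic only, not an ICICLE API; always benchmark per GPU and curve):

```cpp
#include <cmath>

// Heuristic c selection following the text above: roughly log2(msm_size) - 4
// without precomputation, and log2(msm_size) - 1 with maximum precomputation.
int suggested_c(int msm_size, bool with_precomputation)
{
  int log_size = static_cast<int>(std::round(std::log2(static_cast<double>(msm_size))));
  int c = with_precomputation ? log_size - 1 : log_size - 4;
  return c < 1 ? 1 : c;
}
```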
-The Large Triangle Accumulation algorithm is more sequential in nature, as it builds upon each step sequentially (accumulating sums and then performing doubling). This structure can make it less suitable for parallelization but potentially more efficient for a **large batch of smaller MSM computations**.
+## Memory usage estimation

-## MSM Modes
+The main memory requirements of the MSM are the following:

-ICICLE MSM also supports two different modes `Batch MSM` and `Single MSM`
+- Scalars - `sizeof(scalar_t) * msm_size * batch_size`
+- Scalar indices - `~6 * sizeof(unsigned) * nof_bucket_modules * msm_size * batch_size`
+- Points - `sizeof(affine_t) * msm_size * precomp_factor * batch_size`
+- Buckets - `sizeof(projective_t) * nof_bucket_modules * 2^c * batch_size`

-Batch MSM allows you to run many MSMs with a single API call while single MSM will launch a single MSM computation.
+where `nof_bucket_modules = ceil(ceil(bitsize / c) / precompute_factor)`

-### Which mode should I use?
+During the MSM computation, memory for the scalars and scalar indices is allocated first; the indices are then freed, and the points and buckets are allocated. A good estimate of the required memory is therefore given by the following formula:

-This decision is highly dependent on your use case and design. However, if your design allows for it, using batch mode can significantly improve efficiency. Batch processing allows you to perform multiple MSMs simultaneously, leveraging the parallel processing capabilities of GPUs.
+$max(scalars + scalarIndices, scalars + points + buckets)$

-Single MSM mode should be used when batching isn't possible or when you have to run a single MSM.
+This gives a good approximation within 10% of the actual required memory for most cases.
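As a worked version of the formulas above, here is a minimal sketch (not an ICICLE API; the element byte sizes vary per curve and are passed in by the caller as assumptions):

```cpp
#include <algorithm>
#include <cstddef>

// Sketch of the memory estimate above, combining the four requirements and
// max(scalars + scalarIndices, scalars + points + buckets).
size_t msm_memory_estimate(
  size_t msm_size, size_t batch_size, size_t bitsize, size_t c, size_t precompute_factor,
  size_t sizeof_scalar, size_t sizeof_affine, size_t sizeof_projective)
{
  size_t nof_bucket_modules = ((bitsize + c - 1) / c + precompute_factor - 1) / precompute_factor;
  size_t scalars = sizeof_scalar * msm_size * batch_size;
  size_t indices = 6 * sizeof(unsigned) * nof_bucket_modules * msm_size * batch_size;
  size_t points = sizeof_affine * msm_size * precompute_factor * batch_size;
  size_t buckets = sizeof_projective * nof_bucket_modules * (size_t{1} << c) * batch_size;
  return std::max(scalars + indices, scalars + points + buckets);
}
```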
+## Example parameters
+
+Here is a useful table showing optimal parameters for different MSMs. They are optimal for the BLS12-377 curve when running on an NVIDIA GeForce RTX 3090 Ti. This is the configuration used:
+
+```cpp
+msm::MSMConfig config = {
+  ctx,            // DeviceContext
+  N,              // points_size
+  precomp_factor, // precompute_factor
+  user_c,         // c
+  0,              // bitsize
+  10,             // large_bucket_factor
+  batch_size,     // batch_size
+  false,          // are_scalars_on_device
+  false,          // are_scalars_montgomery_form
+  true,           // are_points_on_device
+  false,          // are_points_montgomery_form
+  true,           // are_results_on_device
+  false,          // is_big_triangle
+  true            // is_async
+};
+```
+
+Here are the parameters and the results for the different cases:
+
+| MSM size (log2) | Batch size | Precompute factor | c | Memory estimation (GB) | Actual memory (GB) | Single MSM time (ms) |
+| --- | --- | --- | --- | --- | --- | --- |
+| 10 | 1 | 1 | 9 | 0.00227 | 0.00277 | 9.2 |
+| 10 | 1 | 23 | 11 | 0.00259 | 0.00272 | 1.76 |
+| 10 | 1000 | 1 | 7 | 0.94 | 1.09 | 0.051 |
+| 10 | 1000 | 23 | 11 | 2.59 | 2.74 | 0.025 |
+| 15 | 1 | 1 | 11 | 0.011 | 0.019 | 9.9 |
+| 15 | 1 | 16 | 16 | 0.061 | 0.065 | 2.4 |
+| 15 | 100 | 1 | 11 | 1.91 | 1.92 | 0.84 |
+| 15 | 100 | 19 | 14 | 6.32 | 6.61 | 0.56 |
+| 18 | 1 | 1 | 14 | 0.128 | 0.128 | 14.4 |
+| 18 | 1 | 15 | 17 | 0.40 | 0.42 | 5.9 |
+| 22 | 1 | 1 | 17 | 1.64 | 1.65 | 68 |
+| 22 | 1 | 13 | 21 | 5.67 | 5.94 | 54 |
+| 24 | 1 | 1 | 18 | 6.58 | 6.61 | 232 |
+| 24 | 1 | 7 | 21 | 12.4 | 13.4 | 199 |
+
+The optimal values can vary per GPU and per curve. It is best to try a few combinations until you get the best results for your specific case.
@@ -16,7 +16,7 @@ Poseidon starts with the initialization of its internal state, which is composed

This is done to prevent collisions and to prevent certain cryptographic attacks by ensuring that the internal state is sufficiently mixed and unpredictable.

-
+

## Applying full and partial rounds

@@ -26,9 +26,9 @@ To generate a secure hash output, the algorithm goes through a series of "full r

### Full rounds

-
+

-**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The functions S-box involves raising each element of the state to a certain power denoted by `α` a member of the finite field defined by the prime `p`; `α` can be different depending on the the implementation and user configuration.
+**Uniform Application of S-box:** In full rounds, the S-box (a non-linear transformation) is applied uniformly to every element of the hash function's internal state. This ensures a high degree of mixing and diffusion, contributing to the hash function's security. The function's S-box involves raising each element of the state to a certain power denoted by `α`, a member of the finite field defined by the prime `p`; `α` can be different depending on the implementation and user configuration.

**Linear Transformation:** After applying the S-box, a linear transformation is performed on the state. This involves multiplying the state by an MDS (Maximum Distance Separable) matrix, which further diffuses the transformations applied by the S-box across the entire state.

@@ -36,14 +36,14 @@ To generate a secure hash output, the algorithm goes through a series of "full r

### Partial Rounds



**Selective Application of S-Box:** Partial rounds apply the S-box transformation to only one element of the internal state per round, rather than to all elements. This selective application significantly reduces the computational complexity of the hash function without compromising its security. The choice of which element to apply the S-box to can follow a specific pattern or be fixed, depending on the design of the hash function.

**Linear Transformation and Round Constants:** A linear transformation is performed and round constants are added. The linear transformation in partial rounds can be designed to be less computationally intensive (this is done by using a sparse matrix) than in full rounds, further optimizing the function's efficiency.

The user of Poseidon can often choose how many partial or full rounds they wish to apply; more full rounds will increase security but degrade performance. The choice and balance are highly dependent on the use case.
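As a toy illustration of the S-box alone (real Poseidon instances operate in the curve's scalar field with curve-specific constants; this standalone snippet, with hypothetical names, only shows the shape of x raised to α modulo a prime):

```cpp
#include <cstdint>

// Toy S-box: x -> x^alpha mod p via square-and-multiply.
// Uses the compiler extension __int128 for the intermediate product,
// so this is only a sketch for small word-sized fields, not a real field element.
uint64_t sbox(uint64_t x, uint64_t alpha, uint64_t p)
{
  uint64_t result = 1 % p;
  uint64_t base = x % p;
  while (alpha > 0) {
    if (alpha & 1) result = (unsigned __int128)result * base % p;
    base = (unsigned __int128)base * base % p;
    alpha >>= 1;
  }
  return result;
}
```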


## Using Poseidon

ICICLE Poseidon is implemented for GPU and parallelization is performed for each element of the state rather than for each state.
@@ -59,7 +59,7 @@ So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elem

Poseidon is extremely customizable and using different constants will produce different hashes, security levels and performance results.

-We support pre-calculated and optimized constants for each of the [supported curves](#supported-curves).The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.
+We support pre-calculated and optimized constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants) and are labeled clearly per curve `<curve_name>_poseidon.h`.

If you wish to generate your own constants you can use our python script which can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon/constants/generate_parameters.py).
@@ -2,26 +2,24 @@

To understand the theory behind MSM pre computation technique refer to Niall Emmart's [talk](https://youtu.be/KAWlySN7Hm8?feature=shared&t=1734).

-## `precompute_bases`
+## `precompute_points`

Precomputes bases for the multi-scalar multiplication (MSM) by extending each base point with its multiples, facilitating more efficient MSM calculations.

```rust
-pub fn precompute_bases<C: Curve + MSM<C>>(
-    points: &HostOrDeviceSlice<Affine<C>>,
-    precompute_factor: i32,
-    _c: i32,
-    ctx: &DeviceContext,
-    output_bases: &mut HostOrDeviceSlice<Affine<C>>,
+pub fn precompute_points<C: Curve + MSM<C>>(
+    points: &(impl HostOrDeviceSlice<Affine<C>> + ?Sized),
+    msm_size: i32,
+    cfg: &MSMConfig,
+    output_bases: &mut DeviceSlice<Affine<C>>,
) -> IcicleResult<()>
```

### Parameters

- **`points`**: The original set of affine points (\(P_1, P_2, ..., P_n\)) to be used in the MSM. For batch MSM operations, this should include all unique points concatenated together.
-- **`precompute_factor`**: Specifies the total number of points to precompute for each base, including the base point itself. This parameter directly influences the memory requirements and the potential speedup of the MSM operation.
-- **`_c`**: Currently unused. Intended for future use to align with the `c` parameter in `MSMConfig`, ensuring the precomputation is compatible with the bucket method's window size used in MSM.
-- **`ctx`**: The device context specifying the device ID and stream for execution. This context determines where the precomputation is performed (e.g., on a specific GPU).
+- **`msm_size`**: The size of a single MSM, used to determine optimal parameters.
+- **`cfg`**: The MSM configuration parameters.
- **`output_bases`**: The output buffer for the extended bases. Its size must be `points.len() * precompute_factor`. This buffer should be allocated on the device for GPU computations.

#### Returns

@@ -37,22 +35,11 @@ The precomputation process is crucial for optimizing MSM operations, especially
#### Example Usage

```rust
-let device_context = DeviceContext::default_for_device(0); // Use the default device
+let cfg = MSMConfig::default();
let precompute_factor = 4; // Number of points to precompute
let mut extended_bases = HostOrDeviceSlice::cuda_malloc(expected_size).expect("Failed to allocate memory for extended bases");

// Precompute the bases using the specified factor
-precompute_bases(&points, precompute_factor, 0, &device_context, &mut extended_bases)
+precompute_points(&points, msm_size, &cfg, &mut extended_bases)
    .expect("Failed to precompute bases");
```

-### Benchmarks
-
-Benchmarks where performed on a Nvidia RTX 3090Ti.
-
-| Pre-computation factor | bn254 size `2^20` MSM, ms. | bn254 size `2^12` MSM, size `2^10` batch, ms. | bls12-381 size `2^20` MSM, ms. | bls12-381 size `2^12` MSM, size `2^10` batch, ms. |
-| ------------- | ------------- | ------------- | ------------- | ------------- |
-| 1 | 14.1 | 82.8 | 25.5 | 136.7 |
-| 2 | 11.8 | 76.6 | 20.3 | 123.8 |
-| 4 | 10.9 | 73.8 | 18.1 | 117.8 |
-| 8 | 10.6 | 73.7 | 17.2 | 116.0 |
@@ -100,7 +100,7 @@ When performing MSM operations, it's crucial to match the size of the `scalars`

## How do I toggle between the supported algorithms?

-When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle accumulation and `is_big_triangle=false` will activate Bucket accumulation.
+When creating your MSM Config you may state which algorithm you wish to use. `is_big_triangle=true` will activate Large triangle reduction and `is_big_triangle=false` will activate iterative reduction.

```rust
...
@@ -144,6 +144,10 @@ msm::msm(&scalars, &points, &cfg, &mut msm_results).unwrap();

Here is a [reference](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/wrappers/rust/icicle-core/src/msm/mod.rs#L108) to the code which automatically sets the batch size. For more MSM examples have a look [here](https://github.com/ingonyama-zk/icicle/blob/77a7613aa21961030e4e12bf1c9a78a2dadb2518/examples/rust/msm/src/main.rs#L1).

+## Parameters for optimal performance
+
+Please refer to the [primitive description](../primitives/msm#choosing-optimal-parameters)
+
## Support for G2 group

MSM also supports G2 group.

@@ -180,7 +180,7 @@ where

- **`IcicleResult<()>`**: Will return an error if the operation fails.

-### Releaseing the domain
+### Releasing the domain

The `release_domain` function is responsible for releasing the resources associated with a specific domain in the CUDA device context.
@@ -67,6 +67,9 @@ where
        evals: &mut E,
    );

+    // Method to evaluate the polynomial over the roots-of-unity domain for power-of-two sized domain
+    fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, domain_log_size: u64, evals: &mut E);
+
    // Method to retrieve a coefficient at a specific index.
    fn get_coeff(&self, idx: u64) -> Self::Field;

@@ -228,6 +231,11 @@ let f_x = f.eval(&x); // Evaluate f at x
let domain = [one, two, three];
let mut host_evals = vec![ScalarField::zero(); domain.len()];
f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));

+// Evaluate on roots-of-unity-domain
+let domain_log_size = 4;
+let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
+f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
```

### Read coefficients
@@ -166,6 +166,14 @@ const config = {
        additionalLanguages: ['rust', 'go'],
      },
      image: 'img/logo.png',
+     announcementBar: {
+       id: 'announcement', // Any value that will identify this message.
+       content:
+         '<strong>🎉 Read our paper on the Polynomials API in ICICLE v2 by clicking <a target="_blank" rel="noopener noreferrer" href="https://eprint.iacr.org/2024/973">here</a>! 🎉</strong>',
+       backgroundColor: '#ADD8E6', // Light blue background color.
+       textColor: '#000000', // Black text color.
+       isCloseable: true, // Defaults to `true`.
+     },
    }),
};
@@ -1,6 +1,6 @@
# ZKContainer

-We recommend using [ZKContainer](https://ingonyama.com/blog/Immanuel-ZKDC), where we have already preinstalled all the required dependencies, to run Icicle examples.
+We recommend using [ZKContainer](https://www.ingonyama.com/blog/product-announcement-zk-containers), where we have already preinstalled all the required dependencies, to run Icicle examples.
To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).

In each example directory, ZKContainer files are located in a subdirectory `.devcontainer`.
23 examples/c++/best-practice-ntt/CMakeLists.txt (new file)
@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
  set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(example LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")

add_executable(
  example
  example.cu
)
target_include_directories(example PRIVATE "../../../icicle/include")
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
33 examples/c++/best-practice-ntt/README.md (new file)
@@ -0,0 +1,33 @@
# ICICLE best practices: Concurrent Data Transfer and NTT Computation

The [Number Theoretic Transform (NTT)](https://dev.ingonyama.com/icicle/primitives/ntt) is an integral component of many cryptographic algorithms, such as polynomial multiplication in Zero Knowledge Proofs. The performance bottleneck of NTT on GPUs is the data transfer between the host (CPU) and the device (GPU). In a typical NVIDIA GPU this transfer dominates the total NTT execution time.

## Key-Takeaway

When you have to run several NTTs, consider concurrent data download, upload, and computation to improve data bus (PCIe) and GPU utilization, and get a better total execution time.

Typically, you concurrently

1. Download the output of the previous NTT back to the host
2. Upload the input for the next NTT to the device
3. Run the current NTT

> [!NOTE]
> This approach requires two on-device memory vectors, decreasing the maximum size of NTT by 2x.

## Best-Practices

1. Use three separate CUDA streams for Download, Upload, and Compute operations.
2. Use pinned (page-locked) memory on the host to speed up data bus transfers. Calling `cudaHostAlloc` allocates pinned memory.
3. Use in-place NTT to save on device memory.
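A minimal sketch of practices 1 and 2 (the complete, working version is `example.cu` in this directory):

```cpp
#include <cuda_runtime.h>

int main()
{
  // Practice 1: dedicated streams for upload, compute, and download.
  cudaStream_t stream_h2d, stream_compute, stream_d2h;
  cudaStreamCreate(&stream_h2d);
  cudaStreamCreate(&stream_compute);
  cudaStreamCreate(&stream_d2h);

  // Practice 2: pinned (page-locked) host memory for faster PCIe transfers.
  float* pinned = nullptr;
  cudaHostAlloc((void**)&pinned, 1024 * sizeof(float), cudaHostAllocDefault);

  // ... enqueue H2D copies on stream_h2d, NTT kernels on stream_compute,
  // and D2H copies on stream_d2h, synchronizing between iterations ...

  cudaFreeHost(pinned);
  cudaStreamDestroy(stream_h2d);
  cudaStreamDestroy(stream_compute);
  cudaStreamDestroy(stream_d2h);
  return 0;
}
```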

## Running the example

To change the default curve BN254, edit `compile.sh` and `CMakeLists.txt`.

```sh
./compile.sh
./run.sh
```

To compare with the ICICLE baseline (i.e. non-concurrent) NTT, you can run [this example](../ntt/README.md).
16 examples/c++/best-practice-ntt/compile.sh (new executable file)
@@ -0,0 +1,16 @@
#!/bin/bash

# Exit immediately on error
set -e

mkdir -p build/example
mkdir -p build/icicle

# Configure and build Icicle
cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=OFF -DMSM=OFF
cmake --build build/icicle

# Configure and build the example application
cmake -S . -B build/example
cmake --build build/example
149 examples/c++/best-practice-ntt/example.cu (new file)
@@ -0,0 +1,149 @@
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
|
||||
#include "curves/params/bn254.cuh"
|
||||
#include "api/bn254.h"
|
||||
using namespace bn254;
|
||||
using namespace ntt;
|
||||
|
||||
const std::string curve = "BN254";
|
||||
|
||||
typedef scalar_t S;
|
||||
typedef scalar_t E;
|
||||
|
||||
const unsigned max_log_ntt_size = 27;
|
||||
|
||||
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E* elements)
|
||||
{
|
||||
for (unsigned i = 0; i < ntt_size * nof_ntts; i++) {
|
||||
elements[i] = E::from(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

int main(int argc, char** argv)
{
  cudaDeviceReset();
  cudaDeviceProp deviceProperties;
  int deviceId = 0;
  cudaGetDeviceProperties(&deviceProperties, deviceId);
  std::string gpu_full_name = deviceProperties.name;
  std::cout << gpu_full_name << std::endl;
  std::string gpu_name = gpu_full_name;

  std::cout << "Curve: " << curve << std::endl;

  S basic_root = S::omega(max_log_ntt_size);

  // change these parameters to match the desired NTT size and batch size
  const unsigned log_ntt_size = 22;
  const unsigned nof_ntts = 16;

  std::cout << "log NTT size: " << log_ntt_size << std::endl;
  const unsigned ntt_size = 1 << log_ntt_size;

  std::cout << "Batch size: " << nof_ntts << std::endl;

  // Create separate CUDA streams for overlapping data transfers and kernel execution.
  cudaStream_t stream_compute, stream_h2d, stream_d2h;
  cudaStreamCreate(&stream_compute);
  cudaStreamCreate(&stream_h2d);
  cudaStreamCreate(&stream_d2h);

  // Create device context for NTT computation
  auto ctx_compute = device_context::DeviceContext{
    stream_compute, // stream
    0,              // device_id
    0,              // mempool
  };

  // Initialize NTT domain and configuration
  bn254_initialize_domain(&basic_root, ctx_compute, /* fast twiddles */ true);
  NTTConfig<S> config_compute = default_ntt_config<S>(ctx_compute);
  config_compute.ntt_algorithm = NttAlgorithm::MixedRadix;
  config_compute.batch_size = nof_ntts;
  config_compute.are_inputs_on_device = true;
  config_compute.are_outputs_on_device = true;
  config_compute.is_async = true;

  std::cout << "Concurrent Download, Upload, and Compute In-place NTT" << std::endl;
  int nof_blocks = 32;
  std::cout << "Number of blocks: " << nof_blocks << std::endl;
  int block_size = ntt_size * nof_ntts / nof_blocks;

  // on-host pinned data
  E* h_inp[2];
  E* h_out[2];
  for (int i = 0; i < 2; i++) {
    cudaHostAlloc((void**)&h_inp[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_out[i], sizeof(E) * ntt_size * nof_ntts, cudaHostAllocDefault);
  }

  // on-device in-place data
  // we need two on-device vectors to overlap data transfers with NTT kernel execution
  E* d_vec[2];
  for (int i = 0; i < 2; i++) {
    cudaMalloc((void**)&d_vec[i], sizeof(E) * ntt_size * nof_ntts);
  }

  // initialize input data
  initialize_input(ntt_size, nof_ntts, h_inp[0]);
  initialize_input(ntt_size, nof_ntts, h_inp[1]);

  cudaEvent_t compute_start, compute_stop;
  cudaEventCreate(&compute_start);
  cudaEventCreate(&compute_stop);

  for (int run = 0; run < 10; run++) {
    int vec_compute = run % 2;
    int vec_transfer = (run + 1) % 2;
    std::cout << "Run: " << run << std::endl;
    std::cout << "Compute Vector: " << vec_compute << std::endl;
    std::cout << "Transfer Vector: " << vec_transfer << std::endl;
    START_TIMER(inplace);
    cudaEventRecord(compute_start, stream_compute);
    bn254_ntt_cuda(d_vec[vec_compute], ntt_size, NTTDir::kForward, config_compute, d_vec[vec_compute]);
    cudaEventRecord(compute_stop, stream_compute);
    // the upload to the device must lag the download from the device by one block to preserve write-after-read ordering
    for (int i = 0; i <= nof_blocks; i++) {
      if (i < nof_blocks) {
        cudaMemcpyAsync(
          &h_out[vec_transfer][i * block_size], &d_vec[vec_transfer][i * block_size], sizeof(E) * block_size,
          cudaMemcpyDeviceToHost, stream_d2h);
      }
      if (i > 0) {
        cudaMemcpyAsync(
          &d_vec[vec_transfer][(i - 1) * block_size], &h_inp[vec_transfer][(i - 1) * block_size],
          sizeof(E) * block_size, cudaMemcpyHostToDevice, stream_h2d);
      }
      // synchronize upload and download at the end of the block to ensure data integrity
      cudaStreamSynchronize(stream_d2h);
      cudaStreamSynchronize(stream_h2d);
    }
    // synchronize compute stream with the end of the computation
    cudaEventSynchronize(compute_stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, compute_start, compute_stop);
    END_TIMER(inplace, "Concurrent In-Place NTT");
    std::cout << "NTT time: " << milliseconds << " ms" << std::endl;
  }

  // Clean-up
  for (int i = 0; i < 2; i++) {
    cudaFree(d_vec[i]);
    cudaFreeHost(h_inp[i]);
    cudaFreeHost(h_out[i]);
  }
  cudaEventDestroy(compute_start);
  cudaEventDestroy(compute_stop);
  cudaStreamDestroy(stream_compute);
  cudaStreamDestroy(stream_d2h);
  cudaStreamDestroy(stream_h2d);
  return 0;
}
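The example above issues every CUDA runtime call unchecked, which makes transfer/compute ordering bugs hard to spot. A minimal error-check wrapper sketch (the `CHECK_CUDA` name is hypothetical and not part of the example; it assumes only the standard CUDA runtime API):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap any cudaError_t-returning call; abort with file/line on failure.
#define CHECK_CUDA(call)                                                \
  do {                                                                  \
    cudaError_t err_ = (call);                                          \
    if (err_ != cudaSuccess) {                                          \
      fprintf(stderr, "CUDA error '%s' at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);            \
      exit(EXIT_FAILURE);                                               \
    }                                                                   \
  } while (0)

// usage, e.g.: CHECK_CUDA(cudaStreamCreate(&stream_compute));
```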
2
examples/c++/best-practice-ntt/run.sh
Executable file
@@ -0,0 +1,2 @@
#!/bin/bash
./build/example/example
@@ -16,7 +16,7 @@ int main(int argc, char* argv[])
  int N = batch_size * msm_size;

  std::cout << "Part I: use G1 points" << std::endl;

  std::cout << "Generating random inputs on-host" << std::endl;
  scalar_t* scalars = new scalar_t[N];
  affine_t* points = new affine_t[N];
@@ -43,7 +43,7 @@ int main(int argc, char* argv[])
    false, // is_async
  };
  config.batch_size = batch_size;

  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
  cudaStream_t stream = config.ctx.stream;
  // Execute the MSM kernel
@@ -9,137 +9,148 @@
using namespace poseidon;
using namespace bn254;

void checkCudaError(cudaError_t error) {
  if (error != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    // Handle the error, e.g., exit the program or throw an exception.
  }
void checkCudaError(cudaError_t error)
{
  if (error != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    // Handle the error, e.g., exit the program or throw an exception.
  }
}

// these global constants go into template calls
const int size_col = 11;

// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  // CHK_IF_RETURN(); I can't use it in a standard thread function
  PoseidonConfig column_config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };
  cudaError_t err = bn254_poseidon_hash_cuda(layers, column_hashes, (size_t) size_partition, size_col, *constants, column_config);
  checkCudaError(err);
void threadPoseidon(
  device_context::DeviceContext ctx,
  unsigned size_partition,
  scalar_t* layers,
  scalar_t* column_hashes,
  PoseidonConstants<scalar_t>* constants)
{
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  // CHK_IF_RETURN(); I can't use it in a standard thread function
  PoseidonConfig column_config = {
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };
  cudaError_t err =
    bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
  checkCudaError(err);
}

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

#define CHECK_ALLOC(ptr) \
  if ((ptr) == nullptr) { \
    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
    exit(EXIT_FAILURE); \
  }

#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
  std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
  exit(EXIT_FAILURE); \
}

int main() {
  const unsigned size_row = (1<<30);
  const unsigned nof_partitions = 64;
  const unsigned size_partition = size_row / nof_partitions;
  // layers is allocated only for one partition, need to reuse for different partitions
  const uint32_t size_layers = size_col * size_partition;

  nvmlInit();
  unsigned int deviceCount;
  nvmlDeviceGetCount(&deviceCount);
  std::cout << "Available GPUs: " << deviceCount << std::endl;

  for (unsigned int i = 0; i < deviceCount; ++i) {
    nvmlDevice_t device;
    nvmlMemory_t memory;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlDeviceGetHandleByIndex(i, &device);
    nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    nvmlDeviceGetMemoryInfo(device, &memory);
    std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
  }

  const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
  std::cout << "Required Memory (MiB) " << memory_partition << std::endl;

  //===============================================================================
  // Key: multiple devices are supported by device context
  //===============================================================================

  device_context::DeviceContext ctx0 = device_context::get_default_device_context();
  ctx0.device_id=0;
  device_context::DeviceContext ctx1 = device_context::get_default_device_context();
  ctx1.device_id=1;

  std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
  scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
  CHECK_ALLOC(layers0);
  scalar_t s = scalar_t::zero();
  for (unsigned i = 0; i < size_col*size_partition ; i++) {
    layers0[i] = s;
    s = s + scalar_t::one();
  }
  scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
  CHECK_ALLOC(layers1);
  s = scalar_t::zero() + scalar_t::one();
  for (unsigned i = 0; i < size_col*size_partition ; i++) {
    layers1[i] = s;
    s = s + scalar_t::one();
  }

  scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash0);
  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash1);

  PoseidonConstants<scalar_t> column_constants0, column_constants1;
  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);

  std::cout << "Parallel execution of Poseidon threads" << std::endl;
  START_TIMER(parallel);
  std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
  std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);

  // Wait for the threads to finish
  thread0.join();
  thread1.join();
  END_TIMER(parallel,"2 GPUs");
  std::cout << "Output Data from Thread 0: ";
  std::cout << column_hash0[0] << std::endl;
  std::cout << "Output Data from Thread 1: ";
  std::cout << column_hash1[0] << std::endl;

  std::cout << "Sequential execution of Poseidon threads" << std::endl;
  START_TIMER(sequential);
  std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
  thread2.join();
  std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
  thread3.join();
  END_TIMER(sequential,"1 GPU");
  std::cout << "Output Data from Thread 2: ";
  std::cout << column_hash0[0] << std::endl;
  std::cout << "Output Data from Thread 3: ";
  std::cout << column_hash1[0] << std::endl;

  nvmlShutdown();
  return 0;
int main()
{
  const unsigned size_row = (1 << 30);
  const unsigned nof_partitions = 64;
  const unsigned size_partition = size_row / nof_partitions;
  // layers is allocated only for one partition, need to reuse for different partitions
  const uint32_t size_layers = size_col * size_partition;

  nvmlInit();
  unsigned int deviceCount;
  nvmlDeviceGetCount(&deviceCount);
  std::cout << "Available GPUs: " << deviceCount << std::endl;

  for (unsigned int i = 0; i < deviceCount; ++i) {
    nvmlDevice_t device;
    nvmlMemory_t memory;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlDeviceGetHandleByIndex(i, &device);
    nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    nvmlDeviceGetMemoryInfo(device, &memory);
    std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total / 1024 / 1024
              << "/" << memory.free / 1024 / 1024 << std::endl;
  }

  const unsigned memory_partition = sizeof(scalar_t) * (size_col + 1) * size_partition / 1024 / 1024;
  std::cout << "Required Memory (MiB) " << memory_partition << std::endl;

  //===============================================================================
  // Key: multiple devices are supported by device context
  //===============================================================================

  device_context::DeviceContext ctx0 = device_context::get_default_device_context();
  ctx0.device_id = 0;
  device_context::DeviceContext ctx1 = device_context::get_default_device_context();
  ctx1.device_id = 1;

  std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
  scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
  CHECK_ALLOC(layers0);
  scalar_t s = scalar_t::zero();
  for (unsigned i = 0; i < size_col * size_partition; i++) {
    layers0[i] = s;
    s = s + scalar_t::one();
  }
  scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
  CHECK_ALLOC(layers1);
  s = scalar_t::zero() + scalar_t::one();
  for (unsigned i = 0; i < size_col * size_partition; i++) {
    layers1[i] = s;
    s = s + scalar_t::one();
  }

  scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash0);
  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash1);

  PoseidonConstants<scalar_t> column_constants0, column_constants1;
  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);

  std::cout << "Parallel execution of Poseidon threads" << std::endl;
  START_TIMER(parallel);
  std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
  std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);

  // Wait for the threads to finish
  thread0.join();
  thread1.join();
  END_TIMER(parallel, "2 GPUs");
  std::cout << "Output Data from Thread 0: ";
  std::cout << column_hash0[0] << std::endl;
  std::cout << "Output Data from Thread 1: ";
  std::cout << column_hash1[0] << std::endl;

  std::cout << "Sequential execution of Poseidon threads" << std::endl;
  START_TIMER(sequential);
  std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
  thread2.join();
  std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
  thread3.join();
  END_TIMER(sequential, "1 GPU");
  std::cout << "Output Data from Thread 2: ";
  std::cout << column_hash0[0] << std::endl;
  std::cout << "Output Data from Thread 3: ";
  std::cout << column_hash1[0] << std::endl;

  nvmlShutdown();
  return 0;
}
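The key pattern in this example is one `std::thread` per GPU, with `cudaSetDevice` called inside each thread before any CUDA work, since device selection is per-thread state. A minimal sketch of that pattern in isolation (the `worker` body is a placeholder, not ICICLE API):

```cpp
#include <cuda_runtime.h>
#include <thread>
#include <vector>

// Each thread binds its own device; CUDA device selection is per-thread state.
void worker(int device_id)
{
  cudaSetDevice(device_id);
  // ... allocate, launch kernels, and synchronize on this device ...
}

int main()
{
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  std::vector<std::thread> threads;
  for (int d = 0; d < device_count; d++)
    threads.emplace_back(worker, d);
  for (auto& t : threads)
    t.join();
  return 0;
}
```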

@@ -17,7 +17,7 @@ int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_cont
  config.is_a_on_device = true;
  config.is_b_on_device = true;
  config.is_result_on_device = true;
  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
  if (err != cudaSuccess) {
    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
    return 0;
@@ -100,7 +100,7 @@ int main(int argc, char** argv)
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  std::cout << "Starting warm-up" << std::endl;
  // Warm-up loop
  for (int i = 0; i < repetitions; i++) {
@@ -151,7 +151,7 @@ int main(int argc, char** argv)
  // validate multiplication here...

  // clean up and exit
  free(host_in1);
  free(host_in1);
  free(host_in2);
  free(host_out);
  cudaFree(device_in1);

@@ -60,8 +60,8 @@ int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E* element

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

#define END_TIMER(timer, msg) \
  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

int main(int argc, char* argv[])
{
@@ -89,16 +89,16 @@ int main(int argc, char* argv[])
  bn254_initialize_domain(&basic_root, ctx, true);
  // Create an NTTConfig instance
  NTTConfig<S> config = default_ntt_config<S>();
  config.ntt_algorithm = NttAlgorithm::MixedRadix;
  config.ntt_algorithm = NttAlgorithm::MixedRadix;
  config.batch_size = nof_ntts;
  START_TIMER(MixedRadix);
  cudaError_t err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
  END_TIMER(MixedRadix, "MixedRadix NTT");

  std::cout << "Validating output" << std::endl;
  validate_output(ntt_size, nof_ntts, output);

  config.ntt_algorithm = NttAlgorithm::Radix2;
  config.ntt_algorithm = NttAlgorithm::Radix2;
  START_TIMER(Radix2);
  err = bn254_ntt_cuda(input, ntt_size, NTTDir::kForward, config, output);
  END_TIMER(Radix2, "Radix2 NTT");
@@ -11,49 +11,47 @@ using namespace bn254;
typedef point_field_t T;

// modular power
T modPow(T base, T exp) {
T modPow(T base, T exp)
{
  T r = T::one();
  T b = base;
  T e = exp;
  while (e != T::zero()) {
    // If exp is odd, multiply the base with result
    if (T::is_odd(e)) {
      r = r * b;
    }
    // Now exp must be even, divide it by 2
    e =T::div2(e);
    b = b * b;
    // If exp is odd, multiply the base with result
    if (T::is_odd(e)) { r = r * b; }
    // Now exp must be even, divide it by 2
    e = T::div2(e);
    b = b * b;
  }
  return r;
}

// Check if y2 is a quadratic residue using Euler's Criterion
bool quadratic_residue(T y2) {
  return modPow(y2, T::div2(T::zero() - T::one())) == T::one();
}
bool quadratic_residue(T y2) { return modPow(y2, T::div2(T::zero() - T::one())) == T::one(); }
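For reference, `quadratic_residue` is a direct application of Euler's criterion: for an odd prime $p$ and $a \not\equiv 0 \pmod p$,

$$a^{(p-1)/2} \equiv \begin{cases} \;\;\,1 \pmod p & \text{if } a \text{ is a quadratic residue mod } p, \\ -1 \pmod p & \text{otherwise.} \end{cases}$$

In the code, `T::zero() - T::one()` is $p - 1$ computed in the field, so `T::div2(...)` yields the exponent $(p-1)/2$.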

// modular square root adapted from:
// https://github.com/ShahjalalShohag/code-library/blob/main/Number%20Theory/Tonelli%20Shanks%20Algorithm.cpp
bool mySQRT(T a, T *result) {
bool mySQRT(T a, T* result)
{
  if (a == T::zero()) {
    *result = T::zero();
    return true;
  }
  if (modPow(a, T::div2(T::zero() - T::one())) != T::one() ) {
  if (modPow(a, T::div2(T::zero() - T::one())) != T::one()) {
    return false; // solution does not exist
  }
  // TODO: consider special cases
  // if (p % 4 == 3) return power(a, (p + 1) / 4, p);
  T s = T::zero() - T::one(); // p - 1,
  T n = T::one() + T::one(); //2;
  T r = T::zero();
  // if (p % 4 == 3) return power(a, (p + 1) / 4, p);
  T s = T::zero() - T::one(); // p - 1,
  T n = T::one() + T::one(); // 2;
  T r = T::zero();
  T m;
  while (T::is_even(s)) {
    r = r + T::one();
    s = T::div2(s); //s /= 2;
    s = T::div2(s); // s /= 2;
  }
  // find a non-square mod p
  while (modPow(n, T::div2((T::zero() - T::one())) ) != T::zero() - T::one()) {
  while (modPow(n, T::div2((T::zero() - T::one()))) != T::zero() - T::one()) {
    n = n + T::one();
  }
  T x = modPow(a, T::div2(s + T::one()));
@@ -61,83 +59,86 @@ bool mySQRT(T a, T *result) {
  T g = modPow(n, s);
  for (;; r = m) {
    T t = b;
    for (m = T::zero(); T::lt(m,r) /* m < r*/ && t != T::one(); m = m + T::one()) t = t * t;
    if (m == T::zero() ) {
    for (m = T::zero(); T::lt(m, r) /* m < r*/ && t != T::one(); m = m + T::one())
      t = t * t;
    if (m == T::zero()) {
      *result = x;
      return true;
    }
    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()) );
    g = gs * gs ;
    x = x * gs ;
    b = b * g ;
    T gs = modPow(g, modPow(T::one() + T::one(), r - m - T::one()));
    g = gs * gs;
    x = x * gs;
    b = b * g;
  }
}
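The loop above follows the standard Tonelli–Shanks setup: factor out the even part of the group order, pick a non-residue $n$, and then repair $x$ until it squares to $a$. In symbols,

$$p - 1 = s \cdot 2^{r}, \quad s \text{ odd}, \qquad x = a^{(s+1)/2}, \quad b = a^{s}, \quad g = n^{s},$$

and each iteration preserves the invariant $x^2 = a \cdot b$ while strictly decreasing the 2-adic order of $b$, so the loop terminates with $b = 1$ and $x^2 = a$.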

void point_near_x(T x, affine_t *point) {
  const T wb = T { weierstrass_b };
void point_near_x(T x, affine_t* point)
{
  const T wb = T{weierstrass_b};
  T y2;
  while (y2 = x*x*x + wb, quadratic_residue(y2) == false)
  {
  while (y2 = x * x * x + wb, quadratic_residue(y2) == false) {
    x = x + T::one();
  };
  T y;
  bool found = mySQRT(y2, &y);
  assert(y*y == y2);
  assert(y * y == y2);
  point->x = x;
  point->y = y;
}
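In symbols, `point_near_x` searches upward from the given $x$ on the short Weierstrass curve used here (with $a = 0$, as for BN254):

$$y^2 = x^3 + b,$$

incrementing $x$ until $x^3 + b$ is a quadratic residue and then taking $y = \sqrt{x^3 + b}$ via `mySQRT`, so the returned point lies on the curve at the first suitable $x$ at or above the input.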

static int seed = 0;
static HOST_INLINE T rand_host_seed()
{
  std::mt19937_64 generator(seed++);
  std::uniform_int_distribution<unsigned> distribution;

  T value;
  for (unsigned i = 0; i < T::TLC-1 ; i++)
  {
  std::mt19937_64 generator(seed++);
  std::uniform_int_distribution<unsigned> distribution;

  T value;
  for (unsigned i = 0; i < T::TLC - 1; i++)
    // TODO: use the full range of limbs: for (unsigned i = 0; i < T::TLC ; i++)
    value.limbs_storage.limbs[i] = distribution(generator);
  // while (lt(Field{get_modulus()}, value))
  //   value = value - Field{get_modulus()};
  return value;
}
    value.limbs_storage.limbs[i] = distribution(generator);
  // while (lt(Field{get_modulus()}, value))
  //   value = value - Field{get_modulus()};
  return value;
}

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
#define END_TIMER(timer, msg) \
  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());

int main(int argc, char** argv)
{
  const unsigned N = pow(2, 10);
  std::cout << "Commitment vector size: " << N << "+1 for salt (a.k.a blinding factor)" << std::endl;
  T* xs = new T[N+1];

  T* xs = new T[N + 1];

  std::cout << "Generating random points transparently using publicly chosen seed" << std::endl;
  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment" << std::endl;
  std::cout << "Public seed prevents committer from knowing the discrete logs of points used in the commitment"
            << std::endl;
  seed = 1234;
  std::cout << "Using seed: " << seed << std::endl;
  std::cout << "Generating random field values" << std::endl;
  START_TIMER(gen);

  for (unsigned i = 0; i < N; i++) {
    xs[i] = rand_host_seed();
  }
  END_TIMER(gen, "Time to generate field values");
  std::cout << "xs[0]: " << xs[0] << std::endl;
  std::cout << "xs[1]: " << xs[1] << std::endl;

  std::cout << "xs[0]: " << xs[0] << std::endl;
  std::cout << "xs[1]: " << xs[1] << std::endl;

  // affine_t points[N];
  affine_t* points = new affine_t[N+1];
  affine_t* points = new affine_t[N + 1];
  std::cout << "Generating points near random field values" << std::endl;
  START_TIMER(points);
  for (unsigned i = 0; i < N+1; i++) {
  for (unsigned i = 0; i < N + 1; i++) {
    point_near_x(xs[i], &points[i]);
  }
  END_TIMER(points, "Time to generate points");

  std::cout << "Generating commitment vector" << std::endl;
  projective_t result;
  scalar_t* scalars = new scalar_t[N+1];
  scalar_t* scalars = new scalar_t[N + 1];
  scalar_t::rand_host_many(scalars, N);

  std::cout << "Generating salt" << std::endl;
@@ -146,7 +147,7 @@ int main(int argc, char** argv)
  std::cout << "Executing MSM" << std::endl;
  auto config = msm::default_msm_config();
  START_TIMER(msm);
  bn254_msm_cuda(scalars, points, N+1, config, &result);
  bn254_msm_cuda(scalars, points, N + 1, config, &result);
  END_TIMER(msm, "Time to execute MSM");

  std::cout << "Computed commitment: " << result << std::endl;
@@ -23,5 +23,8 @@ set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(example PRIVATE "../../../icicle/include")

# can link to another curve/field by changing the following lib and FIELD_ID
target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
target_link_libraries(example
  ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a
  ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a
)
target_compile_definitions(example PUBLIC FIELD_ID BN254)
@@ -1,13 +1,16 @@
#include <iostream>

#include <cassert>
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
#include "poseidon/tree/merkle.cuh"
#include "api/bn254.h"
#include <chrono>

// using namespace field_config;
using namespace polynomials;
using namespace merkle;
using namespace bn254;

// define the polynomial type
typedef Polynomial<scalar_t> Polynomial_t;
@@ -21,6 +24,27 @@ const auto four = scalar_t::from(4);
const auto five = scalar_t::from(5);
const auto minus_one = zero - one;

static std::unique_ptr<scalar_t[]> generate_pows(scalar_t tau, uint32_t size)
{
  auto vec = std::make_unique<scalar_t[]>(size);
  vec[0] = scalar_t::one();
  for (size_t i = 1; i < size; ++i) {
    vec[i] = vec[i - 1] * tau;
  }
  return vec;
}

static std::unique_ptr<affine_t[]> generate_SRS(uint32_t size)
{
  auto secret_scalar = scalar_t::rand_host();
  auto gen = projective_t::generator();
  auto pows_of_tau = generate_pows(secret_scalar, size);
  auto SRS = std::make_unique<affine_t[]>(size);
  for (size_t i = 0; i < size; ++i) {
    SRS[i] = projective_t::to_affine(pows_of_tau[i] * gen);
  }
  return SRS;
}
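In symbols, `generate_SRS` builds a mock powers-of-tau string over the G1 generator $G$ (the secret $\tau$ never leaves the host, so this setup is for testing only):

$$\mathrm{SRS} = \left(G,\; \tau G,\; \tau^2 G,\; \ldots,\; \tau^{n-1} G\right),$$

and a polynomial $f(X) = \sum_i f_i X^i$ is then committed below by the MSM

$$[f]_1 = \sum_{i} f_i \cdot \tau^i G = f(\tau) \cdot G.$$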

void example_evaluate()
{
  std::cout << std::endl << "Example: Polynomial evaluation on random value" << std::endl;
@@ -298,6 +322,102 @@ void example_device_memory_view()
  ntt::ntt(d_coeffs.get(), size, ntt::NTTDir::kForward, ntt_config, coset_evals.get());
}

void example_commit_with_device_memory_view()
{
  // declare time vars
  std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
  std::chrono::milliseconds duration;

  std::cout << std::endl << "Example: a) commit with Polynomial views [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1" << std::endl;
  std::cout << "Example: b) commit with Polynomial views [(f1+f2)^2 - (f1-f2)^2]_1 = [4 f1*f2]_1" << std::endl;
  int N = 1025;

  // generate group elements string of length N: (1, beta, beta^2, ..., beta^{N-1}). g
  std::cout << "Setup: Generating mock SRS" << std::endl;
  start = std::chrono::high_resolution_clock::now();
  auto SRS = generate_SRS(2 * N);
  // Allocate memory on device (points)
  affine_t* points_d;
  cudaMalloc(&points_d, sizeof(affine_t) * 2 * N);
  // copy SRS to device (could have generated on device, but gives an indicator)
  cudaMemcpy(points_d, SRS.get(), sizeof(affine_t) * 2 * N, cudaMemcpyHostToDevice);
  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
  std::cout << "Setup: SRS of length " << N << " generated and loaded to device. Took: " << duration.count() << " milliseconds" << std::endl;

  // goal:
  // test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1
  // test commitment equality [(f1+f2)^2 - (f1-f2)^2]_1 = [4 f1*f2]_1
  // note: using the polynomial API to generate scalars: already on device.
  std::cout << "Setup: Generating polys (on device) f1,f2 of log degree " << log2(N - 1) << std::endl;
  start = std::chrono::high_resolution_clock::now();
  auto f1 = randomize_polynomial(N);
  auto f2 = randomize_polynomial(N);
  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
  std::cout << "Setup: Gen poly done. Took: " << duration.count() << " milliseconds" << std::endl;

  // deg 2N constraints (f1+f2)^2 + (f1-f2)^2 = 2 (f1^2 + f2^2)
  std::cout << "Computing constraints..start " << std::endl;
  start = std::chrono::high_resolution_clock::now();
  auto L1 = (f1 + f2) * (f1 + f2) + (f1 - f2) * (f1 - f2);
  auto R1 = scalar_t::from(2) * (f1 * f1 + f2 * f2);
  // deg 2N constraints (f1+f2)^2 - (f1-f2)^2 = 4 f1*f2
  auto L2 = (f1 + f2) * (f1 + f2) - (f1 - f2) * (f1 - f2);
  auto R2 = scalar_t::from(4) * f1 * f2;
  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
  std::cout << "Computing constraints..done. Took: " << duration.count() << " milliseconds" << std::endl;

  // extract coeff using coeff view
  auto [viewL1, sizeL1, device_idL1] = L1.get_coefficients_view();
  auto [viewL2, sizeL2, device_idL2] = L2.get_coefficients_view();
  auto [viewR1, sizeR1, device_idR1] = R1.get_coefficients_view();
  auto [viewR2, sizeR2, device_idR2] = R2.get_coefficients_view();

  std::cout << "Computing Commitments with poly view" << std::endl;
  start = std::chrono::high_resolution_clock::now();
  msm::MSMConfig config = msm::default_msm_config();
  config.are_points_on_device = true;
  config.are_scalars_on_device = true;

  // host vars (for result)
  projective_t hL1{}, hL2{}, hR1{}, hR2{};

  // straightforward msm bn254 api: no batching
  bn254_msm_cuda(viewL1.get(), points_d, N, config, &hL1);
  bn254_msm_cuda(viewL2.get(), points_d, N, config, &hL2);
  bn254_msm_cuda(viewR1.get(), points_d, N, config, &hR1);
  bn254_msm_cuda(viewR2.get(), points_d, N, config, &hR2);

  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
  std::cout << "Commitments done. Took: " << duration.count() << " milliseconds" << std::endl;

  // sanity checks
  auto affL1 = projective_t::to_affine(hL1);
  auto affR1 = projective_t::to_affine(hR1);

  auto affL2 = projective_t::to_affine(hL2);
  auto affR2 = projective_t::to_affine(hR2);

  // test commitment equality [(f1+f2)^2 + (f1-f2)^2]_1 = [2 (f1^2 + f2^2)]_1
  assert(affL1.x == affR1.x && affL1.y == affR1.y);
  std::cout << "commitment [(f1+f2)^2 + (f1-f2)^2]_1:" << std::endl;
  std::cout << "[x: " << affL1.x << ", y: " << affL1.y << "]" << std::endl;
  std::cout << "commitment [2 (f1^2 + f2^2)]_1:" << std::endl;
  std::cout << "[x: " << affR1.x << ", y: " << affR1.y << "]" << std::endl;

  assert(affL2.x == affR2.x && affL2.y == affR2.y);
  std::cout << "commitment [(f1+f2)^2 - (f1-f2)^2]_1:" << std::endl;
  std::cout << "[x: " << affL2.x << ", y: " << affL2.y << "]" << std::endl;
  std::cout << "commitment [4 f1*f2]_1:" << std::endl;
  std::cout << "[x: " << affR2.x << ", y: " << affR2.y << "]" << std::endl;
}
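The two equalities asserted above reduce, by linearity of the commitment, to polynomial identities that hold after expanding the squares:

$$(f_1 + f_2)^2 + (f_1 - f_2)^2 = 2\,(f_1^2 + f_2^2), \qquad (f_1 + f_2)^2 - (f_1 - f_2)^2 = 4\, f_1 f_2,$$

so $[L_1]_1 = [R_1]_1$ and $[L_2]_1 = [R_2]_1$ whenever the MSMs are taken over the same SRS prefix. (This is also why the banner printed at the start of the function uses the factor 2 on the right-hand side of equality a.)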

int main(int argc, char** argv)
{
  // Initialize NTT. TODO: can we hide this in the library?
@@ -324,6 +444,7 @@ int main(int argc, char** argv)
  example_even_odd();
  example_slice();
  example_device_memory_view();
  example_commit_with_device_memory_view();

  return 0;
}
@@ -82,10 +82,10 @@ int main(int argc, char** argv)
  CHK_IF_RETURN(cudaMallocAsync(&MulGpu, sizeof(test_data) * NTT_SIZE, ntt_config.ctx.stream));
  vec_ops::VecOpsConfig config{
    ntt_config.ctx,
    true, // is_a_on_device
    true, // is_b_on_device
    true, // is_result_on_device
    false // is_async
    true,  // is_a_on_device
    true,  // is_b_on_device
    true,  // is_result_on_device
    false  // is_async
  };
  CHK_IF_RETURN(bn254_mul_cuda(GpuA, GpuB, NTT_SIZE, config, MulGpu));
@@ -14,12 +14,13 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level

// We assume the tree has leaves already set, compute all other levels
void build_tree(
  const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t> * constants, PoseidonConfig config)
  const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
{
  for (uint32_t level = tree_height - 1; level > 0; level--) {
    const uint32_t next_level = level - 1;
    const uint32_t next_level_width = 1 << next_level;
    bn254_poseidon_hash_cuda(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
    bn254_poseidon_hash_cuda(
      &tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
  }
}
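`build_tree` walks the levels from the leaves toward the root, hashing pairs of children into their parent. Assuming the usual implicit binary-heap layout that the (truncated) `tree_index` helper suggests, level $\ell$ (root at $\ell = 0$) holds $2^{\ell}$ nodes starting at

$$\mathrm{index}(\ell, k) = 2^{\ell} - 1 + k, \qquad 0 \le k < 2^{\ell},$$

so each pass hashes the $2^{\ell}$ nodes of level $\ell$ into the $2^{\ell - 1}$ nodes of level $\ell - 1$.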

@@ -37,11 +38,7 @@ uint32_t query_membership(scalar_t query, scalar_t* tree, const uint32_t tree_he
}

void generate_proof(
  uint32_t position,
  scalar_t* tree,
  const uint32_t tree_height,
  uint32_t* proof_lr,
  scalar_t* proof_hash)
  uint32_t position, scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, scalar_t* proof_hash)
{
  uint32_t level_index = position;
  for (uint32_t level = tree_height - 1; level > 0; level--) {
@@ -68,7 +65,7 @@ uint32_t validate_proof(
  const uint32_t tree_height,
  const uint32_t* proof_lr,
  const scalar_t* proof_hash,
  PoseidonConstants<scalar_t> * constants,
  PoseidonConstants<scalar_t>* constants,
  PoseidonConfig config)
{
  scalar_t hashes_in[2], hash_out[1], level_hash;
@@ -114,13 +111,13 @@ int main(int argc, char* argv[])
  std::cout << "Hashing blocks into tree leaves..." << std::endl;
  PoseidonConstants<scalar_t> constants;
  bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
  PoseidonConfig config = default_poseidon_config(data_arity+1);
  PoseidonConfig config = default_poseidon_config(data_arity + 1);
  bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);

  std::cout << "3. Building Merkle tree" << std::endl;
  PoseidonConstants<scalar_t> tree_constants;
  bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
  PoseidonConfig tree_config = default_poseidon_config(tree_arity+1);
  PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
  build_tree(tree_height, tree, &tree_constants, tree_config);

  std::cout << "4. Generate membership proof" << std::endl;
@@ -142,7 +139,7 @@ int main(int argc, char* argv[])
  std::cout << "6. Tamper the hash" << std::endl;
  const scalar_t tampered_hash = hash + scalar_t::one();
  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);

  std::cout << "7. Invalidate tamper hash membership" << std::endl;
  std::cout << "Validated: " << validated << std::endl;
  return 0;
34
examples/golang/msm/README.md
Normal file
@@ -0,0 +1,34 @@
# ICICLE example: MultiScalar Multiplication (MSM) in Golang

`ICICLE` provides Golang bindings to the CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).

## Usage

```go
err := Msm(
	/* Scalars input vector */ scalars,
	/* Points input vector */ points,
	/* MSMConfig reference */ &cfg,
	/* Projective point result */ results)
```

In this example we use the `BN254` and `BLS12377` curves. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where the input `points[]` uses affine coordinates and the `result` uses projective coordinates.

## What's in the example

1. Define the size of the MSM.
2. Generate random inputs on-device.
3. Configure the MSM.
4. Execute the MSM on-device.
5. Move the result to the host.

Running the example:
```sh
go run main.go
```

> [!NOTE]
> The default sizes are 2^17 - 2^22. You can change this by passing the `-l <size> -u <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> go run main.go -l=21 -u=24
> ```
209
examples/golang/msm/main.go
Normal file
@@ -0,0 +1,209 @@
package main

import (
	"flag"
	"fmt"
	"time"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"

	bls12377G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/g2"
	bls12377Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/msm"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"

	bn254G2 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/g2"
	bn254Msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

func main() {
	var logSizeMin int
	var logSizeMax int

	flag.IntVar(&logSizeMin, "l", 17, "Minimum log size")
	flag.IntVar(&logSizeMax, "u", 22, "Maximum log size")
	flag.Parse()

	sizeMax := 1 << logSizeMax

	print("Generating BN254 scalars ... ")
	startTime := time.Now()
	scalarsBn254Max := bn254.GenerateScalars(sizeMax)
	println(time.Since(startTime).String())

	print("Generating BN254 points ... ")
	startTime = time.Now()
	pointsBn254Max := bn254.GenerateAffinePoints(sizeMax)
	println(time.Since(startTime).String())

	print("Generating BN254 G2 points ... ")
	startTime = time.Now()
	pointsBn254G2Max := bn254G2.G2GenerateAffinePoints(sizeMax)
	println(time.Since(startTime).String())

	print("Generating BLS12_377 scalars ... ")
	startTime = time.Now()
	scalarsBls12377Max := bls12377.GenerateScalars(sizeMax)
	println(time.Since(startTime).String())

	print("Generating BLS12_377 points ... ")
	startTime = time.Now()
	pointsBls12377Max := bls12377.GenerateAffinePoints(sizeMax)
	println(time.Since(startTime).String())

	print("Generating BLS12_377 G2 points ... ")
	startTime = time.Now()
	pointsBls12377G2Max := bls12377G2.G2GenerateAffinePoints(sizeMax)
	println(time.Since(startTime).String())

	for logSize := logSizeMin; logSize <= logSizeMax; logSize++ {

		// Define the size of the problem for this iteration.
		size := 1 << logSize

		fmt.Printf("---------------------- MSM size 2^%d=%d ------------------------\n", logSize, size)

		// println(scalarsBls12377, pointsBls12377, pointsBn254G2)
		// println(scalarsBn254, pointsBn254, pointsBls12377G2)

		print("Configuring bn254 MSM ... ")
		startTime = time.Now()

		scalarsBn254 := scalarsBn254Max[:size]
		pointsBn254 := pointsBn254Max[:size]
		pointsBn254G2 := pointsBn254G2Max[:size]

		cfgBn254 := core.GetDefaultMSMConfig()
		cfgBn254G2 := core.GetDefaultMSMConfig()
		cfgBn254.IsAsync = true
		cfgBn254G2.IsAsync = true

		streamBn254, _ := cr.CreateStream()
		streamBn254G2, _ := cr.CreateStream()

		cfgBn254.Ctx.Stream = &streamBn254
		cfgBn254G2.Ctx.Stream = &streamBn254G2

		var projectiveBn254 bn254.Projective
		var projectiveBn254G2 bn254G2.G2Projective

		var msmResultBn254 core.DeviceSlice
		var msmResultBn254G2 core.DeviceSlice

		_, e := msmResultBn254.MallocAsync(projectiveBn254.Size(), projectiveBn254.Size(), streamBn254)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"Bn254 Malloc failed: ", e)
			panic(errorString)
		}
		_, e = msmResultBn254G2.MallocAsync(projectiveBn254G2.Size(), projectiveBn254G2.Size(), streamBn254G2)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"Bn254 Malloc G2 failed: ", e)
			panic(errorString)
		}

		println(time.Since(startTime).String())

		print("Configuring Bls12377 MSM ... ")
		startTime = time.Now()

		scalarsBls12377 := scalarsBls12377Max[:size]
		pointsBls12377 := pointsBls12377Max[:size]
		pointsBls12377G2 := pointsBls12377G2Max[:size]

		cfgBls12377 := core.GetDefaultMSMConfig()
		cfgBls12377G2 := core.GetDefaultMSMConfig()
		cfgBls12377.IsAsync = true
		cfgBls12377G2.IsAsync = true

		streamBls12377, _ := cr.CreateStream()
		streamBls12377G2, _ := cr.CreateStream()

		cfgBls12377.Ctx.Stream = &streamBls12377
		cfgBls12377G2.Ctx.Stream = &streamBls12377G2

		var projectiveBls12377 bls12377.Projective
		var projectiveBls12377G2 bls12377G2.G2Projective

		var msmResultBls12377 core.DeviceSlice
		var msmResultBls12377G2 core.DeviceSlice

		_, e = msmResultBls12377.MallocAsync(projectiveBls12377.Size(), projectiveBls12377.Size(), streamBls12377)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"Bls12_377 Malloc failed: ", e)
			panic(errorString)
		}
		_, e = msmResultBls12377G2.MallocAsync(projectiveBls12377G2.Size(), projectiveBls12377G2.Size(), streamBls12377G2)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"Bls12_377 Malloc G2 failed: ", e)
			panic(errorString)
		}

		println(time.Since(startTime).String())

		print("Executing bn254 MSM on device ... ")
		startTime = time.Now()

		e = bn254Msm.Msm(scalarsBn254, pointsBn254, &cfgBn254, msmResultBn254)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"bn254 Msm failed: ", e)
			panic(errorString)
		}
		e = bn254G2.G2Msm(scalarsBn254, pointsBn254G2, &cfgBn254G2, msmResultBn254G2)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"bn254 Msm G2 failed: ", e)
			panic(errorString)
		}

		msmResultBn254Host := make(core.HostSlice[bn254.Projective], 1)
		msmResultBn254G2Host := make(core.HostSlice[bn254G2.G2Projective], 1)

		msmResultBn254Host.CopyFromDeviceAsync(&msmResultBn254, streamBn254)
		msmResultBn254G2Host.CopyFromDeviceAsync(&msmResultBn254G2, streamBn254G2)

		msmResultBn254.FreeAsync(streamBn254)
		msmResultBn254G2.FreeAsync(streamBn254G2)

		cr.SynchronizeStream(&streamBn254)
		cr.SynchronizeStream(&streamBn254G2)

		println(time.Since(startTime).String())

		print("Executing Bls12377 MSM on device ... ")
		startTime = time.Now()

		e = bls12377Msm.Msm(scalarsBls12377, pointsBls12377, &cfgBls12377, msmResultBls12377)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"bls12_377 Msm failed: ", e)
			panic(errorString)
		}
		e = bls12377G2.G2Msm(scalarsBls12377, pointsBls12377G2, &cfgBls12377G2, msmResultBls12377G2)
		if e != cr.CudaSuccess {
			errorString := fmt.Sprint(
				"bls12_377 Msm G2 failed: ", e)
			panic(errorString)
		}

		msmResultBls12377Host := make(core.HostSlice[bls12377.Projective], 1)
		msmResultBls12377G2Host := make(core.HostSlice[bls12377G2.G2Projective], 1)

		msmResultBls12377Host.CopyFromDeviceAsync(&msmResultBls12377, streamBls12377)
		msmResultBls12377G2Host.CopyFromDeviceAsync(&msmResultBls12377G2, streamBls12377G2)

		msmResultBls12377.FreeAsync(streamBls12377)
		msmResultBls12377G2.FreeAsync(streamBls12377G2)

		cr.SynchronizeStream(&streamBls12377)
		cr.SynchronizeStream(&streamBls12377G2)

		println(time.Since(startTime).String())
	}
}
39
examples/golang/ntt/README.md
Normal file
@@ -0,0 +1,39 @@
# ICICLE example: Number Theoretic Transform (NTT) in Golang

## Key-Takeaway

`ICICLE` provides Golang bindings to the CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).

## Usage

```go
err := Ntt(
	/* input slice */ scalars,
	/* NTT Direction */ core.KForward,
	/* NTT Configuration */ &cfg,
	/* output slice */ result)
```

In this example we use the `BN254` and `BLS12377` fields.

## What's in this example

1. Define the size of the NTT.
2. Generate random inputs.
3. Set up the domain.
4. Configure the NTT.
5. Execute the NTT on-device.
6. Move the result to the host.

Running the example:

```sh
go run main.go
```

> [!NOTE]
> The default size is 2^20. You can change this by passing the `-s <size>` option. To change the size to 2^23, run the example like this:

```sh
go run main.go -s=23
```
131
examples/golang/ntt/main.go
Normal file
@@ -0,0 +1,131 @@
package main

import (
	"flag"
	"fmt"
	"time"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"

	bls12377Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/ntt"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"

	bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"

	bls12377Fft "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft"
	bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
)

func main() {
	var logSize int

	flag.IntVar(&logSize, "s", 20, "Log size")
	flag.Parse()

	size := 1 << logSize

	fmt.Printf("---------------------- NTT size 2^%d=%d ------------------------\n", logSize, size)

	print("Generating BN254 scalars ... ")
	startTime := time.Now()
	scalarsBn254 := bn254.GenerateScalars(size)
	println(time.Since(startTime).String())

	cfgBn254 := bn254Ntt.GetDefaultNttConfig()
	cfgBn254.IsAsync = true

	print("Generating BLS12_377 scalars ... ")
	startTime = time.Now()
	scalarsBls12377 := bls12377.GenerateScalars(size)
	println(time.Since(startTime).String())

	cfgBls12377 := bls12377Ntt.GetDefaultNttConfig()
	cfgBls12377.IsAsync = true

	rouMontBn254, _ := bn254Fft.Generator(uint64(size))
	rouBn254 := rouMontBn254.Bits()
	rouIcicleBn254 := bn254.ScalarField{}
	limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
	rouIcicleBn254.FromLimbs(limbsBn254)
	bn254Ntt.InitDomain(rouIcicleBn254, cfgBn254.Ctx, false)

	rouMontBls12377, _ := bls12377Fft.Generator(uint64(size))
	rouBls12377 := rouMontBls12377.Bits()
	rouIcicleBls12377 := bls12377.ScalarField{}
	limbsBls12377 := core.ConvertUint64ArrToUint32Arr(rouBls12377[:])
	rouIcicleBls12377.FromLimbs(limbsBls12377)
	bls12377Ntt.InitDomain(rouIcicleBls12377, cfgBls12377.Ctx, false)

	print("Configuring bn254 NTT ... ")
	startTime = time.Now()

	streamBn254, _ := cr.CreateStream()

	cfgBn254.Ctx.Stream = &streamBn254

	var nttResultBn254 core.DeviceSlice

	_, e := nttResultBn254.MallocAsync(size*scalarsBn254.SizeOfElement(), scalarsBn254.SizeOfElement(), streamBn254)
	if e != cr.CudaSuccess {
		errorString := fmt.Sprint(
			"Bn254 Malloc failed: ", e)
		panic(errorString)
	}

	println(time.Since(startTime).String())

	print("Configuring Bls12377 NTT ... ")
	startTime = time.Now()

	streamBls12377, _ := cr.CreateStream()

	cfgBls12377.Ctx.Stream = &streamBls12377

	var nttResultBls12377 core.DeviceSlice

	_, e = nttResultBls12377.MallocAsync(size*scalarsBls12377.SizeOfElement(), scalarsBls12377.SizeOfElement(), streamBls12377)
	if e != cr.CudaSuccess {
		errorString := fmt.Sprint(
			"Bls12_377 Malloc failed: ", e)
		panic(errorString)
	}

	println(time.Since(startTime).String())

	print("Executing bn254 NTT on device ... ")
	startTime = time.Now()

	err := bn254Ntt.Ntt(scalarsBn254, core.KForward, &cfgBn254, nttResultBn254)
	if err.CudaErrorCode != cr.CudaSuccess {
		errorString := fmt.Sprint(
			"bn254 Ntt failed: ", err)
		panic(errorString)
	}

	nttResultBn254Host := make(core.HostSlice[bn254.ScalarField], size)
	nttResultBn254Host.CopyFromDeviceAsync(&nttResultBn254, streamBn254)
	nttResultBn254.FreeAsync(streamBn254)
	cr.SynchronizeStream(&streamBn254)
	println(time.Since(startTime).String())

	print("Executing Bls12377 NTT on device ... ")
	startTime = time.Now()

	err = bls12377Ntt.Ntt(scalarsBls12377, core.KForward, &cfgBls12377, nttResultBls12377)
	if err.CudaErrorCode != cr.CudaSuccess {
		errorString := fmt.Sprint(
			"bls12_377 Ntt failed: ", err)
		panic(errorString)
	}

	nttResultBls12377Host := make(core.HostSlice[bls12377.ScalarField], size)
	nttResultBls12377Host.CopyFromDeviceAsync(&nttResultBls12377, streamBls12377)
	nttResultBls12377.FreeAsync(streamBls12377)

	cr.SynchronizeStream(&streamBls12377)

	println(time.Since(startTime).String())
}
49
examples/golang/polynomials/README.md
Normal file
@@ -0,0 +1,49 @@
# ICICLE example: Polynomials in Golang

`ICICLE` provides Golang bindings to the CUDA-accelerated C++ implementation of [Polynomials](https://dev.ingonyama.com/icicle/polynomials/overview).

## Usage
### Backend Initialization
```go
InitPolyBackend()
```
### Construction

```go
poly1 := CreateFromCoeffecitients(/* Coefficients of polynomial */ coeffs)
poly2 := CreateFromROUEvaluations(/* evaluations */ evals)
poly3 := Clone(/* polynomial to clone */ poly1)
```

### Arithmetic

```go
polyAdd := poly1.Add(&poly2)
polySub := poly1.Subtract(&poly2)
polyMul := poly1.Multiply(&poly2)
polyMulScalar := poly1.MultiplyByScalar(scalar)
quotient, remainder := poly1.Divide(&poly2)
```

### Evaluation

```go
ev := poly1.Eval(scalar)
ev2 := poly1.EvalOnDomain(scalars)
```

In this example we use the `BN254` and `Babybear` fields. The example shows the execution of arithmetic operations and evaluations.

## What's in the example

1. Define the size of polynomials.
2. Initialize backends.
3. Generate random polynomials.
4. Execute arithmetic operations.
5. Execute evaluations.
6. Execute slicing.

Running the example:
```sh
go run main.go
```
114
examples/golang/polynomials/main.go
Normal file
@@ -0,0 +1,114 @@
package main

import (
	"flag"
	"fmt"

	bn254Fft "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
	bn254Ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
	bn254Polynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/polynomial"

	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	babybear "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear"
	babybearNtt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/ntt"
	babybearPolynomial "github.com/ingonyama-zk/icicle/v2/wrappers/golang/fields/babybear/polynomial"
)

var maxNttLogSize uint
var polyLogSize uint

func initBn254Domain() core.IcicleError {
	deviceCfg, _ := cr.GetDefaultDeviceContext()
	rouMontBn254, _ := bn254Fft.Generator(uint64(1 << maxNttLogSize))
	rouBn254 := rouMontBn254.Bits()
	rouIcicleBn254 := bn254.ScalarField{}
	limbsBn254 := core.ConvertUint64ArrToUint32Arr(rouBn254[:])
	rouIcicleBn254.FromLimbs(limbsBn254)
	return bn254Ntt.InitDomain(rouIcicleBn254, deviceCfg, false)
}

func initBabybearDomain() core.IcicleError {
	deviceCfg, _ := cr.GetDefaultDeviceContext()
	rouIcicle := babybear.ScalarField{}
	rouIcicle.FromUint32(1461624142)
	return babybearNtt.InitDomain(rouIcicle, deviceCfg, false)
}

func init() {
	flag.UintVar(&maxNttLogSize, "maxNttLogSize", 20, "")
	flag.UintVar(&polyLogSize, "polyLogSize", 15, "")

	e := initBn254Domain()
	if e.IcicleErrorCode != core.IcicleSuccess {
		errorString := fmt.Sprint(
			"Bn254 Domain initialization failed: ", e)
		panic(errorString)
	}
	e = initBabybearDomain()
	if e.IcicleErrorCode != core.IcicleSuccess {
		errorString := fmt.Sprint(
			"Babybear Domain initialization failed: ", e)
		panic(errorString)
	}

	bn254Polynomial.InitPolyBackend()
	babybearPolynomial.InitPolyBackend()
}
func main() {
	polySize := 1 << polyLogSize

	// randomize three polynomials over the bn254 scalar field
	var fBn254 bn254Polynomial.DensePolynomial
	var gBn254 bn254Polynomial.DensePolynomial
	var hBn254 bn254Polynomial.DensePolynomial
	fBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize))
	gBn254.CreateFromCoeffecitients(bn254.GenerateScalars(polySize / 2))
	hBn254.CreateFromROUEvaluations(bn254.GenerateScalars(polySize / 4))

	// randomize two polynomials over the babybear field
	var fBabybear babybearPolynomial.DensePolynomial
	var gBabybear babybearPolynomial.DensePolynomial
	fBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize))
	gBabybear.CreateFromCoeffecitients(babybear.GenerateScalars(polySize / 2))

	// Arithmetic
	t0 := fBn254.Add(&gBn254)
	t1 := fBn254.Multiply(&hBn254)
	q, r := t1.Divide(&t0)
	rBabybear := fBabybear.Add(&gBabybear)
	rDegree := r.Degree()
	_ = rBabybear
	_ = rDegree

	// evaluate in a single domain point
	var five bn254.ScalarField
	five.FromUint32(5)
	qAtFive := q.Eval(five)

	var thirty bn254.ScalarField
	thirty.FromUint32(30)

	// evaluate on a domain. Note: domain and image can each be either a Host or a Device slice.
	// In this example the domain is on the host and the evals are on the device.
	hostDomain := core.HostSliceFromElements([]bn254.ScalarField{five, thirty})
	var deviceImage core.DeviceSlice
	_, err := deviceImage.Malloc(five.Size()*hostDomain.Len(), five.Size())
	if err != cr.CudaSuccess {
		errorString := fmt.Sprint(
			"deviceImage allocation failed: ", err)
		panic(errorString)
	}
	t1.EvalOnDomain(hostDomain, deviceImage)

	// slicing
	o := hBn254.Odd()
	e := hBn254.Even()

	oddMult := o.MultiplyByScalar(qAtFive)
	fold := e.Add(&oddMult) // e(x) + o(x)*scalar

	coeff := fold.GetCoeff(2) // coeff of x^2
	_ = coeff
}
@@ -14,51 +14,42 @@ endfunction()

function(set_gpu_env)
  # add the target cuda architectures
  # each additional architecture increases the compilation time and output file size
  if(DEFINED CUDA_ARCH) # user defined arch takes priority
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
  elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
    set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
  else()
    find_program(_nvidia_smi "nvidia-smi")

    if(_nvidia_smi)
      execute_process(
        COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
        OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
        OUTPUT_STRIP_TRAILING_WHITESPACE
      )
      # Process the output to form the CUDA architectures string
      string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")

      set(CUDA_ARCHITECTURES "")
      foreach(CAPABILITY ${GPU_COMPUTE_CAPABILITIES_LIST})
        # Remove the dot in compute capability to match CMake format
        string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
        if(CUDA_ARCHITECTURES)
          set(CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES};${CAPABILITY}")
        else()
          set(CUDA_ARCHITECTURES "${CAPABILITY}")
        endif()
      endforeach()

      message("Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
      set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
    else()
      # no GPUs found, like on Github CI runners
      message("Setting CMAKE_CUDA_ARCHITECTURES to: 50")
      set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
    endif()
  endif()

  # Check CUDA version and, if possible, enable multi-threaded compilation
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
    message(STATUS "Using multi-threaded CUDA compilation.")
@@ -69,4 +60,4 @@ function(set_gpu_env)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()
@@ -56,6 +56,9 @@ extern "C" cudaError_t babybear_mul_cuda(
extern "C" cudaError_t babybear_add_cuda(
  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

extern "C" cudaError_t babybear_accumulate_cuda(
  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t babybear_sub_cuda(
  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

@@ -68,6 +71,12 @@ extern "C" cudaError_t babybear_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t babybear_bit_reverse_cuda(
  const babybear::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  babybear::scalar_t* output);

extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);

extern "C" cudaError_t babybear_scalar_convert_montgomery(
@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
  bls12_377::g2_affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bls12_377::g2_affine_t* output_bases);

extern "C" cudaError_t bls12_377_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_377_g2_msm_cuda(
extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
  bls12_377::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bls12_377::affine_t* output_bases);

extern "C" cudaError_t bls12_377_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
extern "C" cudaError_t bls12_377_add_cuda(
  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

extern "C" cudaError_t bls12_377_accumulate_cuda(
  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bls12_377_sub_cuda(
  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

@@ -116,6 +113,12 @@ extern "C" cudaError_t bls12_377_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t bls12_377_bit_reverse_cuda(
  const bls12_377::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  bls12_377::scalar_t* output);

extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);

extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
@@ -18,11 +18,8 @@
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
  bls12_381::g2_affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bls12_381::g2_affine_t* output_bases);

extern "C" cudaError_t bls12_381_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bls12_381_g2_msm_cuda(
extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
  bls12_381::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bls12_381::affine_t* output_bases);

extern "C" cudaError_t bls12_381_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
extern "C" cudaError_t bls12_381_add_cuda(
  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

extern "C" cudaError_t bls12_381_accumulate_cuda(
  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bls12_381_sub_cuda(
  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

@@ -116,6 +113,12 @@ extern "C" cudaError_t bls12_381_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t bls12_381_bit_reverse_cuda(
  const bls12_381::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  bls12_381::scalar_t* output);

extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);

extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
@@ -19,11 +19,8 @@
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
  bn254::g2_affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bn254::g2_affine_t* output_bases);

extern "C" cudaError_t bn254_g2_msm_cuda(
@@ -31,11 +28,8 @@ extern "C" cudaError_t bn254_g2_msm_cuda(
extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
  bn254::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bn254::affine_t* output_bases);

extern "C" cudaError_t bn254_msm_cuda(
@@ -136,6 +130,9 @@ extern "C" cudaError_t bn254_mul_cuda(
extern "C" cudaError_t bn254_add_cuda(
  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

extern "C" cudaError_t bn254_accumulate_cuda(
  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bn254_sub_cuda(
  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

@@ -148,6 +145,12 @@ extern "C" cudaError_t bn254_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t bn254_bit_reverse_cuda(
  const bn254::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  bn254::scalar_t* output);

extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);

extern "C" cudaError_t bn254_scalar_convert_montgomery(
@@ -18,11 +18,8 @@
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
  bw6_761::g2_affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bw6_761::g2_affine_t* output_bases);

extern "C" cudaError_t bw6_761_g2_msm_cuda(
@@ -30,11 +27,8 @@ extern "C" cudaError_t bw6_761_g2_msm_cuda(
extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
  bw6_761::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  bw6_761::affine_t* output_bases);

extern "C" cudaError_t bw6_761_msm_cuda(
@@ -104,6 +98,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
extern "C" cudaError_t bw6_761_add_cuda(
  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

extern "C" cudaError_t bw6_761_accumulate_cuda(
  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bw6_761_sub_cuda(
  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

@@ -116,6 +113,12 @@ extern "C" cudaError_t bw6_761_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t bw6_761_bit_reverse_cuda(
  const bw6_761::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  bw6_761::scalar_t* output);

extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);

extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
@@ -17,11 +17,8 @@
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
  grumpkin::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  grumpkin::affine_t* output_bases);

extern "C" cudaError_t grumpkin_msm_cuda(
@@ -74,6 +71,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
extern "C" cudaError_t grumpkin_add_cuda(
  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

extern "C" cudaError_t grumpkin_accumulate_cuda(
  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t grumpkin_sub_cuda(
  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

@@ -86,6 +86,12 @@ extern "C" cudaError_t grumpkin_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t grumpkin_bit_reverse_cuda(
  const grumpkin::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  grumpkin::scalar_t* output);

extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);

extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
@@ -19,6 +19,9 @@ extern "C" cudaError_t stark252_mul_cuda(
extern "C" cudaError_t stark252_add_cuda(
  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

extern "C" cudaError_t stark252_accumulate_cuda(
  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t stark252_sub_cuda(
  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

@@ -31,6 +34,12 @@ extern "C" cudaError_t stark252_transpose_matrix_cuda(
  bool on_device,
  bool is_async);

extern "C" cudaError_t stark252_bit_reverse_cuda(
  const stark252::scalar_t* input,
  uint64_t n,
  vec_ops::BitReverseConfig& config,
  stark252::scalar_t* output);

extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);

extern "C" cudaError_t stark252_scalar_convert_montgomery(
@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_precompute_msm_bases_cuda(
  ${CURVE}::affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  ${CURVE}::affine_t* output_bases);

extern "C" cudaError_t ${CURVE}_msm_cuda(
@@ -1,10 +1,7 @@
extern "C" cudaError_t ${CURVE}_g2_precompute_msm_bases_cuda(
  ${CURVE}::g2_affine_t* bases,
  int msm_size,
  msm::MSMConfig& config,
  ${CURVE}::g2_affine_t* output_bases);

extern "C" cudaError_t ${CURVE}_g2_msm_cuda(
@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
extern "C" cudaError_t ${FIELD}_add_cuda(
  ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

extern "C" cudaError_t ${FIELD}_accumulate_cuda(
  ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t ${FIELD}_sub_cuda(
  ${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);
@@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
extern "C" cudaError_t ${FIELD}_extension_add_cuda(
  ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
  ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
  ${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);
@@ -175,7 +175,7 @@ public:
  UNROLL
#endif
  for (int i = 0; i < SCALAR_FF::NBITS; i++) {
    if (i > 0) { res = dbl(res); }
    if (scalar.get_scalar_digit(SCALAR_FF::NBITS - i - 1, 1)) { res = res + point; }
  }
  return res;
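The change above swaps the generic doubling `res = res + res` for a dedicated `dbl(res)` inside the double-and-add loop, since specialized point-doubling formulas are cheaper than general point addition. For orientation, here is a minimal sketch of the same loop structure over plain integers, where doubling and addition happen to coincide; the names are illustrative, not the icicle API:

```cuda
// A minimal sketch of MSB-first double-and-add, assuming a toy "group" of
// 64-bit integers where dbl(x) = x + x. Illustrative names only.
#include <cstdint>

uint64_t dbl(uint64_t x) { return x + x; }

// Computes scalar * point by scanning scalar bits from most to least significant.
uint64_t double_and_add(uint64_t scalar, uint64_t point, unsigned nbits = 64)
{
  uint64_t res = 0; // group identity
  for (unsigned i = 0; i < nbits; i++) {
    if (i > 0) res = dbl(res);                               // one doubling per bit
    if ((scalar >> (nbits - i - 1)) & 1) res = res + point;  // add on set bits
  }
  return res;
}
```

The MSB-first scan costs one doubling per bit plus one addition per set bit, which is why a cheaper doubling pays off directly.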
@@ -1001,6 +1001,17 @@ public:
    }
    return (u == one) ? b : c;
  }

  static constexpr HOST_DEVICE_INLINE Field pow(Field base, int exp)
  {
    Field res = one();
    while (exp > 0) {
      if (exp & 1) res = res * base;
      base = base * base;
      exp >>= 1;
    }
    return res;
  }
};

template <class CONFIG>
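The new `pow` is the standard LSB-first square-and-multiply: the base is squared once per exponent bit, and multiplied into the accumulator whenever the current bit is set, so the loop takes O(log exp) field multiplications. A host-side sketch of the same loop over 64-bit modular arithmetic, assuming the modulus is small enough that products fit in 64 bits (illustrative only, not the icicle API):

```cuda
// Square-and-multiply over uint64_t; mirrors the Field::pow loop above.
// Assumes p*p fits in 64 bits for the example values used.
#include <cstdint>

uint64_t pow_mod(uint64_t base, int exp, uint64_t p)
{
  uint64_t res = 1;                      // multiplicative identity
  base %= p;
  while (exp > 0) {
    if (exp & 1) res = (res * base) % p; // consume the lowest exponent bit
    base = (base * base) % p;            // square for the next bit
    exp >>= 1;
  }
  return res; // e.g. pow_mod(2, 13, 1000003) == 8192, since 13 = 0b1101
}
```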
@@ -43,7 +43,7 @@ namespace msm {
 * points, it should be set to the product of MSM size and [batch_size](@ref
 * batch_size). Default value: 0 (meaning it's equal to the MSM size). */
int precompute_factor; /**< The number of extra points to pre-compute for each point. See the
 * [precompute_msm_points](@ref precompute_msm_points) function, `precompute_factor` passed
 * there needs to be equal to the one used here. Larger values decrease the
 * number of computations to make, on-line memory footprint, but increase the static
 * memory footprint. Default value: 1 (i.e. don't pre-compute). */
@@ -52,7 +52,7 @@ namespace msm {
 * means more on-line memory footprint but also more parallelism and less computational
 * complexity (up to a certain point). Currently pre-computation is independent of
 * \f$ c \f$, however in the future value of \f$ c \f$ here and the one passed into the
 * [precompute_msm_points](@ref precompute_msm_points) function will need to be identical.
 * Default value: 0 (the optimal value of \f$ c \f$ is chosen automatically). */
int bitsize; /**< Number of bits of the largest scalar. Typically equals the bitsize of scalar field,
 * but if a different (better) upper bound is known, it should be reflected in this
@@ -127,6 +127,26 @@ namespace msm {
template <typename S, typename A, typename P>
cudaError_t msm(const S* scalars, const A* points, int msm_size, MSMConfig& config, P* results);

/**
 * A function that precomputes MSM bases by extending them with their shifted copies.
 * e.g.:
 * Original points: \f$ P_0, P_1, P_2, ... P_{size} \f$
 * Extended points: \f$ P_0, P_1, P_2, ... P_{size}, 2^{l}P_0, 2^{l}P_1, ..., 2^{l}P_{size},
 * 2^{2l}P_0, 2^{2l}P_1, ..., 2^{2l}P_{size}, ... \f$
 * @param points Points \f$ P_i \f$. In case of batch MSM, all *unique* points are concatenated.
 * @param msm_size MSM size \f$ N \f$. If a batch of MSMs (which all need to have the same size) is computed, this is
 * the size of 1 MSM.
 * @param config [MSMConfig](@ref MSMConfig) used in this MSM.
 * @param output_points Device-allocated buffer of size config.points_size * precompute_factor for the extended
 * points.
 * @tparam A The type of points \f$ \{P_i\} \f$ which is typically an [affine
 * Weierstrass](https://hyperelliptic.org/EFD/g1p/auto-shortw.html) point.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 *
 */
template <typename A, typename P>
cudaError_t precompute_msm_points(A* points, int msm_size, msm::MSMConfig& config, A* output_points);

/**
 * A function that precomputes MSM bases by extending them with their shifted copies.
 * e.g.:
@@ -148,7 +168,7 @@ namespace msm {
 *
 */
template <typename A, typename P>
[[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
  A* bases,
  int bases_size,
  int precompute_factor,
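A hedged sketch of how the two declarations above are meant to be chained: extend the points once, then feed the extended buffer to the MSM, with the same `precompute_factor` set in the config used for both. The curve type names, error handling, and buffer ownership here are assumptions of this sketch, not the documented contract:

```cuda
// Sketch: precompute shifted copies of the points, then run the MSM on them.
// `scalar_t`, `affine_t`, `projective_t` stand for the caller's curve types.
template <typename scalar_t, typename affine_t, typename projective_t>
cudaError_t msm_with_precompute(
  const scalar_t* scalars, affine_t* points, int msm_size, msm::MSMConfig& config, projective_t* result)
{
  // Device buffer for the extended points: msm_size * precompute_factor entries.
  affine_t* extended_points = nullptr;
  cudaMalloc(&extended_points, sizeof(affine_t) * (size_t)msm_size * config.precompute_factor);

  // Extend the points with their shifted copies: P_i, 2^l P_i, 2^{2l} P_i, ...
  cudaError_t err =
    msm::precompute_msm_points<affine_t, projective_t>(points, msm_size, config, extended_points);
  if (err != cudaSuccess) return err;

  // The precompute_factor used above must match the one set in this config.
  err = msm::msm<scalar_t, affine_t, projective_t>(scalars, extended_points, msm_size, config, result);
  cudaFree(extended_points);
  return err;
}
```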
@@ -56,6 +56,7 @@ namespace polynomials {
  // Evaluation methods
  virtual void evaluate(PolyContext op, const D* domain_x, I* eval /*OUT*/) = 0;
  virtual void evaluate_on_domain(PolyContext op, const D* domain, uint64_t size, I* evaluations /*OUT*/) = 0;
  virtual void evaluate_on_rou_domain(PolyContext op, uint64_t domain_log_size, I* evals /*OUT*/) = 0;

  // Methods to copy coefficients to host memory
  virtual C get_coeff(PolyContext op, uint64_t coeff_idx) = 0;
@@ -64,8 +65,6 @@ namespace polynomials {
  // Methods to get views of coefficients and evaluations, including device id
  virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
  get_coefficients_view(PolyContext p) = 0;
  virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
  get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
};

} // namespace polynomials
@@ -71,10 +71,8 @@ namespace polynomials {
  virtual std::pair<const C*, uint64_t> get_coefficients() = 0;
  virtual std::pair<const I*, uint64_t> get_rou_evaluations() = 0;

  // Methods to get views of coefficients
  virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view() = 0;
  virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
  get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;

  // Method for printing the context state to an output stream.
  virtual void print(std::ostream& os) = 0;
@@ -68,6 +68,7 @@ namespace polynomials {
  Image operator()(const Domain& x) const;
  void evaluate(const Domain* x, Image* eval /*OUT*/) const;
  void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
  void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const;  // caller allocates memory

  // Method to obtain the degree of the polynomial
  int64_t degree();
@@ -77,10 +78,8 @@ namespace polynomials {
  // caller is allocating output memory. If coeff==nullptr, returning nof_coeff only
  uint64_t copy_coeffs(Coeff* host_coeffs, uint64_t start_idx, uint64_t end_idx) const;

  // Methods for obtaining a view of the coefficients
  std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
  std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
  get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);

  // Overload stream insertion operator for printing.
  friend std::ostream& operator<<(std::ostream& os, Polynomial& poly)
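For orientation, a hedged sketch of the evaluation entry points listed above, with the caller allocating every output buffer as the comments require. `Poly` and `scalar_t` are placeholders for a concrete `Polynomial` instantiation and its field type, not icicle names:

```cuda
// Sketch only: exercises operator(), evaluate_on_domain and the new
// evaluate_on_rou_domain, assuming scalar_t exposes zero()/one().
#include <cstdint>
#include <vector>

template <typename Poly, typename scalar_t>
void evaluate_everywhere(Poly& f)
{
  scalar_t x = scalar_t::one();
  scalar_t y = f(x); // single-point evaluation via operator()

  // Evaluate on an explicit two-point domain; the caller allocates the output.
  scalar_t domain[2] = {scalar_t::zero(), scalar_t::one()};
  scalar_t evals[2];
  f.evaluate_on_domain(domain, 2, evals);

  // Evaluate on a roots-of-unity domain of size 2^k; output holds 2^k values.
  const uint64_t k = 10;
  std::vector<scalar_t> rou_evals(uint64_t(1) << k);
  f.evaluate_on_rou_domain(k, rou_evals.data());

  (void)y;
}
```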
@@ -113,6 +113,27 @@ namespace vec_ops {
  device_context::DeviceContext& ctx,
  bool on_device,
  bool is_async);

struct BitReverseConfig {
  device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream. */
  bool is_input_on_device;  /**< True if `input` is on device and false if it is not. Default value: false. */
  bool is_output_on_device; /**< True if `output` is on device and false if it is not. Default value: false. */
  bool is_async;            /**< Whether to run the vector operations asynchronously. If set to `true`, the function will be
                             * non-blocking and you'd need to synchronize it explicitly by running
                             * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                             * function will block the current CPU thread. */
};
static BitReverseConfig
DefaultBitReverseConfig(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
  BitReverseConfig config = {
    ctx,   // ctx
    false, // is_input_on_device
    false, // is_output_on_device
    false, // is_async
  };
  return config;
}
} // namespace vec_ops

#endif
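A hedged sketch of calling one of the newly declared `*_bit_reverse_cuda` entry points (babybear shown) with the default configuration, which expects host input and output and blocks until done. The wrapper name and error handling are this sketch's assumptions:

```cuda
// Sketch: permute a power-of-two-length array into bit-reversed order.
// A bit-reversal permutation is defined on indices of a fixed bit width,
// so n is assumed to be a power of two.
void bit_reverse_on_host(const babybear::scalar_t* input, uint64_t n, babybear::scalar_t* output)
{
  vec_ops::BitReverseConfig config = vec_ops::DefaultBitReverseConfig();
  cudaError_t err = babybear_bit_reverse_cuda(input, n, config, output);
  if (err != cudaSuccess) {
    // handle or propagate the error
  }
}
```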
@@ -24,6 +24,7 @@ set(POLYNOMIAL_SOURCE_FILES

# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
if (DEFINED CURVE)
  list(APPEND FIELD_SOURCE ${SRC}/poseidon/extern.cu)
  list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
  list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
endif()
44 icicle/src/mini-course-examples/Makefile Normal file
@@ -0,0 +1,44 @@
build_test:
	mkdir -p work
	nvcc -o work/test -std=c++17 -arch=sm_80 -I. -I../../include test.cu

run_test:
	mkdir -p work
	nvcc -o work/test -std=c++17 -arch=sm_80 -I. -I../../include test.cu
	work/test


build_perf:
	mkdir -p work
	nvcc -lineinfo -o work/perf -std=c++17 -arch=sm_80 -I. -I../../include perf_test.cu

run_perf:
	make build_perf
	work/perf


build_mem:
	mkdir -p work
	nvcc -lineinfo -o work/mem -std=c++17 -arch=sm_80 -I. -I../../include memory_test.cu

run_mem:
	make build_mem
	work/mem


build_transpose:
	mkdir -p work
	nvcc -lineinfo -o work/transpose -std=c++17 -arch=sm_80 -I. -I../../include transpose_test.cu

run_transpose:
	make build_transpose
	work/transpose


build_compute:
	mkdir -p work
	nvcc -lineinfo -o work/compute -std=c++17 -arch=sm_80 -I. -I../../include compute_test.cu

run_compute:
	make build_compute
	work/compute
130 icicle/src/mini-course-examples/compute_test.cu Normal file
@@ -0,0 +1,130 @@
#include "fields/id.h"
// #define FIELD_ID 1001
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef uint32_t test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
typedef test_projective test_t;
// typedef test_scalar test_t;

#define REPS 8

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1 << 22;
  // int N = 1<<25;

  test_t* arr1_h = new test_t[N];
  test_t* arr2_h = new test_t[N];

  test_t *arr1_d, *arr2_d;

  cudaMalloc(&arr1_d, N * sizeof(test_t));
  cudaMalloc(&arr2_d, N * sizeof(test_t));

  for (int i = 0; i < N; i++)
  {
    arr1_h[i] = i > 100 ? arr1_h[i - 100] : test_t::rand_host();
    // arr1_h[i] = i > 100? arr1_h[i-100] : rand();
  }

  cudaMemcpy(arr1_d, arr1_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);

  int THREADS = 128;
  int BLOCKS = (N + THREADS - 1) / THREADS;

  // warm up
  add_many_times<test_t, 16><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_mult<test_t,8><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  // add_many_times<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_add<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // limb_mult_bench<REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  segment_sum<test_t, REPS><<<BLOCKS, THREADS>>>(arr1_d, N);
  // shmem_segment_sum<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, N);
  // multi_mult<test_t,REPS><<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
  // multi_ntt8<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);
  // printf("normalized kernel_time : %.3f ms.\n", kernel_time/REPS);

  return 0;
}
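The harness above times kernels with CUDA events. For reference, a minimal sketch of the canonical pattern: record both events on the launch stream, synchronize on the stop event, then read the gap (the helper name is illustrative):

```cuda
// Sketch of event-based kernel timing; returns elapsed milliseconds.
#include <cuda_runtime.h>

template <typename Launch>
float time_kernel_ms(Launch launch)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, 0);  // enqueue start marker on the default stream
  launch();                   // enqueue the kernel(s) being measured
  cudaEventRecord(stop, 0);   // enqueue stop marker right after the kernel
  cudaEventSynchronize(stop); // wait until the stop marker has executed

  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}
```

With the harness above this would wrap, for example, the `segment_sum` launch: `float ms = time_kernel_ms([&] { segment_sum<test_t, REPS><<<BLOCKS, THREADS>>>(arr1_d, N); });`.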
457 icicle/src/mini-course-examples/kernels.cu Normal file
@@ -0,0 +1,457 @@
template <class T>
__global__ void add_elements_kernel(const T* x, const T* y, T* result, const unsigned count)
{
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= count) return;
  result[tid] = x[tid] + y[tid];
}

template <class T>
__global__ void fake_ntt_kernel(const T* x, T* result, const unsigned thread_count)
{
  extern __shared__ T shmem[];
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= thread_count) return;
  shmem[4*threadIdx.x] = x[4*tid] + x[4*tid+1];
  shmem[4*threadIdx.x+1] = x[4*tid] + T::neg(x[4*tid+1]);
  shmem[4*threadIdx.x+2] = x[4*tid+2] + x[4*tid+3];
  shmem[4*threadIdx.x+3] = x[4*tid+2] + T::neg(x[4*tid+3]);
  __syncthreads();
  result[4*tid] = shmem[2*threadIdx.x] + shmem[2*threadIdx.x + 4*blockDim.x/2];
  result[4*tid+1] = shmem[2*threadIdx.x] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x/2]);
  result[4*tid+2] = shmem[2*threadIdx.x+1] + shmem[2*threadIdx.x + 4*blockDim.x/2+1];
  result[4*tid+3] = shmem[2*threadIdx.x+1] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x/2+1]);
}
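`fake_ntt_kernel` declares its tile as `extern __shared__ T shmem[]`, so the shared-memory byte count must be supplied as the third launch-configuration argument; the kernel touches four shared elements per thread. A hedged launch sketch (the wrapper name is this sketch's invention):

```cuda
// Sketch: launching a kernel with dynamically sized shared memory.
// 4 elements per thread matches fake_ntt_kernel's shmem indexing.
template <class T>
void launch_fake_ntt(const T* x_d, T* result_d, unsigned thread_count)
{
  const int threads = 128;
  const int blocks = (thread_count + threads - 1) / threads;
  const size_t shmem_bytes = 4 * threads * sizeof(T); // third <<<...>>> parameter
  fake_ntt_kernel<T><<<blocks, threads, shmem_bytes>>>(x_d, result_d, thread_count);
}
```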
// Intentionally buggy variants of the kernels above, used in the debugging
// exercise; the bugs (missing bounds check, wrong shared-memory offsets) are
// preserved on purpose.
template <class T>
__global__ void bugged_add_elements_kernel(const T* x, const T* y, T* result, const unsigned count)
{
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
  // if (tid >= count) return;
  // printf("tid %d\n", tid);
  result[tid] = x[tid] + y[tid];
}

template <class T>
__global__ void bugged_fake_ntt_kernel(const T* x, T* result, const unsigned thread_count)
{
  extern __shared__ T shmem[];
  const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;

  // if (tid >= thread_count) return;
  // if (tid == 0){
  //   for (int i = 0; i < 8; i++)
  //   {
  //     shmem[i]=T::zero();
  //   }
  // }

  shmem[4*threadIdx.x] = x[4*tid] + x[4*tid+1];
  shmem[4*threadIdx.x+1] = x[4*tid] + T::neg(x[4*tid+1]);
  shmem[4*threadIdx.x+2] = x[4*tid+2] + x[4*tid+1];
  shmem[4*threadIdx.x+4] = x[4*tid+2] + T::neg(x[4*tid+1]);

  __syncthreads();

  // if (tid == 0){
  //   for (int i = 0; i < 8; i++)
  //   {
  //     printf("%d ",shmem[i]);
  //   }
  //   printf("\n");
  // }

  // printf("tid: %d, addr1: %d, addr2: %d\n", tid, 2*threadIdx.x, 2*threadIdx.x + 4*blockDim.x);
  result[4*tid] = shmem[2*threadIdx.x] + shmem[2*threadIdx.x + 4*blockDim.x];              // Incorrect offset
  result[4*tid+1] = shmem[2*threadIdx.x] + T::neg(shmem[2*threadIdx.x + 4*blockDim.x]);    // Incorrect offset
  result[4*tid+2] = shmem[2*threadIdx.x+1] + shmem[2*threadIdx.x + 4*blockDim.x+1];        // Incorrect offset
  result[4*tid+3] = shmem[2*threadIdx.x+1] + T::neg(shmem[2*threadIdx.x +4*blockDim.x+1]); // Incorrect offset
}
template <class T>
__global__ void bucket_acc_naive(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  for (int i = 0; i < sizes[tid]; i++)
  {
    buckets[indices[tid]] = buckets[indices[tid]] + buckets[indices[tid]];
  }
}

template <class T>
__global__ void bucket_acc_memory_baseline(T* buckets1, T* buckets2, unsigned* indices, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  buckets2[indices[tid]] = buckets1[indices[tid]];
}

template <class T>
__global__ void bucket_acc_compute_baseline(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  T bucket = buckets[indices[tid]];
  for (int j = 0; j < 100; j++)
  {
    for (int i = 0; i < sizes[tid]; i++)
    {
      bucket = bucket + bucket;
    }
  }
  buckets[indices[tid]] = bucket;
}

template <class T>
__global__ void bucket_acc_reg(T* buckets, unsigned* indices, unsigned* sizes, unsigned nof_buckets){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= nof_buckets) return;
  T bucket = buckets[indices[tid]];
  for (int i = 0; i < sizes[tid]; i++)
  {
    bucket = bucket + bucket;
  }
  buckets[indices[tid]] = bucket;
}
// #define NOF_TH 32*64

template <class T, int SIZE_T>
__global__ void device_memory_copy(void* arr1_raw, void* arr2_raw, unsigned size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size/SIZE_T) return;
  T* arr1 = (T*)arr1_raw;
  T* arr2 = (T*)arr2_raw;
  arr2[tid] = arr1[tid];
}

template <class T, int SIZE_T>
__global__ void segmented_memory_copy(void* arr1_raw, void* arr2_raw, unsigned size, unsigned read_segment_length, unsigned nof_write_segments){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int write_segment_length = nof_elements / nof_write_segments;
  int r_segment_idx = tid / read_segment_length;
  int r_segment_tid = tid % read_segment_length;
  int w_segment_idx = r_segment_idx % nof_write_segments;
  int w_segment_tid = r_segment_idx / nof_write_segments;
  int addr = w_segment_idx * write_segment_length + w_segment_tid * read_segment_length + r_segment_tid;
  // if (tid < 50) printf("tid %d, addr %d\n", tid, addr);
  if (tid >= nof_elements) return;
  T* arr1 = (T*)arr1_raw;
  T* arr2 = (T*)arr2_raw;
  arr2[addr] = arr1[addr];
}

template <class T, int SIZE_T>
__global__ void multi_memory_copy1(void* arr1_raw, void* arr2_raw, unsigned size, unsigned nof_elements_per_thread){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int segment_length = nof_elements / nof_elements_per_thread;
  if (tid >= segment_length) return;
  T* arr1 = (T*)arr1_raw;
  T* arr2 = (T*)arr2_raw;
  for (int i = 0; i < nof_elements_per_thread; i++)
  {
    arr2[tid + i*segment_length] = arr1[tid + i*segment_length];
  }
}

template <class T, int SIZE_T>
__global__ void multi_memory_copy2(void* arr1_raw, void* arr2_raw, unsigned size, unsigned nof_elements_per_thread){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_elements = size/SIZE_T;
  int nof_threads = nof_elements / nof_elements_per_thread;
  if (tid >= nof_threads) return;
  T* arr1 = (T*)arr1_raw;
  T* arr2 = (T*)arr2_raw;
  for (int i = 0; i < nof_elements_per_thread; i++)
  {
    arr2[tid*nof_elements_per_thread + i] = arr1[tid*nof_elements_per_thread + i];
  }
}

template <class T>
__global__ void simple_memory_copy(T* in, T* out, unsigned size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size) return;
  out[tid] = in[tid];
}
template <class T>
__global__ void naive_transpose_write(T *in, T *out, int row_length){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int row_id = tid / row_length;
  int col_id = tid % row_length;
  out[col_id * row_length + row_id] = in[tid];
}

template <class T>
__global__ void naive_transpose_read(T *in, T *out, int row_length){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int row_id = tid / row_length;
  int col_id = tid % row_length;
  out[tid] = in[col_id * row_length + row_id];
}

template <class T>
__global__ void shmem_transpose(T *in, T *out, int row_length){
  __shared__ T shmem[16][16];
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int shmem_col_id = threadIdx.x / 16;
  int shmem_row_id = threadIdx.x % 16;
  int blocks_per_row = row_length / 16;
  int block_row_id = blockIdx.x / blocks_per_row;
  int block_col_id = blockIdx.x % blocks_per_row;
  // shmem[shmem_col_id][shmem_row_id] = in[block_row_id*row_length*16 + block_col_id*16 + shmem_col_id*row_length + shmem_row_id];
  shmem[shmem_row_id][shmem_col_id] = in[block_row_id*row_length*16 + block_col_id*16 + shmem_col_id*row_length + shmem_row_id];
  __syncthreads();
  // out[block_col_id*row_length*16 + block_row_id*16 + shmem_col_id*row_length + shmem_row_id] = shmem[shmem_row_id][shmem_col_id];
  out[block_col_id*row_length*16 + block_row_id*16 + shmem_col_id*row_length + shmem_row_id] = shmem[shmem_col_id][shmem_row_id];
}
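`shmem_transpose` stages a 16x16 tile in shared memory so that both the global read and the global write are coalesced. For word-sized element types, a common refinement pads each tile row by one element so that column-wise shared-memory accesses land in distinct banks instead of conflicting. A hedged sketch of that variant, keeping the original indexing (it assumes, like the original, blockDim.x == 256 and row_length a multiple of 16):

```cuda
// Sketch: tiled transpose with a padded shared-memory row (16 -> 17) to
// avoid shared-memory bank conflicts on the column-strided accesses.
template <class T>
__global__ void shmem_transpose_padded(T* in, T* out, int row_length)
{
  __shared__ T tile[16][17]; // 17 = 16 + 1 padding column
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= row_length * row_length) return;
  int col = threadIdx.x / 16;
  int row = threadIdx.x % 16;
  int blocks_per_row = row_length / 16;
  int block_row = blockIdx.x / blocks_per_row;
  int block_col = blockIdx.x % blocks_per_row;
  tile[row][col] = in[block_row * row_length * 16 + block_col * 16 + col * row_length + row];
  __syncthreads();
  out[block_col * row_length * 16 + block_row * 16 + col * row_length + row] = tile[col][row];
}
```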
template <class T, int REPS>
__global__ void add_many_times(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size) return;
  T temp;
#pragma unroll
  for (int i = 0; i < REPS; i++)
  {
    temp = i? temp + temp : in[tid];
  }
  out[tid] = temp;
}


template <class T, int REPS>
__global__ void multi_add(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / REPS;
  if (tid >= segment_length) return;
  // #pragma unroll 1
  for (int i = 0; i < REPS; i++)
  {
    out[tid + i*segment_length] = in[tid + i*segment_length] + in[tid + i*segment_length];
  }
}

template <class T, int SEG_SIZE>
__global__ void segment_sum(T *inout, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_segments = size / SEG_SIZE;
  if (tid >= nof_segments) return;
  T sum = T::zero();
  T sums_sum = T::zero();
  for (int i = 0; i < SEG_SIZE; i++)
  {
    sums_sum = sums_sum + sum;
    sum = sum + inout[tid * SEG_SIZE + i];
  }
  inout[tid * SEG_SIZE] = sums_sum;
  // inout[tid * SEG_SIZE] = sum;
}

template <class T, int SEG_SIZE>
__global__ void shmem_segment_sum(T *inout, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int nof_segments = size / SEG_SIZE;
  if (tid >= nof_segments) return;
  __shared__ T shmem[128*2];
  // T sum = T::zero();
  // T sums_sum = T::zero();
  shmem[2*threadIdx.x] = T::zero();     // sum
  shmem[2*threadIdx.x + 1] = T::zero(); // sums_sum
  for (int i = 0; i < SEG_SIZE; i++)
  {
    {T sum = shmem[2*threadIdx.x];
    T sums_sum = shmem[2*threadIdx.x + 1];
    shmem[2*threadIdx.x + 1] = sums_sum + sum;}
    // {T sum = shmem[2*(127-threadIdx.x)];
    // T sums_sum = shmem[2*(127-threadIdx.x) + 1];
    // shmem[2*(127-threadIdx.x) + 1] = sums_sum + sum;}
    // shmem[2*(127-threadIdx.x) + 1] = shmem[2*(127-threadIdx.x) + 1] + shmem[2*(127-threadIdx.x)];
    // shmem[2*threadIdx.x + 1] = shmem[2*threadIdx.x + 1] + shmem[2*threadIdx.x];
    // __syncthreads();
    {T sum = shmem[2*threadIdx.x];
    T sums_sum = inout[tid * SEG_SIZE + i];
    shmem[2*threadIdx.x] = sum + sums_sum;}
    // shmem[2*threadIdx.x] = shmem[2*threadIdx.x] + inout[tid * SEG_SIZE + i];
    // __syncthreads();
  }
  inout[tid * SEG_SIZE] = shmem[2*threadIdx.x + 1];
  // inout[tid * SEG_SIZE] = sum;
}
template <class T, int REPS>
__global__ void multi_mult(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / REPS;
  if (tid >= segment_length) return;
#pragma unroll 1
  for (int i = 0; i < REPS; i++)
  {
    out[tid + i*segment_length] = in[tid + i*segment_length] * in[tid + i*segment_length];
  }
}
template <class E>
DEVICE_INLINE void ntt8opt(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
  E T;

  T = X3 - X7;
  X7 = X3 + X7;
  X3 = X1 - X5;
  X5 = X1 + X5;
  X1 = X2 + X6;
  X2 = X2 - X6;
  X6 = X0 + X4;
  X0 = X0 - X4;

  X4 = X6 + X1;
  X6 = X6 - X1;
  X1 = X3 + T;
  X3 = X3 - T;
  T = X5 + X7;
  X5 = X5 - X7;
  X7 = X0 + X2;
  X0 = X0 - X2;

  X2 = X6 + X5;
  X6 = X6 - X5;
  X5 = X7 - X1;
  X1 = X7 + X1;
  X7 = X0 - X3;
  X3 = X0 + X3;
  X0 = X4 + T;
  X4 = X4 - T;
}


template <class E>
DEVICE_INLINE void ntt8(E& X0, E& X1, E& X2, E& X3, E& X4, E& X5, E& X6, E& X7)
{
  E Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7;

  Y0 = X0 + X4;
  Y1 = X0 - X4;
  Y2 = X1 - X5;
  Y3 = X1 + X5;
  Y4 = X2 + X6;
  Y5 = X2 - X6;
  Y6 = X3 - X7;
  Y7 = X3 + X7;

  X0 = Y0 + Y2;
  X1 = Y0 - Y2;
  X2 = Y1 - Y3;
  X3 = Y1 + Y3;
  X4 = Y4 + Y6;
  X5 = Y4 - Y6;
  X6 = Y5 - Y7;
  X7 = Y5 + Y7;

  Y0 = X0 + X1;
  Y1 = X0 - X1;
  Y2 = X2 - X3;
  Y3 = X2 + X3;
  Y4 = X4 + X5;
  Y5 = X4 - X5;
  Y6 = X6 - X7;
  Y7 = X6 + X7;

  X0 = Y0;
  X1 = Y1;
  X2 = Y2;
  X3 = Y3;
  X4 = Y4;
  X5 = Y5;
  X6 = Y6;
  X7 = Y7;
}
template <class T>
__global__ void multi_ntt8(T *in, T *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  int segment_length = size / 8;
  if (tid >= segment_length) return;
  T X[8];
#pragma unroll
  for (int i = 0; i < 8; i++)
  {
    X[i] = in[tid + i*segment_length];
  }
  // ntt8(X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7]);
  ntt8opt(X[0],X[1],X[2],X[3],X[4],X[5],X[6],X[7]);
#pragma unroll
  for (int i = 0; i < 8; i++)
  {
    out[tid + i*segment_length] = X[i];
  }
}
__device__ void mul_naive(uint32_t *a, uint32_t *b, uint32_t *r){
  __align__(8) uint32_t odd[2];
  r[0] = ptx::mul_lo(a[0], b[0]);
  r[1] = ptx::mul_hi(a[0], b[0]);
  r[1] = ptx::mad_lo(a[0], b[1], r[1]);
  r[1] = ptx::mad_lo(a[1], b[0], r[1]);
  r[2] = ptx::mul_lo(a[1], b[1]);
  r[2] = ptx::mad_hi(a[1], b[0], r[2]);
  r[2] = ptx::mad_hi(a[0], b[1], r[2]);
  r[3] = ptx::mul_hi(a[1], b[1]);

  r[0] = ptx::add_cc(r[0], r[1]);
  r[1] = ptx::add_cc(r[2], r[3]);
}

__device__ void mul_icicle(uint32_t *a, uint32_t *b, uint32_t *r){
  __align__(8) uint32_t odd[2];
  r[0] = ptx::mul_lo(a[0], b[0]);
  r[1] = ptx::mul_hi(a[0], b[0]);
  r[2] = ptx::mul_lo(a[1], b[1]);
  r[3] = ptx::mul_hi(a[1], b[1]);
  odd[0] = ptx::mul_lo(a[0], b[1]);
  odd[1] = ptx::mul_hi(a[0], b[1]);
  odd[0] = ptx::mad_lo(a[1], b[0], odd[0]);
  odd[1] = ptx::mad_hi(a[1], b[0], odd[1]);
  r[1] = ptx::add_cc(r[1], odd[0]);
  r[2] = ptx::addc_cc(r[2], odd[1]);
  r[3] = ptx::addc(r[3], 0);

  r[0] = ptx::add_cc(r[0], r[1]);
  r[1] = ptx::add_cc(r[2], r[3]);
}
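Both routines above assemble a 64x64-bit product from 32-bit limbs; `mul_icicle` collects the even and odd partial products separately and then folds the odd column in through the carry chain (`add_cc`/`addc_cc`/`addc`), which shortens the dependency chain compared with `mul_naive`'s back-to-back `mad` accumulation. As a reference point, a hedged sketch of the same full 128-bit product via the built-in 64-bit intrinsic, useful for checking limb-code results rather than for benchmarking:

```cuda
// Sketch: 64x64 -> 128-bit product with the built-in high-half intrinsic.
#include <cstdint>

__device__ void mul64_reference(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi)
{
  lo = a * b;            // low 64 bits of the product
  hi = __umul64hi(a, b); // high 64 bits of the product
}
```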
template <int REPS>
__global__ void limb_mult_bench(uint32_t *in, uint32_t *out, int size){
  int tid = blockDim.x*blockIdx.x + threadIdx.x;
  if (tid >= size/2) return;
  uint32_t res[4];
  res[0] = in[tid];
  res[1] = in[tid + size/2];
  // typename T::Wide temp;
  for (int i = 0; i < REPS; i++)
  {
    mul_naive(res, res, res);
    // mul_icicle(res, res, res);
    // T::multiply_raw_device(res.limbs_storage, res.limbs_storage, res.limbs_storage);
    // temp = T::mul_wide(res, res);
  }
  // out[tid] = T::reduce(temp);
  out[tid] = res[0];
  out[tid + size/2] = res[1];
}
114 icicle/src/mini-course-examples/memory_test.cu Normal file
@@ -0,0 +1,114 @@
#include "fields/id.h"
// #define FIELD_ID 1
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include <chrono>
#include <iostream>
#include <vector>
#include <random>
#include <cub/device/device_radix_sort.cuh>

#include "fields/field.cuh"
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

#include "kernels.cu"

class Dummy_Scalar
{
public:
  static constexpr unsigned NBITS = 32;

  unsigned x;
  unsigned p = 10;
  // unsigned p = 1<<30;

  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }

  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
  {
    return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
  }

  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    return {(p1.x + p2.x) % p1.p};
  }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }

  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }

  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {(unsigned)rand() % 10};
    // return {(unsigned)rand()};
  }
};

// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

typedef int test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
// typedef test_scalar test_t;

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1 << 25;

  void *arr1, *arr2;

  cudaMalloc(&arr1, N);
  cudaMalloc(&arr2, N);

  int THREADS = 256;
  int BLOCKS = (N / sizeof(test_t) + THREADS - 1) / THREADS;

  // warm up
  device_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N);
  segmented_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, 32, 1024);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  device_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N);
  // segmented_memory_copy<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, 2, 1024);
  // int elements_per_thread = 8;
  // BLOCKS = (N/sizeof(test_t)/elements_per_thread + THREADS - 1)/THREADS;
  // multi_memory_copy1<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, elements_per_thread);
  // multi_memory_copy2<test_t, sizeof(test_t)><<<BLOCKS, THREADS>>>(arr1, arr2, N, elements_per_thread);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);

  return 0;
}
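The copy kernels move N bytes in and N bytes out, so the interesting figure of merit is effective bandwidth rather than raw time. A small sketch to derive it from the measured `kernel_time` (call it as `report_bandwidth(kernel_time, N)` after the timed copy):

```cuda
// Sketch: effective bandwidth of a device-to-device copy. Each byte is
// moved twice (one read, one write), hence the factor of 2.
#include <cstdio>

void report_bandwidth(float kernel_time_ms, size_t bytes)
{
  double seconds = kernel_time_ms / 1e3;
  double gbps = (2.0 * bytes / seconds) / 1e9;
  printf("effective bandwidth: %.2f GB/s\n", gbps);
}
```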
199 icicle/src/mini-course-examples/perf_test.cu Normal file
@@ -0,0 +1,199 @@
|
||||
#include "fields/id.h"
|
||||
// #define FIELD_ID 1001
|
||||
#define CURVE_ID 3
|
||||
#include "curves/curve_config.cuh"
|
||||
// #include "fields/field_config.cuh"
|
||||
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <cub/device/device_radix_sort.cuh>
|
||||
|
||||
#include "fields/field.cuh"
|
||||
#include "curves/projective.cuh"
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
|
||||
#include "kernels.cu"
|
||||
|
||||
class Dummy_Scalar
|
||||
{
|
||||
public:
|
||||
static constexpr unsigned NBITS = 32;
|
||||
|
||||
unsigned x;
|
||||
unsigned p = 10;
|
||||
// unsigned p = 1<<30;
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
|
||||
|
||||
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
|
||||
{
|
||||
os << scalar.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
|
||||
{
|
||||
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
|
||||
{
|
||||
return {(p1.x + p2.x) % p1.p};
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
|
||||
static HOST_INLINE Dummy_Scalar rand_host()
|
||||
{
|
||||
return {(unsigned)rand() % 10};
|
||||
// return {(unsigned)rand()};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// typedef field_config::scalar_t test_scalar;
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef int test_t;
// typedef int4 test_t;
// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
typedef test_scalar test_t;

int main()
{
  cudaEvent_t start, stop;
  float kernel_time;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  int N = 1<<20;
  // int N = 1<<3;

  test_t* buckets_h = new test_t[N];
  unsigned* indices_h = new unsigned[N];
  unsigned* sizes_h = new unsigned[N];

  for (int i = 0; i < N; i++)
  {
    indices_h[i] = static_cast<unsigned>(i);
    sizes_h[i] = static_cast<unsigned>(std::rand())%20;
    // sizes_h[i] = 10;
    buckets_h[i] = i<100? test_t::rand_host() : buckets_h[i-100];
    // buckets_h[i] = i<100? rand() : buckets_h[i-100];
    // buckets_h[i].x = i<100? rand() : buckets_h[i-100].x;
    // buckets_h[i].y = i<100? rand() : buckets_h[i-100].y;
    // buckets_h[i].z = i<100? rand() : buckets_h[i-100].z;
    // buckets_h[i].w = i<100? rand() : buckets_h[i-100].w;
    // if (i<10) std::cout << indices_h[i] << " " << sizes_h[i] << " " << buckets_h[i] << std::endl;
  }

  test_t *buckets_d, *buckets2_d;
  unsigned *sizes_d, *indices_d;

  cudaMalloc(&buckets_d, sizeof(test_t) * N);
  cudaMalloc(&buckets2_d, sizeof(test_t) * N);
  cudaMalloc(&sizes_d, sizeof(unsigned) * N);
  cudaMalloc(&indices_d, sizeof(unsigned) * N);

  cudaMemcpy(buckets_d, buckets_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(sizes_d, sizes_h, sizeof(unsigned) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(indices_d, indices_h, sizeof(unsigned) * N, cudaMemcpyHostToDevice);

  int THREADS = 256;
  int BLOCKS = (N + THREADS - 1)/THREADS;

  // warm up
  bucket_acc_naive<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaEventRecord(start, 0);

  // unsigned* sorted_sizes;
  // cudaMalloc(&sorted_sizes, sizeof(unsigned) * N);

  // unsigned* sorted_indices;
  // cudaMalloc(&sorted_indices, sizeof(unsigned) * N);
  // unsigned* sort_indices_temp_storage{};
  // size_t sort_indices_temp_storage_bytes = 0;
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_indices_temp_storage, sort_indices_temp_storage_bytes, sizes_d,
  //   sorted_sizes, indices_d, sorted_indices, N, 0);
  // cudaMalloc(&sort_indices_temp_storage, sort_indices_temp_storage_bytes);
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_indices_temp_storage, sort_indices_temp_storage_bytes, sizes_d,
  //   sorted_sizes, indices_d, sorted_indices, N, 0);
  // cudaFree(sort_indices_temp_storage);

  // test_t* sorted_buckets;
  // cudaMalloc(&sorted_buckets, sizeof(test_t) * N);
  // unsigned* sort_buckets_temp_storage{};
  // size_t sort_buckets_temp_storage_bytes = 0;
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_buckets_temp_storage, sort_buckets_temp_storage_bytes, sizes_d,
  //   sorted_sizes, buckets_d, sorted_buckets, N, 0);
  // cudaMalloc(&sort_buckets_temp_storage, sort_buckets_temp_storage_bytes);
  // cub::DeviceRadixSort::SortPairsDescending(
  //   sort_buckets_temp_storage, sort_buckets_temp_storage_bytes, sizes_d,
  //   sorted_sizes, buckets_d, sorted_buckets, N, 0);
  // cudaFree(sort_buckets_temp_storage);

  // cudaEventRecord(start, 0);

  bucket_acc_naive<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_compute_baseline<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_memory_baseline<<<BLOCKS, THREADS>>>(buckets_d, buckets2_d, indices_d, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(buckets_d, indices_d, sizes_d, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(buckets_d, sorted_indices, sorted_sizes, N);
  // bucket_acc_reg<<<BLOCKS, THREADS>>>(sorted_buckets, indices_d, sorted_sizes, N);

  // simple_memory_copy<<<64, 32>>>(buckets_d, buckets2_d, N);
  // simple_memory_copy<<<BLOCKS, THREADS>>>(buckets_d, buckets2_d, N);

  cudaDeviceSynchronize();
  std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
  cudaEventRecord(stop, 0);
  cudaStreamSynchronize(0);
  cudaEventElapsedTime(&kernel_time, start, stop);
  printf("kernel_time : %.3f ms.\n", kernel_time);

  cudaMemcpy(buckets_h, buckets_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  // cudaMemcpy(buckets_h, sorted_buckets, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  // cudaMemcpy(sizes_h, sorted_indices, sizeof(unsigned) * N, cudaMemcpyDeviceToHost);

  // printf("res:\n");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << buckets_h[i] << "\n";
  //   // std::cout << sizes_h[i] << "\n";
  // }
  // printf("\n");
  // printf("C test: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << Cb_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C ref: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << C_d[i] << ", ";
  //   // std::cout << C_h[i] << ", ";
  // }
  // printf("\n");

  return 0;
}
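For orientation: kernels.cu is not part of this diff, so the kernels timed above are not shown. A minimal sketch of what a naive bucket-accumulation kernel of this shape could look like (the name matches the call site, but the body and layout here are assumptions, not the repo's code):

template <typename T>
__global__ void bucket_acc_naive(T* buckets, const unsigned* indices, const unsigned* sizes, int n)
{
  // One thread per bucket; hypothetical sketch, not the actual kernels.cu implementation.
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) return;
  const unsigned ind = indices[tid];
  T acc = T::zero();
  // Serial accumulation: a thread with a larger sizes[tid] simply loops longer,
  // which is exactly the load imbalance this benchmark exposes (random vs. fixed sizes).
  for (unsigned i = 0; i < sizes[tid]; i++)
    acc = acc + buckets[ind];
  buckets[ind] = acc;
}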
199
icicle/src/mini-course-examples/test.cu
Normal file
@@ -0,0 +1,199 @@
#include "fields/id.h"
|
||||
// #define FIELD_ID 2
|
||||
#define CURVE_ID 3
|
||||
#include "curves/curve_config.cuh"
|
||||
// #include "fields/field_config.cuh"
|
||||
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "fields/field.cuh"
|
||||
#include "curves/projective.cuh"
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
|
||||
#include "kernels.cu"
|
||||
|
||||
class Dummy_Scalar
|
||||
{
|
||||
public:
|
||||
static constexpr unsigned NBITS = 32;
|
||||
|
||||
unsigned x;
|
||||
unsigned p = 10;
|
||||
// unsigned p = 1<<30;
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
|
||||
|
||||
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
|
||||
{
|
||||
os << scalar.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
|
||||
{
|
||||
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
|
||||
{
|
||||
return {(p1.x + p2.x) % p1.p};
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
|
||||
static HOST_INLINE Dummy_Scalar rand_host()
|
||||
{
|
||||
return {(unsigned)rand() % 10};
|
||||
// return {(unsigned)rand()};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef Dummy_Scalar test_t;
// typedef test_projective test_t;
typedef test_scalar test_t;

void queryGPUProperties() {
  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    std::cerr << "cudaGetDeviceCount returned " << static_cast<int>(error_id) << " -> " << cudaGetErrorString(error_id) << std::endl;
    std::cerr << "Result = FAIL" << std::endl;
    exit(EXIT_FAILURE);
  }

  if (deviceCount == 0) {
    std::cout << "There are no available device(s) that support CUDA." << std::endl;
  } else {
    std::cout << "Detected " << deviceCount << " CUDA Capable device(s)." << std::endl;
  }

  for (int dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    std::cout << "Device " << dev << ": \"" << deviceProp.name << "\"" << std::endl;
    std::cout << "  CUDA Capability Major/Minor version number: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    std::cout << "  Total amount of global memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << "  Number of multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
    std::cout << "  Total amount of global memory: " << deviceProp.totalGlobalMem << " bytes" << std::endl;
    std::cout << "  Total amount of shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
    std::cout << "  Total amount of shared memory per multiprocessor: " << deviceProp.sharedMemPerMultiprocessor << " bytes" << std::endl;
    std::cout << "  Total number of registers available per block: " << deviceProp.regsPerBlock << std::endl;
    std::cout << "  Total number of registers available per multiprocessor: " << deviceProp.regsPerMultiprocessor << std::endl;
    std::cout << "  Warp size: " << deviceProp.warpSize << std::endl;
    std::cout << "  Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "  Maximum number of threads per multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << "  Maximum sizes of each dimension of a block: " << deviceProp.maxThreadsDim[0] << " x "
              << deviceProp.maxThreadsDim[1] << " x " << deviceProp.maxThreadsDim[2] << std::endl;
    std::cout << "  Maximum sizes of each dimension of a grid: " << deviceProp.maxGridSize[0] << " x "
              << deviceProp.maxGridSize[1] << " x " << deviceProp.maxGridSize[2] << std::endl;
    std::cout << "  Clock rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
    std::cout << "  Memory clock rate: " << deviceProp.memoryClockRate / 1000 << " MHz" << std::endl;
    std::cout << "  Memory bus width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
    std::cout << "  Peak memory bandwidth: "
              << 2.0 * deviceProp.memoryClockRate * (deviceProp.memoryBusWidth / 8) / 1.0e6 << " GB/s" << std::endl;
  }
}
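The peak-bandwidth expression printed above expands to the usual DDR formula; CUDA reports `memoryClockRate` in kHz, so the division by $10^6$ lands in GB/s:

$$\text{BW}_{\text{peak}} \;=\; \frac{2 \cdot f_{\text{mem}}[\text{kHz}] \cdot (\text{bus width}/8)[\text{B}]}{10^{6}}\ \text{GB/s},$$

where the factor 2 accounts for double-data-rate memory. As a sanity check with A100-class numbers ($f_{\text{mem}} = 1{,}215{,}000$ kHz on a 5120-bit bus): $2 \cdot 1215000 \cdot 640 / 10^6 \approx 1555$ GB/s.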
int main()
{
  queryGPUProperties();

  int N = 1<<20;
  // int N = 300;

  test_t* A_h = new test_t[N];
  test_t* B_h = new test_t[N];
  test_t* C_h = new test_t[N];
  test_t* Cb_h = new test_t[N];

  for (int i = 0; i < N; i++)
  {
    A_h[i] = i<100? test_t::rand_host() : A_h[i-100];
    B_h[i] = i<100? test_t::rand_host() : B_h[i-100];
  }

  test_t *A_d,*B_d,*C_d;
  test_t *Cb_d;

  cudaMalloc(&A_d, sizeof(test_t) * N);
  cudaMalloc(&B_d, sizeof(test_t) * N);
  cudaMalloc(&C_d, sizeof(test_t) * N);
  cudaMalloc(&Cb_d, sizeof(test_t) * N);

  cudaMemcpy(A_d, A_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, sizeof(test_t) * N, cudaMemcpyHostToDevice);

  // int THREADS = 256;
  // int BLOCKS = (N + THREADS - 1)/THREADS;
  // add_elements_kernel<<<BLOCKS, THREADS>>>(A_d, B_d, C_d, N);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // THREADS = 256;
  // BLOCKS = (N + THREADS - 1)/THREADS;
  // bugged_add_elements_kernel<<<BLOCKS, THREADS>>>(A_d, B_d, Cb_d, N);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // int THREADS = 128;
  // int BLOCKS = (N/4 + THREADS - 1)/THREADS;
  // // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS>>>(A_d, C_d, N);
  // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS*4>>>(A_d, C_d, N/4);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  // THREADS = 128;
  // BLOCKS = (N/4 + THREADS - 1)/THREADS;
  // // fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS>>>(A_d, C_d, N);
  // bugged_fake_ntt_kernel<<<BLOCKS, THREADS, sizeof(test_t)*THREADS*4>>>(A_d, Cb_d, N/4);
  // // bugged_fake_ntt_kernel<<<1, 1, sizeof(test_t)*THREADS*4>>>(A_d, Cb_d, N/4);
  // cudaDeviceSynchronize();
  // // printf("cuda error %d\n", cudaGetLastError());
  // std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;

  cudaMemcpy(C_h, C_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);
  cudaMemcpy(Cb_h, Cb_d, sizeof(test_t) * N, cudaMemcpyDeviceToHost);

  // printf("A: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << A_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C test: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << Cb_h[i] << ", ";
  // }
  // printf("\n");
  // printf("C ref: ");
  // for (size_t i = 0; i < 8; i++)
  // {
  //   std::cout << C_d[i] << ", ";
  //   // std::cout << C_h[i] << ", ";
  // }
  // printf("\n");

  return 0;
}
123
icicle/src/mini-course-examples/transpose_test.cu
Normal file
@@ -0,0 +1,123 @@
#include "fields/id.h"
|
||||
#define FIELD_ID 1001
|
||||
// #define CURVE_ID 3
|
||||
// #include "curves/curve_config.cuh"
|
||||
#include "fields/field_config.cuh"
|
||||
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <cub/device/device_radix_sort.cuh>
|
||||
|
||||
#include "fields/field.cuh"
|
||||
#include "curves/projective.cuh"
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
|
||||
#include "kernels.cu"
|
||||
|
||||
class Dummy_Scalar
|
||||
{
|
||||
public:
|
||||
static constexpr unsigned NBITS = 32;
|
||||
|
||||
unsigned x;
|
||||
unsigned p = 10;
|
||||
// unsigned p = 1<<30;
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1}; }
|
||||
|
||||
friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
|
||||
{
|
||||
os << scalar.x;
|
||||
return os;
|
||||
}
|
||||
|
||||
HOST_DEVICE_INLINE unsigned get_scalar_digit(unsigned digit_num, unsigned digit_width) const
|
||||
{
|
||||
return (x >> (digit_num * digit_width)) & ((1 << digit_width) - 1);
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
|
||||
{
|
||||
return {(p1.x + p2.x) % p1.p};
|
||||
}
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { return (p1.x == p2.x); }
|
||||
|
||||
friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2) { return (p1.x == p2); }
|
||||
|
||||
static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar) { return {scalar.p - scalar.x}; }
|
||||
static HOST_INLINE Dummy_Scalar rand_host()
|
||||
{
|
||||
return {(unsigned)rand() % 10};
|
||||
// return {(unsigned)rand()};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
typedef field_config::scalar_t test_scalar;
|
||||
// typedef curve_config::scalar_t test_scalar;
|
||||
// typedef curve_config::projective_t test_projective;
|
||||
// typedef curve_config::affine_t test_affine;
|
||||
|
||||
// typedef int test_t;
|
||||
// typedef int4 test_t;
|
||||
// typedef Dummy_Scalar test_t;
|
||||
// typedef test_projective test_t;
|
||||
typedef test_scalar test_t;
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
float kernel_time;
|
||||
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
int N = 1<<11;
|
||||
int N2 = N*N;
|
||||
|
||||
test_t* arr1_h = new test_t[N2];
|
||||
test_t* arr2_h = new test_t[N2];
|
||||
|
||||
test_t *arr1_d, *arr2_d;
|
||||
|
||||
cudaMalloc(&arr1_d, N2*sizeof(test_t));
|
||||
cudaMalloc(&arr2_d, N2*sizeof(test_t));
|
||||
|
||||
for (int i = 0; i < N2; i++)
|
||||
{
|
||||
arr1_h[i] = i > 100? arr1_h[i-100] : test_t::rand_host();
|
||||
}
|
||||
|
||||
cudaMemcpy(arr1_d, arr1_h, sizeof(test_t) * N2, cudaMemcpyHostToDevice);
|
||||
|
||||
int THREADS = 256;
|
||||
int BLOCKS = (N2 + THREADS - 1)/THREADS;
|
||||
|
||||
//warm up
|
||||
simple_memory_copy<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N2);
|
||||
shmem_transpose<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
|
||||
cudaDeviceSynchronize();
|
||||
std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
|
||||
|
||||
cudaEventRecord(start, 0);
|
||||
|
||||
simple_memory_copy<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N2);
|
||||
// naive_transpose_write<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
|
||||
// naive_transpose_read<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
|
||||
// shmem_transpose<<<BLOCKS, THREADS>>>(arr1_d, arr2_d, N);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
std::cout << "cuda err: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaStreamSynchronize(0);
|
||||
cudaEventElapsedTime(&kernel_time, start, stop);
|
||||
printf("kernel_time : %.3f ms.\n", kernel_time);
|
||||
|
||||
return 0;
|
||||
}
|
||||
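The transpose kernels also live in kernels.cu and are not shown in this diff. As a sketch of the technique `shmem_transpose` names, a tiled shared-memory transpose looks roughly like the following (tile size, 2D launch, and padding are illustrative assumptions; the file's version is launched over a 1D grid):

#define TILE_DIM 32

// Hypothetical tiled transpose; the +1 padding column avoids shared-memory bank
// conflicts when the tile is read back in transposed order.
template <typename T>
__global__ void shmem_transpose_sketch(const T* in, T* out, int n)
{
  __shared__ T tile[TILE_DIM][TILE_DIM + 1];
  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[(size_t)y * n + x]; // coalesced read
  __syncthreads();
  x = blockIdx.y * TILE_DIM + threadIdx.x; // swap block coordinates
  y = blockIdx.x * TILE_DIM + threadIdx.y;
  if (x < n && y < n) out[(size_t)y * n + x] = tile[threadIdx.x][threadIdx.y]; // coalesced write
}

Launched as shmem_transpose_sketch<<<dim3(n/TILE_DIM, n/TILE_DIM), dim3(TILE_DIM, TILE_DIM)>>>(arr1_d, arr2_d, N), this keeps both the global read and the global write coalesced, which is what the naive_transpose_read/naive_transpose_write variants above each give up on one side.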
@@ -1,4 +1,8 @@
build_msm:
	mkdir -p work
	nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu

test_msm:
	mkdir -p work
	nvcc -o work/test_msm -std=c++17 -I. -I../../include tests/msm_test.cu
	work/test_msm
	nvcc -o work/test_msm -std=c++17 -arch=sm_80 -I. -I../../include tests/msm_test.cu
	work/test_msm
@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"

namespace msm {
  /**
   * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
   * - `A` is the [affine representation](@ref affine_t) of curve points;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(CURVE, precompute_msm_points_cuda)(
    affine_t* points, int msm_size, MSMConfig& config, affine_t* output_points)
  {
    return precompute_msm_points<affine_t, projective_t>(points, msm_size, config, output_points);
  }
  /**
   * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -8,6 +8,17 @@ using namespace field_config;
#include "utils/utils.h"

namespace msm {
  /**
   * Extern "C" version of [precompute_msm_points](@ref precompute_msm_points) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
   * - `A` is the [affine representation](@ref g2_affine_t) of G2 curve points;
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  extern "C" cudaError_t CONCAT_EXPAND(CURVE, g2_precompute_msm_points_cuda)(
    g2_affine_t* points, int msm_size, MSMConfig& config, g2_affine_t* output_points)
  {
    return precompute_msm_points<g2_affine_t, g2_projective_t>(points, msm_size, config, output_points);
  }
  /**
   * Extern "C" version of [precompute_msm_bases](@ref precompute_msm_bases) function with the following values of
   * template parameters (where the curve is given by `-DCURVE` env variable during build):
@@ -26,7 +37,6 @@ namespace msm {
    return precompute_msm_bases<g2_affine_t, g2_projective_t>(
      bases, bases_size, precompute_factor, _c, are_bases_on_device, ctx, output_bases);
  }

  /**
   * Extern "C" version of [msm](@ref msm) function with the following values of template parameters
   * (where the curve is given by `-DCURVE` env variable during build):
@@ -22,7 +22,6 @@ namespace msm {

#define MAX_TH 256

// #define SIGNED_DIG //WIP
// #define SSM_SUM //WIP

template <typename A, typename P>
@@ -74,10 +73,10 @@ namespace msm {
    unsigned* bucket_sizes_sum,
    unsigned* bucket_sizes,
    unsigned* large_bucket_thread_indices,
    unsigned num_of_threads)
    unsigned nof_threads)
  {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_of_threads) { return; }
    if (tid >= nof_threads) { return; }

    unsigned large_bucket_tid = large_bucket_thread_indices[tid];
    unsigned segment_ind = tid - bucket_sizes_sum[large_bucket_tid] - large_bucket_tid;
@@ -91,12 +90,13 @@ namespace msm {
    P* v_r,
    unsigned block_size,
    unsigned write_stride,
    unsigned buckets_per_bm,
    unsigned write_phase,
    unsigned step,
    unsigned num_of_threads)
    unsigned nof_threads)
  {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_of_threads) { return; }
    if (tid >= nof_threads) return;

    // we need a shifted tid because we don't want to be reducing into zero buckets; this allows us to skip them.
    // for write_phase==1, the read pattern is different so we don't skip over anything.
@@ -110,7 +110,7 @@ namespace msm {
    const unsigned read_ind = block_size * shifted_block_id + block_tid;
    const unsigned write_ind = jump * shifted_block_id + block_tid;
    const unsigned v_r_key =
      write_stride ? ((write_ind / write_stride) * 2 + write_phase) * write_stride + write_ind % write_stride
      write_stride ? ((write_ind / buckets_per_bm) * 2 + write_phase) * write_stride + write_ind % buckets_per_bm
                   : write_ind;
    v_r[v_r_key] = v[read_ind] + v[read_ind + jump];
  }
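In outline, each pass of `single_stage_multi_reduction_kernel` performs a pairwise fold of every window: a block of `block_size` buckets is mapped to half as many by

$$v'[k] \;=\; v[k] + v[k + \text{jump}],$$

where `jump` is defined outside this hunk but, given the `v[read_ind] + v[read_ind + jump]` pairing, is evidently half of `block_size`. Repeating the pass with a halved block size eventually leaves one partial sum per window; `write_phase` and `buckets_per_bm` only reshuffle where $v'$ lands between the two parallel reduction streams.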
@@ -325,32 +325,50 @@ namespace msm {
  }

  template <typename P>
  __global__ void last_pass_kernel(const P* final_buckets, P* final_sums, unsigned num_sums)
  __global__ void last_pass_kernel(
    const P* final_buckets,
    P* final_sums,
    unsigned nof_sums_per_batch,
    unsigned batch_size,
    unsigned nof_bms_per_batch,
    unsigned orig_c)
  {
    unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid >= num_sums) return;
    final_sums[tid] = final_buckets[2 * tid + 1];
    if (tid >= nof_sums_per_batch * batch_size) return;
    unsigned batch_index = tid / nof_sums_per_batch;
    unsigned batch_tid = tid % nof_sums_per_batch;
    unsigned bm_index = batch_tid / orig_c;
    unsigned bm_tid = batch_tid % orig_c;
    for (unsigned c = orig_c; c > 1;) {
      c = (c + 1) >> 1;
      bm_index <<= 1;
      if (bm_tid >= c) {
        bm_index++;
        bm_tid -= c;
      }
    }
    final_sums[tid] = final_buckets[2 * (batch_index * nof_bms_per_batch + bm_index) + 1];
  }

  // this kernel computes the final result using the double and add algorithm
  // it is done by a single thread
  template <typename P, typename S>
  __global__ void final_accumulation_kernel(
    const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned nof_empty_bms, unsigned c)
    const P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_results, unsigned c)
  {
    unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid >= nof_msms) return;
    P final_result = P::zero();
    // Note: in some cases accumulation of bm is implemented such that some bms are known to be empty. Therefore
    // skipping them.
    for (unsigned i = nof_bms - nof_empty_bms; i > 1; i--) {
      final_result = final_result + final_sums[i - 1 + tid * nof_bms]; // add
      for (unsigned j = 0; j < c; j++)                                 // double
    for (unsigned i = nof_results; i > 1; i--) {
      final_result = final_result + final_sums[i - 1 + tid * nof_results]; // add
      for (unsigned j = 0; j < c; j++)                                     // double
      {
        final_result = final_result + final_result;
      }
    }
    final_results[tid] = final_result + final_sums[tid * nof_bms];
    final_results[tid] = final_result + final_sums[tid * nof_results];
  }
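The double-and-add loop above is Horner's rule applied to the per-window sums: with window width $c$ and window sums $S_0,\dots,S_{k-1}$ (one per bucket module, least significant first), the MSM result is

$$\sum_{i=0}^{k-1} 2^{c i} S_i \;=\; \Big(\big(S_{k-1}\cdot 2^{c} + S_{k-2}\big)\cdot 2^{c} + \cdots + S_1\Big)\cdot 2^{c} + S_0,$$

where each multiplication by $2^c$ is realized as $c$ point doublings (the inner `for (j < c)` loop; `final_result + final_result` is a doubling).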
  // this function computes msm using the bucket method
@@ -384,11 +402,6 @@ namespace msm {
      THROW_ICICLE_ERR(
        IcicleError_t::InvalidArgument, "bucket_method_msm: #points must be divisible by single_msm_size*batch_size");
    }
    if ((precompute_factor & (precompute_factor - 1)) != 0) {
      THROW_ICICLE_ERR(
        IcicleError_t::InvalidArgument,
        "bucket_method_msm: precompute factors that are not powers of 2 currently unsupported");
    }

    const S* d_scalars;
    S* d_allocated_scalars = nullptr;
@@ -586,7 +599,7 @@ namespace msm {
    CHK_IF_RETURN(cudaFreeAsync(single_bucket_indices, stream));

    // find large buckets
    unsigned average_bucket_size = single_msm_size / (1 << c);
    unsigned average_bucket_size = (single_msm_size / (1 << c)) * precompute_factor;
    // how large a bucket must be to qualify as a "large bucket"
    unsigned bucket_th = large_bucket_factor * average_bucket_size;
    unsigned* nof_large_buckets;
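Concretely, the threshold above is

$$\text{avg} = \frac{\text{single\_msm\_size}}{2^{c}}\cdot\text{precompute\_factor},\qquad \text{bucket\_th} = \text{large\_bucket\_factor}\cdot\text{avg};$$

for example, with $2^{20}$ points, $c=16$ and a precompute factor of 1, the average bucket holds 16 points, and with the factor of 10 used in the tests below anything above 160 points is treated as a large bucket.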
@@ -722,7 +735,8 @@ namespace msm {
    CHK_IF_RETURN(cudaMallocAsync(&d_allocated_final_result, sizeof(P) * batch_size, stream));

    // --- Reduction of buckets happens here, after this we'll get a single sum for each bucket module/window ---
    unsigned nof_empty_bms_per_batch = 0; // for non-triangle accumulation this may be >0
    unsigned nof_final_results_per_msm =
      nof_bms_per_msm; // for big-triangle accumulation this is the number of bucket modules
    P* final_results;
    if (is_big_triangle || c == 1) {
      CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
@@ -731,62 +745,90 @@ namespace msm {
      NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
      big_triangle_sum_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(buckets, final_results, nof_bms_in_batch, c);
    } else {
      // the recursive reduction algorithm works with 2 types of reduction that can run on parallel streams
      cudaStream_t stream_reduction;
      cudaEvent_t event_finished_reduction;
      CHK_IF_RETURN(cudaStreamCreate(&stream_reduction));
      CHK_IF_RETURN(cudaEventCreateWithFlags(&event_finished_reduction, cudaEventDisableTiming));

      unsigned source_bits_count = c;
      unsigned source_windows_count = nof_bms_per_msm;
      unsigned source_buckets_count = nof_buckets + nof_bms_per_msm;
      unsigned target_windows_count = 0;
      unsigned source_buckets_count = nof_buckets + nof_bms_per_msm; // nof buckets per msm including zero buckets
      unsigned target_windows_count;
      P* source_buckets = buckets;
      buckets = nullptr;
      P* target_buckets;
      P* temp_buckets1;
      P* temp_buckets2;
      for (unsigned i = 0;; i++) {
        const unsigned target_bits_count = (source_bits_count + 1) >> 1; // c/2=8
        target_windows_count = source_windows_count << 1;                // nof bms*2 = 32
        const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8
        const unsigned target_bits_count = (source_bits_count + 1) >> 1; // half the bits rounded up
        target_windows_count = source_windows_count << 1;                // twice the number of bms
        const unsigned target_buckets_count = target_windows_count << target_bits_count; // new_bms*2^new_c
        CHK_IF_RETURN(cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count * batch_size, stream));
        CHK_IF_RETURN(cudaMallocAsync(
          &target_buckets, sizeof(P) * target_buckets_count * batch_size, stream)); // 32*2^8*2^7 buckets
          &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size,
          stream)); // for type1 reduction (strided, bottom window - evens)
        CHK_IF_RETURN(cudaMallocAsync(
          &temp_buckets1, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
        CHK_IF_RETURN(cudaMallocAsync(
          &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size, stream)); // 32*2^8*2^7 buckets
          &temp_buckets2, sizeof(P) * source_buckets_count / 2 * batch_size,
          stream)); // for type2 reduction (serial, top window - odds)
        initialize_buckets_kernel<<<(target_buckets_count * batch_size + 255) / 256, 256>>>(
          target_buckets, target_buckets_count * batch_size); // initialization is needed for the odd c case

        if (source_bits_count > 0) {
          for (unsigned j = 0; j < target_bits_count; j++) {
            const bool is_first_iter = (j == 0);
            const bool is_last_iter = (j == target_bits_count - 1);
            unsigned nof_threads =
              (((target_buckets_count - target_windows_count) >> 1) << (target_bits_count - 1 - j)) * batch_size;
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
          for (unsigned j = 0; j < target_bits_count; j++) {
            const bool is_first_iter = (j == 0);
            const bool is_second_iter = (j == 1);
            const bool is_last_iter = (j == target_bits_count - 1);
            const bool is_odd_c = source_bits_count & 1;
            unsigned nof_threads =
              (((source_windows_count << target_bits_count) - source_windows_count) << (target_bits_count - 1 - j)) *
              batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by nof threads
                          // per section
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
            if (!is_odd_c || !is_first_iter) { // skip if c is odd and it's the first iteration
              single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
                is_first_iter ? source_buckets : temp_buckets1, is_last_iter ? target_buckets : temp_buckets1,
                1 << (source_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 0 /*=write_phase*/,
                (1 << target_bits_count) - 1, nof_threads);

              single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
                is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
                1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0, 1 /*=write_phase*/,
                is_first_iter || (is_second_iter && is_odd_c) ? source_buckets : temp_buckets1,
                is_last_iter ? target_buckets : temp_buckets1, 1 << (source_bits_count - j + (is_odd_c ? 1 : 0)),
                is_last_iter ? 1 << target_bits_count : 0, 1 << target_bits_count, 0 /*=write_phase*/,
                (1 << target_bits_count) - 1, nof_threads);
            }

            nof_threads = (((source_windows_count << (source_bits_count - target_bits_count)) - source_windows_count)
                           << (target_bits_count - 1 - j)) *
                          batch_size; // nof sections to reduce (minus the section that goes to zero buckets) shifted by
                                      // nof threads per section
            NUM_THREADS = max(1, min(MAX_TH, nof_threads));
            NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS;
            single_stage_multi_reduction_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream_reduction>>>(
              is_first_iter ? source_buckets : temp_buckets2, is_last_iter ? target_buckets : temp_buckets2,
              1 << (target_bits_count - j), is_last_iter ? 1 << target_bits_count : 0,
              1 << (target_bits_count - (is_odd_c ? 1 : 0)), 1 /*=write_phase*/,
              (1 << (target_bits_count - (is_odd_c ? 1 : 0))) - 1, nof_threads);
          }
          CHK_IF_RETURN(cudaEventRecord(event_finished_reduction, stream_reduction));
          CHK_IF_RETURN(
            cudaStreamWaitEvent(stream, event_finished_reduction)); // sync streams after every write to target_buckets
        if (target_bits_count == 1) {
          // Note: the reduction ends up with 'target_windows_count' windows per batch element. Some are guaranteed
          // to be empty when target_windows_count>bitsize. For example, consider bitsize=253 and c=2. The reduction
          // ends with 254 bms but the most significant one is guaranteed to be zero since the scalars are 253b.
          // precomputation and odd c can cause additional empty windows.
          nof_final_results_per_msm = min(c * nof_bms_per_msm, bitsize);
          nof_bms_per_msm = target_windows_count;
          nof_empty_bms_per_batch = target_windows_count > bitsize ? target_windows_count - bitsize : 0;
          nof_bms_in_batch = nof_bms_per_msm * batch_size;
          unsigned total_nof_final_results = nof_final_results_per_msm * batch_size;

          CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * total_nof_final_results, stream));

          CHK_IF_RETURN(cudaMallocAsync(&final_results, sizeof(P) * nof_bms_in_batch, stream));
          NUM_THREADS = 32;
          NUM_BLOCKS = (nof_bms_in_batch + NUM_THREADS - 1) / NUM_THREADS;
          last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(target_buckets, final_results, nof_bms_in_batch);
          NUM_BLOCKS = (total_nof_final_results + NUM_THREADS - 1) / NUM_THREADS;
          last_pass_kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
            target_buckets, final_results, nof_final_results_per_msm, batch_size, nof_bms_per_msm, c);
          c = 1;
          CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
          CHK_IF_RETURN(cudaFreeAsync(target_buckets, stream));
          CHK_IF_RETURN(cudaFreeAsync(temp_buckets1, stream));
          CHK_IF_RETURN(cudaFreeAsync(temp_buckets2, stream));
          CHK_IF_RETURN(cudaStreamDestroy(stream_reduction));
          break;
        }
        CHK_IF_RETURN(cudaFreeAsync(source_buckets, stream));
@@ -808,8 +850,8 @@ namespace msm {
    NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS;
    // launch the double and add kernel, a single thread per batch element
    final_accumulation_kernel<P, S><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
      final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size, nof_bms_per_msm,
      nof_empty_bms_per_batch, c);
      final_results, are_results_on_device ? final_result : d_allocated_final_result, batch_size,
      nof_final_results_per_msm, c);
    CHK_IF_RETURN(cudaFreeAsync(final_results, stream));

    if (!are_results_on_device)
@@ -834,13 +876,7 @@ namespace msm {
    const int bitsize = (config.bitsize == 0) ? S::NBITS : config.bitsize;
    cudaStream_t& stream = config.ctx.stream;

    unsigned c = config.batch_size > 1 ? ((config.c == 0) ? get_optimal_c(msm_size) : config.c) : 16;
    // reduce c to closest power of two (from below) if not using big_triangle reduction logic
    // TODO: support arbitrary values of c
    if (!config.is_big_triangle) {
      while ((c & (c - 1)) != 0)
        c &= (c - 1);
    }
    unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;

    return CHK_STICKY(bucket_method_msm(
      bitsize, c, scalars, points, config.batch_size, msm_size,
@@ -851,7 +887,33 @@ namespace msm {
  }

  template <typename A, typename P>
  cudaError_t precompute_msm_bases(
  cudaError_t precompute_msm_points(A* points, int msm_size, MSMConfig& config, A* output_points)
  {
    CHK_INIT_IF_RETURN();

    cudaStream_t& stream = config.ctx.stream;
    unsigned c = (config.c == 0) ? get_optimal_c(msm_size) : config.c;

    CHK_IF_RETURN(cudaMemcpyAsync(
      output_points, points, sizeof(A) * config.points_size,
      config.are_points_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, stream));

    unsigned total_nof_bms = (P::SCALAR_FF_NBITS - 1) / c + 1;
    unsigned shift = c * ((total_nof_bms - 1) / config.precompute_factor + 1);

    unsigned NUM_THREADS = 1 << 8;
    unsigned NUM_BLOCKS = (config.points_size + NUM_THREADS - 1) / NUM_THREADS;
    for (int i = 1; i < config.precompute_factor; i++) {
      left_shift_kernel<A, P><<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(
        &output_points[(i - 1) * config.points_size], shift, config.points_size,
        &output_points[i * config.points_size]);
    }

    return CHK_LAST();
  }

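The shift computed above is $\text{shift} = c\cdot\lceil \text{total\_nof\_bms}/\text{precompute\_factor}\rceil$, so copy $i$ of the table holds the repeatedly-doubled points

$$P^{(i)}_j \;=\; 2^{\,i\cdot\text{shift}}\, P_j .$$

In outline (a sketch of the idea, not the file's wording): splitting each scalar into $c$-bit windows, window $k$ contributes $d_k \cdot 2^{ck} P_j$; writing $k = i\cdot m + r$ with $m = \lceil \text{total\_nof\_bms}/\text{precompute\_factor}\rceil$ turns this into $d_k \cdot 2^{cr} P^{(i)}_j$, so only $m$ live windows remain per MSM instead of total_nof_bms, at the cost of a precompute_factor-times larger point table.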
  template <typename A, typename P>
  [[deprecated("Use precompute_msm_points instead.")]] cudaError_t precompute_msm_bases(
    A* bases,
    int bases_size,
    int precompute_factor,

@@ -1,3 +1,9 @@
#include "fields/id.h"
// #define FIELD_ID 2
#define CURVE_ID 3
#include "curves/curve_config.cuh"
// #include "fields/field_config.cuh"

#include "msm.cu"

#include <chrono>
@@ -9,7 +15,7 @@
#include "curves/projective.cuh"
#include "gpu-utils/device_context.cuh"

using namespace bn254;
// using namespace bn254;

class Dummy_Scalar
{
@@ -111,20 +117,34 @@ public:

// switch between dummy and real:

typedef scalar_t test_scalar;
typedef projective_t test_projective;
typedef affine_t test_affine;
// typedef scalar_t test_scalar;
// typedef projective_t test_projective;
// typedef affine_t test_affine;

typedef curve_config::scalar_t test_scalar;
typedef curve_config::projective_t test_projective;
typedef curve_config::affine_t test_affine;

// typedef Dummy_Scalar test_scalar;
// typedef Dummy_Projective test_projective;
// typedef Dummy_Projective test_affine;

int main()
int main(int argc, char** argv)
{
  int batch_size = 1;
  cudaEvent_t start, stop;
  float msm_time;

  int msm_log_size = (argc > 1) ? atoi(argv[1]) : 17;
  int msm_size = 1 << msm_log_size;
  int batch_size = (argc > 2) ? atoi(argv[2]) : 4;
  // unsigned msm_size = 1<<21;
  int msm_size = 12180757;
  int N = batch_size * msm_size;
  int precomp_factor = (argc > 3) ? atoi(argv[3]) : 1;
  int user_c = (argc > 4) ? atoi(argv[4]) : 15;

  printf(
    "running msm curve=%d, 2^%d, batch_size=%d, precomp_factor=%d, c=%d\n", CURVE_ID, msm_log_size, batch_size,
    precomp_factor, user_c);

  test_scalar* scalars = new test_scalar[N];
  test_affine* points = new test_affine[N];
@@ -136,7 +156,8 @@ int main()

  // projective_t *short_res = (projective_t*)malloc(sizeof(projective_t));
  // test_projective *large_res = (test_projective*)malloc(sizeof(test_projective));
  test_projective large_res[2];
  test_projective res[batch_size];
  test_projective ref[batch_size];
  // test_projective batched_large_res[batch_size];
  // fake_point *large_res = (fake_point*)malloc(sizeof(fake_point));
  // fake_point batched_large_res[256];
@@ -149,13 +170,17 @@ int main()

  test_scalar* scalars_d;
  test_affine* points_d;
  test_projective* large_res_d;
  test_affine* precomp_points_d;
  test_projective* res_d;
  test_projective* ref_d;

  cudaMalloc(&scalars_d, sizeof(test_scalar) * msm_size);
  cudaMalloc(&points_d, sizeof(test_affine) * msm_size);
  cudaMalloc(&large_res_d, sizeof(test_projective));
  cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * msm_size, cudaMemcpyHostToDevice);
  cudaMemcpy(points_d, points, sizeof(test_affine) * msm_size, cudaMemcpyHostToDevice);
  cudaMalloc(&scalars_d, sizeof(test_scalar) * N);
  cudaMalloc(&points_d, sizeof(test_affine) * N);
  cudaMalloc(&precomp_points_d, sizeof(test_affine) * N * precomp_factor);
  cudaMalloc(&res_d, sizeof(test_projective) * batch_size);
  cudaMalloc(&ref_d, sizeof(test_projective) * batch_size);
  cudaMemcpy(scalars_d, scalars, sizeof(test_scalar) * N, cudaMemcpyHostToDevice);
  cudaMemcpy(points_d, points, sizeof(test_affine) * N, cudaMemcpyHostToDevice);

  std::cout << "finished copying" << std::endl;

@@ -170,65 +195,93 @@ int main()
    0, // mempool
  };
  msm::MSMConfig config = {
    ctx,   // DeviceContext
    0,     // points_size
    1,     // precompute_factor
    0,     // c
    0,     // bitsize
    10,    // large_bucket_factor
    1,     // batch_size
    false, // are_scalars_on_device
    false, // are_scalars_montgomery_form
    false, // are_points_on_device
    false, // are_points_montgomery_form
    true,  // are_results_on_device
    false, // is_big_triangle
    true,  // is_async
    ctx,            // DeviceContext
    N,              // points_size
    precomp_factor, // precompute_factor
    user_c,         // c
    0,              // bitsize
    10,             // large_bucket_factor
    batch_size,     // batch_size
    false,          // are_scalars_on_device
    false,          // are_scalars_montgomery_form
    true,           // are_points_on_device
    false,          // are_points_montgomery_form
    true,           // are_results_on_device
    false,          // is_big_triangle
    true,           // is_async
    // false,       // segments_reduction
  };

  auto begin1 = std::chrono::high_resolution_clock::now();
  msm::msm<test_scalar, test_affine, test_projective>(scalars, points, msm_size, config, large_res_d);
  cudaEvent_t msm_end_event;
  cudaEventCreate(&msm_end_event);
  auto end1 = std::chrono::high_resolution_clock::now();
  auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("No Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9);
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  if (precomp_factor > 1)
    msm::precompute_msm_points<test_affine, test_projective>(points_d, msm_size, config, precomp_points_d);

  // warm up
  msm::msm<test_scalar, test_affine, test_projective>(
    scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
  cudaDeviceSynchronize();

  // auto begin1 = std::chrono::high_resolution_clock::now();
  cudaEventRecord(start, stream);
  msm::msm<test_scalar, test_affine, test_projective>(
    scalars, precomp_factor > 1 ? precomp_points_d : points_d, msm_size, config, res_d);
  cudaEventRecord(stop, stream);
  cudaStreamSynchronize(stream);
  cudaEventElapsedTime(&msm_time, start, stop);
  // cudaEvent_t msm_end_event;
  // cudaEventCreate(&msm_end_event);
  // auto end1 = std::chrono::high_resolution_clock::now();
  // auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
  printf("msm time : %.3f ms.\n", msm_time);

  // reference
  config.c = 16;
  config.precompute_factor = 1;
  config.is_big_triangle = true;
  config.are_results_on_device = false;
  cudaMemcpy(&large_res[1], large_res_d, sizeof(test_projective), cudaMemcpyDeviceToHost);
  std::cout << test_projective::to_affine(large_res[1]) << " " << test_projective::is_on_curve(large_res[1])
            << std::endl;
  auto begin = std::chrono::high_resolution_clock::now();
  msm::msm<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
  config.batch_size = 1;
  config.points_size = msm_size;
  // config.segments_reduction = false;
  for (int i = 0; i < batch_size; i++) {
    msm::msm<test_scalar, test_affine, test_projective>(
      scalars + i * msm_size, points_d + i * msm_size, msm_size, config, ref_d + i);
  }

  // config.are_results_on_device = false;
  // std::cout << test_projective::to_affine(large_res[0]) << std::endl;
  // auto begin = std::chrono::high_resolution_clock::now();
  // msm::MSM<test_scalar, test_affine, test_projective>(scalars_d, points_d, msm_size, config, large_res);
  // test_reduce_triangle(scalars);
  // test_reduce_rectangle(scalars);
  // test_reduce_single(scalars);
  // test_reduce_var(scalars);
  auto end = std::chrono::high_resolution_clock::now();
  auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
  // auto end = std::chrono::high_resolution_clock::now();
  // auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  // printf("Big Triangle: %.3f seconds.\n", elapsed.count() * 1e-9);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);

  // std::cout << test_projective::to_affine(large_res[0]) << std::endl;

  cudaMemcpy(res, res_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);
  cudaMemcpy(ref, ref_d, sizeof(test_projective) * batch_size, cudaMemcpyDeviceToHost);

  // reference_msm<test_affine, test_scalar, test_projective>(scalars, points, msm_size);

  // std::cout<<"final results batched large"<<std::endl;
  // bool success = true;
  // for (unsigned i = 0; i < batch_size; i++)
  // {
  //   std::cout<<test_projective::to_affine(batched_large_res[i])<<std::endl;
  //   if (test_projective::to_affine(large_res[i])==test_projective::to_affine(batched_large_res[i])){
  //     std::cout<<"good"<<std::endl;
  //   }
  //   else{
  //     std::cout<<"miss"<<std::endl;
  //     std::cout<<test_projective::to_affine(large_res[i])<<std::endl;
  //     success = false;
  //   }
  // }
  // if (success){
  //   std::cout<<"success!"<<std::endl;
  // }
  bool success = true;
  for (unsigned i = 0; i < batch_size; i++) {
    std::cout << test_projective::to_affine(res[i]) << std::endl;
    if (test_projective::to_affine(res[i]) == test_projective::to_affine(ref[i])) {
      std::cout << "good" << std::endl;
    } else {
      std::cout << "miss" << std::endl;
      std::cout << test_projective::to_affine(ref[i]) << std::endl;
      success = false;
    }
  }
  if (success) { std::cout << "success!" << std::endl; }

  // std::cout<<batched_large_res[0]<<std::endl;
  // std::cout<<batched_large_res[1]<<std::endl;
@@ -239,4 +292,4 @@ int main()

  // std::cout<<pr<<std::endl;

  return 0;
}
}
@@ -73,14 +73,14 @@ namespace mxntt {
    // if its index is the smallest number in the group -> do the memory transformation
    // else --> do nothing

    const uint32_t size = 1 << log_size;
    const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    const uint32_t idx = columns_batch ? tid / batch_size : tid % size;
    const uint32_t batch_idx = columns_batch ? tid % batch_size : tid / size;
    if (tid >= size * batch_size) return;
    const uint64_t size = 1UL << log_size;
    const uint64_t tid = uint64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    const uint64_t idx = columns_batch ? tid / batch_size : tid % size;
    const uint64_t batch_idx = columns_batch ? tid % batch_size : tid / size;
    if (tid >= uint64_t(size) * batch_size) return;

    uint32_t next_element = idx;
    uint32_t group[MAX_GROUP_SIZE];
    uint64_t next_element = idx;
    uint64_t group[MAX_GROUP_SIZE];
    group[0] = columns_batch ? next_element * batch_size + batch_idx : next_element + size * batch_idx;

    uint32_t i = 1;
@@ -114,11 +114,13 @@ namespace mxntt {
    bool is_normalize,
    S inverse_N)
  {
    uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= (1 << log_size) * batch_size) return;
    uint32_t rd = tid;
    uint32_t wr = (columns_batch ? 0 : ((tid >> log_size) << log_size)) +
                  generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, dit, fast_tw, rev_type);
    const uint64_t size = 1UL << log_size;
    const uint64_t tid = uint64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    if (tid >= uint64_t(size) * batch_size) return;

    uint64_t rd = tid;
    uint64_t wr = (columns_batch ? 0 : ((tid >> log_size) << log_size)) +
                  generalized_rev((tid / columns_batch_size) & (size - 1), log_size, dit, fast_tw, rev_type);
    arr_reordered[wr * columns_batch_size + (tid % columns_batch_size)] = is_normalize ? arr[rd] * inverse_N : arr[rd];
  }

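The uint32_t-to-uint64_t changes in these kernels guard against index overflow once size * batch_size approaches 2^32: the sub-expression blockDim.x * blockIdx.x is evaluated in 32 bits unless one operand is widened first. A standalone illustration (not repo code), using the 64-thread block size these launches actually use:

#include <cstdint>
#include <cstdio>

int main()
{
  // Values a large NTT launch could reach: 2^26 blocks of 64 threads = 2^32 elements.
  uint32_t block_dim = 1u << 6, block_idx = 1u << 26;

  uint32_t tid32 = block_dim * block_idx;           // 32-bit product wraps: 2^32 mod 2^32 == 0
  uint64_t tid64 = uint64_t(block_dim) * block_idx; // widen one operand first, as the patch does

  printf("tid32 = %u, tid64 = %llu\n", tid32, (unsigned long long)tid64);
  return 0;
}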
@@ -131,14 +133,14 @@ namespace mxntt {
    uint32_t columns_batch_size,
    S* scalar_vec,
    int step,
    int n_scalars,
    uint32_t n_scalars,
    uint32_t log_size,
    eRevType rev_type,
    bool fast_tw,
    E* out_vec)
  {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= size * batch_size) return;
    uint64_t tid = uint64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    if (tid >= uint64_t(size) * batch_size) return;
    int64_t scalar_id = (tid / columns_batch_size) % size;
    if (rev_type != eRevType::None) {
      // Note: when we multiply an in_vec that is mixed (by DIF (I)NTT), we want to shuffle the
@@ -148,8 +150,7 @@ namespace mxntt {
      // Therefore we use the DIF-digit-reverse to know which element moved to index tid and use it to access the
      // corresponding element in scalars vec.
      const bool dif = rev_type == eRevType::NaturalToMixedRev;
      scalar_id =
        generalized_rev((tid / columns_batch_size) & ((1 << log_size) - 1), log_size, !dif, fast_tw, rev_type);
      scalar_id = generalized_rev((tid / columns_batch_size) & (size - 1), log_size, !dif, fast_tw, rev_type);
    }
    out_vec[tid] = *(scalar_vec + ((scalar_id * step) % n_scalars)) * in_vec[tid];
  }
@@ -523,9 +524,9 @@ namespace mxntt {
  }

  template <typename E, typename S>
  __global__ void normalize_kernel(E* data, S norm_factor, uint32_t size)
  __global__ void normalize_kernel(E* data, S norm_factor, uint64_t size)
  {
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid >= size) return;
    data[tid] = data[tid] * norm_factor;
  }
@@ -786,7 +787,7 @@ namespace mxntt {
        columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
    }
    if (normalize)
      normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4), (1 << log_size) * batch_size);
      normalize_kernel<<<batch_size, 16, 0, cuda_stream>>>(out, S::inv_log_size(4), (1UL << log_size) * batch_size);
    return CHK_LAST();
  }

@@ -804,7 +805,7 @@ namespace mxntt {
        columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
    }
    if (normalize)
      normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5), (1 << log_size) * batch_size);
      normalize_kernel<<<batch_size, 32, 0, cuda_stream>>>(out, S::inv_log_size(5), (1UL << log_size) * batch_size);
    return CHK_LAST();
  }

@@ -816,7 +817,7 @@ namespace mxntt {
      in, out, external_twiddles, internal_twiddles, basic_twiddles, log_size, tw_log_size,
      columns_batch ? batch_size : 0, columns_batch ? 1 : batch_size, 1, 0, 0, columns_batch, 0, inv, dit, fast_tw);
    if (normalize)
      normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6), (1 << log_size) * batch_size);
      normalize_kernel<<<batch_size, 64, 0, cuda_stream>>>(out, S::inv_log_size(6), (1UL << log_size) * batch_size);
    return CHK_LAST();
  }

@@ -844,12 +845,12 @@ namespace mxntt {
        columns_batch, 0, inv, dit, fast_tw);
    }
    if (normalize)
      normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8), (1 << log_size) * batch_size);
      normalize_kernel<<<batch_size, 256, 0, cuda_stream>>>(out, S::inv_log_size(8), (1UL << log_size) * batch_size);
    return CHK_LAST();
  }

  // general case:
  uint32_t nof_blocks = (1 << (log_size - 9)) * (columns_batch ? ((batch_size + 31) / 32) * 32 : batch_size);
  uint32_t nof_blocks = (1UL << (log_size - 9)) * (columns_batch ? ((batch_size + 31) / 32) * 32 : batch_size);
  if (dit) {
    for (int i = 0; i < 5; i++) {
      uint32_t stage_size = fast_tw ? STAGE_SIZES_HOST_FT[log_size][i] : STAGE_SIZES_HOST[log_size][i];
@@ -900,7 +901,7 @@ namespace mxntt {
  }
  if (normalize)
    normalize_kernel<<<(1 << (log_size - 8)) * batch_size, 256, 0, cuda_stream>>>(
      out, S::inv_log_size(log_size), (1 << log_size) * batch_size);
      out, S::inv_log_size(log_size), (1UL << log_size) * batch_size);

  return CHK_LAST();
}
@@ -926,13 +927,19 @@ namespace mxntt {
  {
    CHK_INIT_IF_RETURN();

    const int logn = int(log2(ntt_size));
    const int NOF_BLOCKS = ((1 << logn) * batch_size + 64 - 1) / 64;
    const int NOF_THREADS = min(64, (1 << logn) * batch_size);
    const uint64_t total_nof_elements = uint64_t(ntt_size) * batch_size;
    const uint64_t logn = uint64_t(log2(ntt_size));
    const uint64_t NOF_BLOCKS_64b = (total_nof_elements + 64 - 1) / 64;
    const uint32_t NOF_THREADS = total_nof_elements < 64 ? total_nof_elements : 64;
    // CUDA grid dimensions are 32-bit fields; assert that a larger grid is not needed.
    const uint32_t NOF_BLOCKS = NOF_BLOCKS_64b;
    if (NOF_BLOCKS != NOF_BLOCKS_64b) {
      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "NTT dimensions (ntt_size, batch) are too large. Unsupported!");
    }

    bool is_normalize = is_inverse;
    const bool is_on_coset = (coset_gen_index != 0) || arbitrary_coset;
    const int n_twiddles = 1 << max_logn;
    const uint32_t n_twiddles = 1U << max_logn;
    // Note: for evaluation on coset, need to reorder the coset too to match the data for element-wise multiplication
    eRevType reverse_input = None, reverse_output = None, reverse_coset = None;
    bool dit = false;

@@ -196,78 +196,83 @@ public:
  DEVICE_INLINE void
  loadGlobalData(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
  {
    const uint64_t data_stride_u64 = data_stride;
    if (strided) {
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
    } else {
      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
      data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
    }

    UNROLL
    for (uint32_t i = 0; i < 8; i++) {
      X[i] = data[s_meta.th_stride * i * data_stride];
      X[i] = data[s_meta.th_stride * i * data_stride_u64];
    }
  }

  DEVICE_INLINE void loadGlobalDataColumnBatch(
    const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
  {
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
    const uint64_t data_stride_u64 = data_stride;
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
              batch_size +
            s_meta.batch_id;

    UNROLL
    for (uint32_t i = 0; i < 8; i++) {
      X[i] = data[s_meta.th_stride * i * data_stride * batch_size];
      X[i] = data[s_meta.th_stride * i * data_stride_u64 * batch_size];
    }
  }

  DEVICE_INLINE void
  storeGlobalData(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
  {
    const uint64_t data_stride_u64 = data_stride;
    if (strided) {
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
    } else {
      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
      data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id;
    }

    UNROLL
    for (uint32_t i = 0; i < 8; i++) {
      data[s_meta.th_stride * i * data_stride] = X[i];
      data[s_meta.th_stride * i * data_stride_u64] = X[i];
    }
  }

  DEVICE_INLINE void storeGlobalDataColumnBatch(
    E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
  {
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
    const uint64_t data_stride_u64 = data_stride;
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
              batch_size +
            s_meta.batch_id;

    UNROLL
    for (uint32_t i = 0; i < 8; i++) {
      data[s_meta.th_stride * i * data_stride * batch_size] = X[i];
      data[s_meta.th_stride * i * data_stride_u64 * batch_size] = X[i];
    }
  }

  DEVICE_INLINE void
  loadGlobalData32(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
  {
    const uint64_t data_stride_u64 = data_stride;
    if (strided) {
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
      data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 2 +
              (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
    } else {
      data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
      data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
    }

    UNROLL
    for (uint32_t j = 0; j < 2; j++) {
      UNROLL
      for (uint32_t i = 0; i < 4; i++) {
        X[4 * j + i] = data[(8 * i + j) * data_stride];
        X[4 * j + i] = data[(8 * i + j) * data_stride_u64];
      }
    }
  }
@@ -275,8 +280,9 @@ public:
  DEVICE_INLINE void loadGlobalData32ColumnBatch(
    const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
  {
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
    const uint64_t data_stride_u64 = data_stride;
    data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 2 +
             (s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
|
||||
batch_size +
|
||||
s_meta.batch_id;
|
||||
|
||||
@@ -284,7 +290,7 @@ public:
|
||||
for (uint32_t j = 0; j < 2; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
X[4 * j + i] = data[(8 * i + j) * data_stride * batch_size];
|
||||
X[4 * j + i] = data[(8 * i + j) * data_stride_u64 * batch_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -292,18 +298,19 @@ public:
|
||||
DEVICE_INLINE void
|
||||
storeGlobalData32(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
|
||||
{
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
if (strided) {
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 2 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
|
||||
} else {
|
||||
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
|
||||
data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 2;
|
||||
}
|
||||
|
||||
UNROLL
|
||||
for (uint32_t j = 0; j < 2; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
data[(8 * i + j) * data_stride] = X[4 * j + i];
|
||||
data[(8 * i + j) * data_stride_u64] = X[4 * j + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -311,8 +318,9 @@ public:
|
||||
DEVICE_INLINE void storeGlobalData32ColumnBatch(
|
||||
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
|
||||
{
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 2 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 2 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
|
||||
batch_size +
|
||||
s_meta.batch_id;
|
||||
|
||||
@@ -320,7 +328,7 @@ public:
|
||||
for (uint32_t j = 0; j < 2; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 4; i++) {
|
||||
data[(8 * i + j) * data_stride * batch_size] = X[4 * j + i];
|
||||
data[(8 * i + j) * data_stride_u64 * batch_size] = X[4 * j + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -328,18 +336,19 @@ public:
|
||||
DEVICE_INLINE void
|
||||
loadGlobalData16(const E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
|
||||
{
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
if (strided) {
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
|
||||
} else {
|
||||
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
|
||||
data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
|
||||
}
|
||||
|
||||
UNROLL
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
X[2 * j + i] = data[(8 * i + j) * data_stride];
|
||||
X[2 * j + i] = data[(8 * i + j) * data_stride_u64];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -347,8 +356,9 @@ public:
|
||||
DEVICE_INLINE void loadGlobalData16ColumnBatch(
|
||||
const E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
|
||||
{
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
|
||||
batch_size +
|
||||
s_meta.batch_id;
|
||||
|
||||
@@ -356,7 +366,7 @@ public:
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
X[2 * j + i] = data[(8 * i + j) * data_stride * batch_size];
|
||||
X[2 * j + i] = data[(8 * i + j) * data_stride_u64 * batch_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -364,18 +374,19 @@ public:
|
||||
DEVICE_INLINE void
|
||||
storeGlobalData16(E* data, uint32_t data_stride, uint32_t log_data_stride, bool strided, stage_metadata s_meta)
|
||||
{
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
if (strided) {
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size;
|
||||
data += (s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size;
|
||||
} else {
|
||||
data += s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
|
||||
data += (uint64_t)s_meta.ntt_block_id * s_meta.ntt_block_size + s_meta.ntt_inp_id * 4;
|
||||
}
|
||||
|
||||
UNROLL
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
data[(8 * i + j) * data_stride] = X[2 * j + i];
|
||||
data[(8 * i + j) * data_stride_u64] = X[2 * j + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -383,8 +394,9 @@ public:
|
||||
DEVICE_INLINE void storeGlobalData16ColumnBatch(
|
||||
E* data, uint32_t data_stride, uint32_t log_data_stride, stage_metadata s_meta, uint32_t batch_size)
|
||||
{
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride * s_meta.ntt_block_size) *
|
||||
const uint64_t data_stride_u64 = data_stride;
|
||||
data += ((s_meta.ntt_block_id & (data_stride - 1)) + data_stride_u64 * s_meta.ntt_inp_id * 4 +
|
||||
(s_meta.ntt_block_id >> log_data_stride) * data_stride_u64 * s_meta.ntt_block_size) *
|
||||
batch_size +
|
||||
s_meta.batch_id;
|
||||
|
||||
@@ -392,7 +404,7 @@ public:
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
UNROLL
|
||||
for (uint32_t i = 0; i < 2; i++) {
|
||||
data[(8 * i + j) * data_stride * batch_size] = X[2 * j + i];
|
||||
data[(8 * i + j) * data_stride_u64 * batch_size] = X[2 * j + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,18 +38,6 @@ namespace polynomials {
|
||||
}
|
||||
|
||||
/*============================== evaluate ==============================*/
|
||||
template <typename T>
|
||||
__device__ T pow(T base, int exp)
|
||||
{
|
||||
T result = T::one();
|
||||
while (exp > 0) {
|
||||
if (exp & 1) result = result * base;
|
||||
base = base * base;
|
||||
exp >>= 1;
|
||||
}
|
||||
return result;
|
||||
}
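// [Editor's note] The pow above (removed here in favor of T::pow) is standard
// square-and-multiply. Worked trace for exp = 5 (binary 101): step 1 multiplies
// result by base (low bit set) and squares base; step 2 only squares base
// (bit clear); step 3 multiplies by base^4 (high bit set), giving
// base^(1+4) = base^5. The cost is O(log exp) field multiplications rather
// than O(exp).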
|
||||
|
||||
// TODO Yuval: implement efficient reduction and support batch evaluation
|
||||
template <typename T>
|
||||
__global__ void dummy_reduce(const T* arr, int size, T* output)
|
||||
@@ -67,7 +55,7 @@ namespace polynomials {
|
||||
__global__ void evaluate_polynomial_without_reduction(const T* x, const T* coeffs, int num_coeffs, T* tmp)
|
||||
{
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num_coeffs) { tmp[tid] = coeffs[tid] * pow(*x, tid); }
|
||||
if (tid < num_coeffs) { tmp[tid] = coeffs[tid] * T::pow(*x, tid); }
|
||||
}
|
||||
|
||||
/*============================== division ==============================*/
|
||||
|
||||
@@ -201,20 +201,6 @@ namespace polynomials {
|
||||
return {std::move(integrity_pointer), N, m_device_context.device_id};
|
||||
}
|
||||
|
||||
std::tuple<IntegrityPointer<I>, uint64_t, uint64_t>
|
||||
get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
|
||||
{
|
||||
if (nof_evaluations != 0 && nof_evaluations < get_nof_elements()) {
|
||||
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "get_rou_evaluations_view() can only expand #evals");
|
||||
}
|
||||
transform_to_evaluations(nof_evaluations, is_reversed);
|
||||
auto [evals, N] = get_rou_evaluations();
|
||||
// when reading the pointer, if the counter was modified, the pointer is invalid
|
||||
IntegrityPointer<I> integrity_pointer(evals, m_integrity_counter, *m_integrity_counter);
|
||||
CHK_STICKY(cudaStreamSynchronize(m_device_context.stream));
|
||||
return {std::move(integrity_pointer), N, m_device_context.device_id};
|
||||
}
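// [Editor's sketch] The counter-based invalidation used above, reduced to its
// core. This is a hypothetical minimal form, not the actual IntegrityPointer:
//
//   template <typename T>
//   struct VersionedView {
//     const T* ptr;
//     std::shared_ptr<uint64_t> live_counter; // bumped on every mutation
//     uint64_t captured;                      // counter value at creation
//     const T* get() const { return (*live_counter == captured) ? ptr : nullptr; }
//   };
//
// Any mutating operation on the polynomial increments the shared counter, so a
// stale view returns nullptr instead of a dangling device pointer.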
|
||||
|
||||
std::pair<const I*, uint64_t> get_rou_evaluations() override
|
||||
{
|
||||
const bool is_reversed = this->m_state == State::EvaluationsOnRou_Reversed;
|
||||
@@ -598,10 +584,9 @@ namespace polynomials {
|
||||
|
||||
const int64_t deg_a = degree(a);
|
||||
const int64_t deg_b = degree(b);
|
||||
if (deg_a < deg_b || deg_b < 0) {
|
||||
if (deg_b < 0) {
|
||||
THROW_ICICLE_ERR(
|
||||
IcicleError_t::InvalidArgument, "Polynomial division (CUDA backend): numerator degree must be "
|
||||
"greater-or-equal to denumerator degree and denumerator must not be zero");
|
||||
IcicleError_t::InvalidArgument, "Polynomial division (CUDA backend): divide by zeropolynomial ");
|
||||
}
|
||||
|
||||
// init: Q=0, R=a
|
||||
@@ -649,10 +634,45 @@ namespace polynomials {
|
||||
{
|
||||
assert_device_compatability(numerator, out);
|
||||
|
||||
// TODO Yuval: vanishing polynomial x^n-1 evaluates to zero on ROU
|
||||
// Therefore constant on coset with u as coset generator ((wu)^n-1 = w^n*u^n-1 = u^n-1)
|
||||
// This is true for a coset of size n but if numerator is of size >n, then I need a larger coset and it
|
||||
// doesn't hold. Need to use this fact to optimize division
|
||||
// The vanishing polynomial of degree N is the polynomial V(x) such that V(r)=0 for every r that is an Nth root of unity.
|
||||
// For example for N=4 it vanishes on the group [1,w,w^2,w^3] where w is the 4th root of unity. In that
|
||||
// case V(x)=(x-1)(x-w)(x-w^2)(x-w^3). It can be easily shown that V(x)=x^N-1. This holds since x^N=1 on this
|
||||
// domain (since x is the Nth root of unity).
|
||||
|
||||
// Note that we always represent polynomials with N elements for N a power of two. This is required for NTTs.
|
||||
// In addition we consider deg(P) to be this number of elements N even though the real degree may be lower. For
|
||||
// example 1+x-2x^2 is degree 2 but we store 4 elements and consider it degree 3.
|
||||
|
||||
// when dividing a polynomial P(x)/V(x) (The vanishing polynomial) the output is of degree deg(P)-deg(V). There
|
||||
// are three cases where V(x) divides P(x) (this is assumed since otherwise the output polynomial does not
|
||||
// exist!):
|
||||
// (1) deg(P)=2*deg(V): in that case deg(P/V)=deg(V)=N. This is an efficient case since on a domain of size N, the
|
||||
// vanishing polynomial evaluates to a constant value.
|
||||
// (2) deg(P)=deg(V)=N: in that case the output is a degree 0 polynomial (i.e. a scalar).
// (3) general case: deg(P)>2*deg(V): in that case deg(P) is at least 4*deg(V) since N is
// a power of two. This means that deg(P/V)=deg(P). For example deg(P)=16, deg(V)=4 --> deg(P/V)=12 ceiled to 16.
|
||||
|
||||
// When computing we want to divide P(x)'s evals by V(x)'s evals. Since V(x)=0 on this domain we have to compute
|
||||
// on a coset.
|
||||
// for case (3) we must evaluate V(x) on deg(P) domain size and compute elementwise division on a coset.
|
||||
// case (1) is more efficient because we need N evaluations of V(x) on a coset. Note that V(x)=constant on a coset
|
||||
// of rou. This is because V(wu)=(wu)^N-1=w^N*u^N-1=u^N-1 (as w^N=1 for w an Nth root of unity). Case (2) can be
|
||||
// computed like case (1).
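// [Editor's note] Worked instance of the constant-on-coset fact: take N=4 and
// coset generator u. For any 4th root of unity w,
//   V(u*w) = (u*w)^4 - 1 = u^4 * w^4 - 1 = u^4 - 1,
// independent of w. So dividing by V on the coset costs one field inversion of
// (u^N - 1) plus an element-wise scalar multiplication, which is exactly what
// the case (1)/(2) code paths below do.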
|
||||
|
||||
const bool is_case_2N = numerator->get_nof_elements() == 2 * vanishing_poly_degree;
|
||||
const bool is_case_N = numerator->get_nof_elements() == vanishing_poly_degree;
|
||||
if (is_case_2N) {
|
||||
divide_by_vanishing_case_2N(out, numerator, vanishing_poly_degree);
|
||||
} else if (is_case_N) {
|
||||
divide_by_vanishing_case_N(out, numerator, vanishing_poly_degree);
|
||||
} else {
|
||||
divide_by_vanishing_general_case(out, numerator, vanishing_poly_degree);
|
||||
}
|
||||
}
|
||||
|
||||
void divide_by_vanishing_general_case(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
|
||||
{
|
||||
// General case: P(x)/V(x) where v is of degree N and p of any degree>N
|
||||
|
||||
// (1) allocate vanishing polynomial in coefficients form
|
||||
// TODO Yuval: maybe instead of taking numerator memory and modifying it directly add a state for evaluations
|
||||
@@ -688,12 +708,89 @@ namespace polynomials {
|
||||
div_element_wise_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
|
||||
numerator_coeffs, out_coeffs, N, out_coeffs);
|
||||
|
||||
// (4) INTT back both a and out
|
||||
// (4) INTT back both numerator and out
|
||||
ntt_config.ordering = ntt::Ordering::kMN;
|
||||
CHK_STICKY(ntt::ntt(out_coeffs, N, ntt::NTTDir::kInverse, ntt_config, out_coeffs));
|
||||
CHK_STICKY(ntt::ntt(numerator_coeffs, N, ntt::NTTDir::kInverse, ntt_config, numerator_coeffs));
|
||||
}
|
||||
|
||||
void divide_by_vanishing_case_2N(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
|
||||
{
|
||||
// in that special case the numerator has 2N elements and the output will be N elements
|
||||
if (numerator->get_nof_elements() != 2 * vanishing_poly_degree) {
|
||||
THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "invalid input size. Expecting numerator to be of size 2N");
|
||||
}
|
||||
|
||||
// In the case where deg(P)=2N, I can transform numerator to Reversed-evals -> The second half is
|
||||
// a reversed-coset of size N with coset-gen the 2N-th root of unity.
|
||||
const int N = vanishing_poly_degree;
|
||||
numerator->transform_to_evaluations(2 * N, true /*=reversed*/);
|
||||
// allocate output in coeffs because it will be calculated on a coset but I don't have such a state so will have
|
||||
// to INTT back to coeffs
|
||||
auto numerator_evals_reversed_p = get_context_storage_immutable<I>(numerator);
|
||||
out->allocate(N, State::Coefficients, false /*=set zeros*/);
|
||||
auto out_evals_reversed_p = get_context_storage_mutable<I>(out);
|
||||
|
||||
auto ntt_config = ntt::default_ntt_config<C>(m_device_context);
|
||||
ntt_config.coset_gen = ntt::get_root_of_unity_from_domain<D>((uint64_t)log2(2 * N), ntt_config.ctx);
|
||||
// compute inv(u^N-1);
|
||||
D v_coset_eval = D::inverse(D::pow(ntt_config.coset_gen, N) - D::one());
|
||||
|
||||
const int NOF_THREADS = 128;
|
||||
const int NOF_BLOCKS = (N + NOF_THREADS - 1) / NOF_THREADS;
|
||||
mul_scalar_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
|
||||
numerator_evals_reversed_p + N /*second half is the reversed coset*/, v_coset_eval, N, out_evals_reversed_p);
|
||||
|
||||
// INTT back from reversed evals on coset to coeffs
|
||||
ntt_config.are_inputs_on_device = true;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
ntt_config.is_async = true;
|
||||
ntt_config.ordering = ntt::Ordering::kRN;
|
||||
ntt::ntt(out_evals_reversed_p, N, ntt::NTTDir::kInverse, ntt_config, out_evals_reversed_p);
|
||||
|
||||
CHK_LAST();
|
||||
}
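// [Editor's note, assumed layout] Why the second half of the size-2N
// bit-reversed evaluations is the coset: in natural order, index i holds the
// evaluation at w_{2N}^i; even i form the size-N subgroup and odd i form its
// coset u*<w_N> with u = w_{2N}. Bit reversal of a (log N + 1)-bit index moves
// the parity bit to the top, so all even (subgroup) indices land in the first
// half and all odd (coset) indices land in the second half.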
|
||||
|
||||
void divide_by_vanishing_case_N(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
|
||||
{
|
||||
// in that special case the numerator has N elements and the output will be N elements
|
||||
if (numerator->get_nof_elements() != vanishing_poly_degree) {
|
||||
THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "invalid input size. Expecting numerator to be of size N");
|
||||
}
|
||||
|
||||
const int N = vanishing_poly_degree;
|
||||
numerator->transform_to_coefficients(N);
|
||||
auto numerator_evals_reversed_p = get_context_storage_immutable<I>(numerator);
|
||||
out->allocate(N, State::Coefficients, false /*=set zeros*/);
|
||||
auto out_evals_reversed_p = get_context_storage_mutable<I>(out);
|
||||
|
||||
// (1) NTT numerator to coset evals (directly to out)
|
||||
auto ntt_config = ntt::default_ntt_config<C>(m_device_context);
|
||||
ntt_config.coset_gen = ntt::get_root_of_unity_from_domain<D>((uint64_t)log2(2 * N), ntt_config.ctx);
|
||||
ntt_config.are_inputs_on_device = true;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
ntt_config.is_async = true;
|
||||
ntt_config.ordering = ntt::Ordering::kNM;
|
||||
ntt::ntt(numerator_evals_reversed_p, N, ntt::NTTDir::kForward, ntt_config, out_evals_reversed_p);
|
||||
|
||||
// (2) divide by constant value (that V(x) evaluates to on the coset)
|
||||
D v_coset_eval = D::inverse(D::pow(ntt_config.coset_gen, N) - D::one());
|
||||
|
||||
const int NOF_THREADS = 128;
|
||||
const int NOF_BLOCKS = (N + NOF_THREADS - 1) / NOF_THREADS;
|
||||
mul_scalar_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
|
||||
out_evals_reversed_p, v_coset_eval, N, out_evals_reversed_p);
|
||||
|
||||
// (3) INTT back from coset to coeffs
|
||||
ntt_config.are_inputs_on_device = true;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
ntt_config.is_async = true;
|
||||
ntt_config.ordering = ntt::Ordering::kMN;
|
||||
ntt::ntt(out_evals_reversed_p, N, ntt::NTTDir::kInverse, ntt_config, out_evals_reversed_p);
|
||||
|
||||
CHK_LAST();
|
||||
}
|
||||
|
||||
// arithmetic with monomials
|
||||
void add_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) override
|
||||
{
|
||||
@@ -776,6 +873,72 @@ namespace polynomials {
|
||||
}
|
||||
}
|
||||
|
||||
void evaluate_on_rou_domain(PolyContext p, uint64_t domain_log_size, I* evals /*OUT*/) override
|
||||
{
|
||||
const uint64_t poly_size = p->get_nof_elements();
|
||||
const uint64_t domain_size = 1ULL << domain_log_size;
|
||||
const bool is_evals_on_host = is_host_ptr(evals, m_device_context.device_id);
|
||||
|
||||
I* d_evals = evals;
|
||||
// if evals on host, allocate CUDA memory
|
||||
if (is_evals_on_host) { CHK_STICKY(cudaMallocAsync(&d_evals, domain_size * sizeof(I), m_device_context.stream)); }
|
||||
|
||||
// If the domain size is smaller than the polynomial size -> transform to evals and copy the evals with stride.
|
||||
// Else, if in coeffs, copy the coeffs to the evals memory and NTT in place to compute the evals; else INTT to
// d_evals and back in place to the larger domain
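// [Editor's note] Small example of the stride path: for poly_size = 8 and
// domain_log_size = 1, stride = 8 / 2 = 4, so the slice kernel below keeps the
// evaluations at indices 0 and 4 -- exactly the evaluations of the polynomial
// on the size-2 subgroup {1, w^4} of the size-8 rou domain.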
|
||||
const bool is_domain_size_smaller_than_poly_size = domain_size <= poly_size;
|
||||
if (is_domain_size_smaller_than_poly_size) {
|
||||
// TODO Yuval: in reversed evals, can reverse the first 'domain_size' elements to d_evals instead of
|
||||
// transforming back to evals.
|
||||
p->transform_to_evaluations();
|
||||
const auto stride = poly_size / domain_size;
|
||||
const int NOF_THREADS = 128;
|
||||
const int NOF_BLOCKS = (domain_size + NOF_THREADS - 1) / NOF_THREADS;
|
||||
slice_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
|
||||
get_context_storage_immutable<I>(p), d_evals, 0 /*offset*/, stride, domain_size);
|
||||
} else {
|
||||
CHK_STICKY(cudaMemset(d_evals, 0, domain_size * sizeof(I)));
|
||||
auto ntt_config = ntt::default_ntt_config<D>(m_device_context);
|
||||
ntt_config.are_inputs_on_device = true;
|
||||
ntt_config.are_outputs_on_device = true;
|
||||
ntt_config.is_async = true;
|
||||
// TODO Yuval: in evals I can NTT directly to d_evals without changing my state
|
||||
switch (p->get_state()) {
|
||||
case State::Coefficients: {
|
||||
// copy to evals memory and inplace NTT of domain size
|
||||
CHK_STICKY(
|
||||
cudaMemcpy(d_evals, get_context_storage_immutable<I>(p), poly_size * sizeof(I), cudaMemcpyDeviceToDevice));
|
||||
ntt_config.ordering = ntt::Ordering::kNN;
|
||||
ntt::ntt(d_evals, domain_size, ntt::NTTDir::kForward, ntt_config, d_evals);
|
||||
} break;
|
||||
case State::EvaluationsOnRou_Natural:
|
||||
case State::EvaluationsOnRou_Reversed: {
|
||||
const bool is_from_natural = p->get_state() == State::EvaluationsOnRou_Natural;
|
||||
// INTT to coeffs and back to evals
|
||||
ntt_config.ordering = is_from_natural ? ntt::Ordering::kNM : ntt::Ordering::kRN;
|
||||
ntt::ntt(get_context_storage_immutable<I>(p), poly_size, ntt::NTTDir::kInverse, ntt_config, d_evals);
|
||||
ntt_config.ordering = is_from_natural ? ntt::Ordering::kMN : ntt::Ordering::kNN;
|
||||
ntt::ntt(d_evals, poly_size, ntt::NTTDir::kForward, ntt_config, d_evals);
|
||||
} break;
|
||||
default:
|
||||
THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "Invalid state to compute evaluations");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// release CUDA memory if allocated
|
||||
if (is_evals_on_host) {
|
||||
CHK_STICKY(
|
||||
cudaMemcpyAsync(evals, d_evals, domain_size * sizeof(I), cudaMemcpyDeviceToHost, m_device_context.stream));
|
||||
CHK_STICKY(cudaFreeAsync(d_evals, m_device_context.stream));
|
||||
}
|
||||
|
||||
// sync since user cannot reuse this stream so need to make sure evals are computed
|
||||
CHK_STICKY(cudaStreamSynchronize(m_device_context.stream)); // sync to make sure return value is copied to host
|
||||
|
||||
CHK_LAST();
|
||||
}
|
||||
|
||||
uint64_t copy_coeffs(PolyContext op, C* out_coeffs, uint64_t start_idx, uint64_t end_idx) override
|
||||
{
|
||||
const uint64_t nof_coeffs = op->get_nof_elements();
|
||||
@@ -809,18 +972,16 @@ namespace polynomials {
|
||||
return host_coeff;
|
||||
}
|
||||
|
||||
std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
|
||||
std::tuple<
|
||||
IntegrityPointer<C>,
|
||||
uint64_t /*size*/
|
||||
,
|
||||
uint64_t /*device_id*/>
|
||||
get_coefficients_view(PolyContext p) override
|
||||
{
|
||||
return p->get_coefficients_view();
|
||||
}
|
||||
|
||||
std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
|
||||
get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations, bool is_reversed) override
|
||||
{
|
||||
return p->get_rou_evaluations_view(nof_evaluations, is_reversed);
|
||||
}
|
||||
|
||||
inline void assert_device_compatability(PolyContext a, PolyContext b) const
|
||||
{
|
||||
CUDAPolynomialContext<C, D, I>* a_cuda = static_cast<CUDAPolynomialContext<C, D, I>*>(a.get());
|
||||
|
||||
@@ -165,6 +165,12 @@ namespace polynomials {
|
||||
return m_backend->evaluate_on_domain(m_context, domain, size, evals);
|
||||
}
|
||||
|
||||
template <typename C, typename D, typename I>
|
||||
void Polynomial<C, D, I>::evaluate_on_rou_domain(uint64_t domain_log_size, I* evals /*OUT*/) const
|
||||
{
|
||||
return m_backend->evaluate_on_rou_domain(m_context, domain_log_size, evals);
|
||||
}
|
||||
|
||||
template <typename C, typename D, typename I>
|
||||
int64_t Polynomial<C, D, I>::degree()
|
||||
{
|
||||
@@ -190,13 +196,6 @@ namespace polynomials {
|
||||
return m_backend->get_coefficients_view(m_context);
|
||||
}
|
||||
|
||||
template <typename C, typename D, typename I>
|
||||
std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
|
||||
Polynomial<C, D, I>::get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
|
||||
{
|
||||
return m_backend->get_rou_evaluations_view(m_context, nof_evaluations, is_reversed);
|
||||
}
|
||||
|
||||
// explicit instantiation for default type (scalar field)
|
||||
template class Polynomial<scalar_t>;
|
||||
template Polynomial<scalar_t> operator*(const scalar_t& c, const Polynomial<scalar_t>& rhs);
|
||||
|
||||
@@ -200,6 +200,16 @@ namespace polynomials {
|
||||
return p->evaluate_on_domain(domain, domain_size, evals);
|
||||
}
|
||||
|
||||
// Evaluates a polynomial on a ROU domain.
|
||||
// p: Pointer to the polynomial instance.
|
||||
// domain_log_size: log size of the domain to evaluate
|
||||
// evals: Output array for the evaluations.
|
||||
void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_rou_domain)(
|
||||
const PolynomialInst* p, uint64_t domain_log_size, scalar_t* evals /*OUT*/)
|
||||
{
|
||||
return p->evaluate_on_rou_domain(domain_log_size, evals);
|
||||
}
|
||||
|
||||
// Returns the degree of a polynomial.
|
||||
// p: Pointer to the polynomial instance.
|
||||
// Returns the degree of the polynomial.
|
||||
@@ -245,22 +255,6 @@ namespace polynomials {
|
||||
return new IntegrityPointer<scalar_t>(std::move(coeffs));
|
||||
}
|
||||
|
||||
// Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
|
||||
// p: Pointer to the polynomial instance.
|
||||
// nof_evals: Number of evaluations.
|
||||
// is_reversed: Whether the evaluations are in reversed order.
|
||||
// size: Output parameter for the size of the view.
|
||||
// device_id: Output parameter for the device ID.
|
||||
// Returns a pointer to an integrity pointer encapsulating the evaluations view.
|
||||
IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
|
||||
PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
|
||||
{
|
||||
auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
|
||||
*size = _size;
|
||||
*device_id = _device_id;
|
||||
return new IntegrityPointer<scalar_t>(std::move(rou_evals));
|
||||
}
|
||||
|
||||
// Reads the pointer from an integrity pointer.
|
||||
// p: Pointer to the integrity pointer.
|
||||
// Returns the raw pointer if still valid, otherwise NULL.
|
||||
|
||||
@@ -1,3 +1,2 @@
|
||||
test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
|
||||
nvcc -o test_poseidon -I../../include -DFIELD_ID=2 -DCURVE_ID=2 test.cu
|
||||
./test_poseidon
|
||||
test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
	nvcc -o test_poseidon -I../../include -DFIELD_ID=2 -DCURVE_ID=2 test.cu && ./test_poseidon
|
||||
|
||||
@@ -98,22 +98,4 @@ namespace poseidon {
|
||||
|
||||
return CHK_LAST();
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(
|
||||
int arity,
|
||||
int full_rounds_half,
|
||||
int partial_rounds,
|
||||
const scalar_t* constants,
|
||||
device_context::DeviceContext& ctx,
|
||||
PoseidonConstants<scalar_t>* poseidon_constants)
|
||||
{
|
||||
return create_optimized_poseidon_constants<scalar_t>(
|
||||
arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon_constants_cuda)(
|
||||
int arity, device_context::DeviceContext& ctx, PoseidonConstants<scalar_t>* constants)
|
||||
{
|
||||
return init_optimized_poseidon_constants<scalar_t>(arity, ctx, constants);
|
||||
}
|
||||
} // namespace poseidon
|
||||
59
icicle/src/poseidon/extern.cu
Normal file
@@ -0,0 +1,59 @@
|
||||
#include "fields/field_config.cuh"
|
||||
|
||||
using namespace field_config;
|
||||
|
||||
#include "poseidon.cu"
|
||||
#include "constants.cu"
|
||||
|
||||
#include "gpu-utils/device_context.cuh"
|
||||
#include "utils/utils.h"
|
||||
|
||||
namespace poseidon {
|
||||
/**
|
||||
* Extern "C" version of [poseidon_hash_cuda] function with the following
|
||||
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
|
||||
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
|
||||
* stand-alone "STARK field";
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
|
||||
scalar_t* input,
|
||||
scalar_t* output,
|
||||
int number_of_states,
|
||||
int arity,
|
||||
const PoseidonConstants<scalar_t>& constants,
|
||||
PoseidonConfig& config)
|
||||
{
|
||||
switch (arity) {
|
||||
case 2:
|
||||
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
|
||||
case 4:
|
||||
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
|
||||
case 8:
|
||||
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
|
||||
case 11:
|
||||
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
|
||||
default:
|
||||
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
|
||||
}
|
||||
return CHK_LAST();
|
||||
}
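// [Editor's note] The template argument is arity + 1 because the Poseidon
// state is one element wider than the rate: width T = arity (absorbed inputs)
// + 1 (capacity/domain-tag element), hence 2 -> 3, 4 -> 5, 8 -> 9, 11 -> 12
// in the switch above.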
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(
|
||||
int arity,
|
||||
int full_rounds_half,
|
||||
int partial_rounds,
|
||||
const scalar_t* constants,
|
||||
device_context::DeviceContext& ctx,
|
||||
PoseidonConstants<scalar_t>* poseidon_constants)
|
||||
{
|
||||
return create_optimized_poseidon_constants<scalar_t>(
|
||||
arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon_constants_cuda)(
|
||||
int arity, device_context::DeviceContext& ctx, PoseidonConstants<scalar_t>* constants)
|
||||
{
|
||||
return init_optimized_poseidon_constants<scalar_t>(arity, ctx, constants);
|
||||
}
|
||||
} // namespace poseidon
|
||||
@@ -3,7 +3,6 @@
|
||||
using namespace field_config;
|
||||
|
||||
#include "poseidon/poseidon.cuh"
|
||||
#include "constants.cu"
|
||||
#include "kernels.cu"
|
||||
|
||||
namespace poseidon {
|
||||
@@ -88,27 +87,4 @@ namespace poseidon {
|
||||
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
|
||||
return CHK_LAST();
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
|
||||
scalar_t* input,
|
||||
scalar_t* output,
|
||||
int number_of_states,
|
||||
int arity,
|
||||
const PoseidonConstants<scalar_t>& constants,
|
||||
PoseidonConfig& config)
|
||||
{
|
||||
switch (arity) {
|
||||
case 2:
|
||||
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
|
||||
case 4:
|
||||
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
|
||||
case 8:
|
||||
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
|
||||
case 11:
|
||||
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
|
||||
default:
|
||||
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
|
||||
}
|
||||
return CHK_LAST();
|
||||
}
|
||||
} // namespace poseidon
|
||||
@@ -30,6 +30,18 @@ namespace vec_ops {
|
||||
return add<scalar_t>(vec_a, vec_b, n, config, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
|
||||
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
|
||||
* or standalone "STARK field" given by `-DFIELD`).
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t
|
||||
CONCAT_EXPAND(FIELD, accumulate_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config)
|
||||
{
|
||||
return add<scalar_t>(vec_a, vec_b, n, config, vec_a);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extern version of [Sub](@ref Sub) function with the template parameter
|
||||
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
|
||||
@@ -59,4 +71,10 @@ namespace vec_ops {
|
||||
{
|
||||
return transpose_matrix<scalar_t>(input, output, row_size, column_size, ctx, on_device, is_async);
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
CONCAT_EXPAND(FIELD, bit_reverse_cuda)(const scalar_t* input, uint64_t n, BitReverseConfig& config, scalar_t* output)
|
||||
{
|
||||
return bit_reverse<scalar_t>(input, n, config, output);
|
||||
}
|
||||
} // namespace vec_ops
|
||||
@@ -29,6 +29,17 @@ namespace vec_ops {
|
||||
return add<extension_t>(vec_a, vec_b, n, config, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
|
||||
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
|
||||
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
|
||||
*/
|
||||
extern "C" cudaError_t
|
||||
CONCAT_EXPAND(FIELD, extension_accumulate_cuda)(extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config)
|
||||
{
|
||||
return add<extension_t>(vec_a, vec_b, n, config, vec_a);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extern version of [Sub](@ref Sub) function with the template parameter
|
||||
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
|
||||
@@ -56,4 +67,10 @@ namespace vec_ops {
|
||||
{
|
||||
return transpose_matrix<extension_t>(input, output, row_size, column_size, ctx, on_device, is_async);
|
||||
}
|
||||
|
||||
extern "C" cudaError_t CONCAT_EXPAND(FIELD, extension_bit_reverse_cuda)(
|
||||
const extension_t* input, uint64_t n, BitReverseConfig& config, extension_t* output)
|
||||
{
|
||||
return bit_reverse<extension_t>(input, n, config, output);
|
||||
}
|
||||
} // namespace vec_ops
|
||||
@@ -54,19 +54,47 @@ namespace vec_ops {
|
||||
if (tid >= row_size * column_size) return;
|
||||
out[(tid % row_size) * column_size + (tid / row_size)] = in[tid];
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
__global__ void bit_reverse_kernel(const E* input, uint64_t n, unsigned shift, E* output)
|
||||
{
|
||||
uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
|
||||
// Handling arbitrary vector size
|
||||
if (tid < n) {
|
||||
uint64_t reversed_index = __brevll(tid) >> shift;
|
||||
output[reversed_index] = input[tid];
|
||||
}
|
||||
}
|
||||
template <typename E>
|
||||
__global__ void bit_reverse_inplace_kernel(E* input, uint64_t n, unsigned shift)
|
||||
{
|
||||
uint64_t tid = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
|
||||
// Handling arbitrary vector size
|
||||
if (tid < n) {
|
||||
uint64_t reversed_index = __brevll(tid) >> shift;
|
||||
if (reversed_index > tid) {
|
||||
E temp = input[tid];
|
||||
input[tid] = input[reversed_index];
|
||||
input[reversed_index] = temp;
|
||||
}
|
||||
}
|
||||
}
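// [Editor's note] Index arithmetic check for the kernels above, with n = 8:
// shift = __builtin_clzll(8) + 1 = 61, and for tid = 3 (binary 011)
// __brevll(3) = 0xC000000000000000, so reversed_index = 0xC000... >> 61 = 6
// (binary 110), the 3-bit reversal of 3. The in-place kernel swaps only when
// reversed_index > tid, so each pair is exchanged exactly once.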
|
||||
} // namespace
|
||||
|
||||
template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
|
||||
cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
cudaError_t vec_op(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
{
|
||||
CHK_INIT_IF_RETURN();
|
||||
|
||||
bool is_in_place = vec_a == result;
|
||||
|
||||
// Set the grid and block dimensions
|
||||
int num_threads = MAX_THREADS_PER_BLOCK;
|
||||
int num_blocks = (n + num_threads - 1) / num_threads;
|
||||
|
||||
E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
|
||||
const E *d_vec_a, *d_vec_b;
|
||||
E* d_vec_a;
|
||||
const E* d_vec_b;
|
||||
if (!config.is_a_on_device) {
|
||||
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
|
||||
@@ -84,41 +112,49 @@ namespace vec_ops {
|
||||
}
|
||||
|
||||
if (!config.is_result_on_device) {
|
||||
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
|
||||
if (!is_in_place) {
|
||||
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
|
||||
} else {
|
||||
d_result = d_vec_a;
|
||||
}
|
||||
} else {
|
||||
d_result = result;
|
||||
if (!is_in_place) {
|
||||
d_result = result;
|
||||
} else {
|
||||
d_result = result = d_vec_a;
|
||||
}
|
||||
}
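// [Editor's sketch] The branches above reduce to one rule: reuse the staged
// input buffer whenever vec_a == result. In table form (an assumed reading of
// the code, not additional behavior):
//   result on device, not in place -> d_result = result
//   result on device, in place     -> d_result = d_vec_a (== result)
//   result on host,   not in place -> d_result = fresh device allocation
//   result on host,   in place     -> d_result = d_vec_a (the device copy of vec_a)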
|
||||
|
||||
// Call the kernel to perform element-wise operation
|
||||
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
|
||||
|
||||
if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
|
||||
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
|
||||
|
||||
if (!config.is_result_on_device) {
|
||||
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
|
||||
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
|
||||
}
|
||||
|
||||
if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
|
||||
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
|
||||
|
||||
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
|
||||
|
||||
return CHK_LAST();
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
cudaError_t mul(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
cudaError_t mul(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
{
|
||||
return vec_op<E, mul_kernel>(vec_a, vec_b, n, config, result);
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
cudaError_t add(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
cudaError_t add(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
{
|
||||
return vec_op<E, add_kernel>(vec_a, vec_b, n, config, result);
|
||||
}
|
||||
|
||||
template <typename E>
|
||||
cudaError_t sub(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
cudaError_t sub(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
|
||||
{
|
||||
return vec_op<E, sub_kernel>(vec_a, vec_b, n, config, result);
|
||||
}
|
||||
@@ -164,4 +200,39 @@ namespace vec_ops {
|
||||
|
||||
return CHK_LAST();
|
||||
}
|
||||
} // namespace vec_ops
|
||||
|
||||
template <typename E>
|
||||
cudaError_t bit_reverse(const E* input, uint64_t size, BitReverseConfig& cfg, E* output)
|
||||
{
|
||||
if (size & (size - 1)) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "bit_reverse: size must be a power of 2");
|
||||
if ((input == output) && (cfg.is_input_on_device != cfg.is_output_on_device))
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "bit_reverse: aliased input/output must have matching is_on_device parameters");
|
||||
|
||||
E* d_output;
|
||||
if (cfg.is_output_on_device) {
|
||||
d_output = output;
|
||||
} else {
|
||||
// allocate output on gpu
|
||||
CHK_IF_RETURN(cudaMallocAsync(&d_output, sizeof(E) * size, cfg.ctx.stream));
|
||||
}
|
||||
|
||||
uint64_t shift = __builtin_clzll(size) + 1;
|
||||
uint64_t num_blocks = (size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
|
||||
|
||||
if ((input != output) && cfg.is_input_on_device) {
|
||||
bit_reverse_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(input, size, shift, d_output);
|
||||
} else {
|
||||
if (!cfg.is_input_on_device) {
|
||||
CHK_IF_RETURN(cudaMemcpyAsync(d_output, input, sizeof(E) * size, cudaMemcpyHostToDevice, cfg.ctx.stream));
|
||||
}
|
||||
bit_reverse_inplace_kernel<<<num_blocks, MAX_THREADS_PER_BLOCK, 0, cfg.ctx.stream>>>(d_output, size, shift);
|
||||
}
|
||||
if (!cfg.is_output_on_device) {
|
||||
CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, sizeof(E) * size, cudaMemcpyDeviceToHost, cfg.ctx.stream));
|
||||
CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
|
||||
}
|
||||
if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
|
||||
return CHK_LAST();
|
||||
}
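// [Editor's sketch] Host-side usage of bit_reverse, with buffers and a config
// assumed to be set up by the caller (names hypothetical):
//
//   BitReverseConfig cfg = /* default-initialized config for the current device */;
//   // in[] and out[] are host arrays of size n (a power of two)
//   bit_reverse<scalar_t>(in, n, cfg, out);
//   // aliased call: in == out requires matching is_input/is_output device flags
//   bit_reverse<scalar_t>(buf, n, cfg, buf);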
|
||||
} // namespace vec_ops
|
||||
|
||||
@@ -29,7 +29,7 @@ class PolynomialTest : public ::testing::Test
|
||||
{
|
||||
public:
|
||||
static inline const int MAX_NTT_LOG_SIZE = 24;
|
||||
static inline const bool MEASURE = true;
|
||||
static inline const bool MEASURE = false;
|
||||
|
||||
// SetUpTestSuite/TearDownTestSuite are called once for the entire test suite
|
||||
static void SetUpTestSuite()
|
||||
@@ -54,15 +54,17 @@ public:
|
||||
// code that executes after each test
|
||||
}
|
||||
|
||||
static Polynomial_t randomize_polynomial(uint32_t size, bool random = true)
|
||||
static Polynomial_t randomize_polynomial(uint32_t size, bool random = true, bool from_evals = false)
|
||||
{
|
||||
auto coeff = std::make_unique<scalar_t[]>(size);
|
||||
auto elements = std::make_unique<scalar_t[]>(size);
|
||||
if (random) {
|
||||
random_samples(coeff.get(), size);
|
||||
random_samples(elements.get(), size);
|
||||
} else {
|
||||
incremental_values(coeff.get(), size);
|
||||
incremental_values(elements.get(), size);
|
||||
}
|
||||
return Polynomial_t::from_coefficients(coeff.get(), size);
|
||||
|
||||
return from_evals ? Polynomial_t::from_rou_evaluations(elements.get(), size)
|
||||
: Polynomial_t::from_coefficients(elements.get(), size);
|
||||
}
|
||||
|
||||
static void random_samples(scalar_t* res, uint32_t count)
|
||||
@@ -163,6 +165,56 @@ TEST_F(PolynomialTest, evaluationOnDomain)
|
||||
ASSERT_EQ((f - g).degree(), -1);
|
||||
}
|
||||
|
||||
TEST_F(PolynomialTest, evaluateOnRouDomain)
|
||||
{
|
||||
const int logsize = 8;
|
||||
const int size = 1 << logsize;
|
||||
auto f = randomize_polynomial(size);
|
||||
auto g = randomize_polynomial(size, true, true /*from_evals*/);
|
||||
|
||||
// build domain
|
||||
auto test = [&](auto& p, int domain_logsize) {
|
||||
const int domain_size = 1 << domain_logsize;
|
||||
device_context::DeviceContext ctx = device_context::get_default_device_context();
|
||||
scalar_t w = ntt::get_root_of_unity_from_domain<scalar_t>(domain_logsize, ctx);
|
||||
auto domain = std::make_unique<scalar_t[]>(domain_size);
|
||||
domain[0] = scalar_t::one();
|
||||
for (int i = 1; i < domain_size; ++i) {
|
||||
domain[i] = domain[i - 1] * w;
|
||||
}
|
||||
|
||||
// evaluation on domain
|
||||
auto evals_naive = std::make_unique<scalar_t[]>(domain_size);
|
||||
START_TIMER(naive_evals);
|
||||
p.evaluate_on_domain(domain.get(), domain_size, evals_naive.get());
|
||||
END_TIMER(naive_evals, "naive evals took", MEASURE);
|
||||
|
||||
// evaluate on rou domain
|
||||
auto evals_rou_domain = std::make_unique<scalar_t[]>(domain_size);
|
||||
START_TIMER(rou_domain_evals);
|
||||
p.evaluate_on_rou_domain(domain_logsize, evals_rou_domain.get());
|
||||
END_TIMER(rou_domain_evals, "evals on rou domain took", MEASURE);
|
||||
|
||||
ASSERT_EQ(0, memcmp(evals_naive.get(), evals_rou_domain.get(), domain_size * sizeof(scalar_t)));
|
||||
};
|
||||
|
||||
// test f (in coeffs state)
|
||||
test(f, logsize + 2); // evaluate on larger domain
|
||||
test(f, logsize - 3); // evaluate on smaller domain
|
||||
test(f, logsize); // evaluate on domain with size like poly
|
||||
// test g (in evals state)
|
||||
test(g, logsize + 2); // evaluate on larger domain
|
||||
test(g, logsize - 3); // evaluate on smaller domain
|
||||
test(g, logsize); // evaluate on domain with size like poly
|
||||
|
||||
// test f*f (in reversed evals state)
|
||||
auto f_squared = f * f;
|
||||
auto new_logsize = logsize + 1; // f_squared is twice the degree and size of f
|
||||
test(f_squared, new_logsize + 2); // evaluate on larger domain
|
||||
test(f_squared, new_logsize - 3); // evaluate on smaller domain
|
||||
test(f_squared, new_logsize); // evaluate on domain with size like poly
|
||||
}
|
||||
|
||||
TEST_F(PolynomialTest, fromEvaluations)
|
||||
{
|
||||
const int size = 100;
|
||||
@@ -419,40 +471,16 @@ TEST_F(PolynomialTest, View)
|
||||
const int size = 1 << 6;
|
||||
|
||||
auto f = randomize_polynomial(size);
|
||||
{
|
||||
auto [d_coeff, N, device_id] = f.get_coefficients_view();
|
||||
auto [d_coeff, N, device_id] = f.get_coefficients_view();
|
||||
|
||||
EXPECT_EQ(d_coeff.isValid(), true);
|
||||
auto g = f + f;
|
||||
// expecting the view to remain valid in that case
|
||||
EXPECT_EQ(d_coeff.isValid(), true);
|
||||
EXPECT_EQ(d_coeff.isValid(), true);
|
||||
auto g = f + f;
|
||||
// expecting the view to remain valid in that case
|
||||
EXPECT_EQ(d_coeff.isValid(), true);
|
||||
|
||||
f += f;
|
||||
// expecting view to be invalidated since f is modified
|
||||
EXPECT_EQ(d_coeff.isValid(), false);
|
||||
}
|
||||
|
||||
auto [d_evals, N, device_id] = f.get_rou_evaluations_view();
|
||||
auto g = Polynomial_t::from_rou_evaluations(d_evals.get(), N);
|
||||
assert_equal(f, g);
|
||||
}
|
||||
|
||||
TEST_F(PolynomialTest, interpolation)
|
||||
{
|
||||
const int size = 1 << 4;
|
||||
const int interpolation_size = 1 << 6;
|
||||
|
||||
const auto x = scalar_t::rand_host();
|
||||
|
||||
auto f = randomize_polynomial(size);
|
||||
auto [evals, N, device_id] = f.get_rou_evaluations_view(interpolation_size); // interpolate from 16 to 64 evaluations
|
||||
|
||||
auto g = Polynomial_t::from_rou_evaluations(evals.get(), N); // note the evals is a view to f
|
||||
const auto fx = f(x);
|
||||
ASSERT_EQ(evals.isValid(), false); // invalidated since f(x) transforms f to coefficients
|
||||
|
||||
const auto gx = g(x); // evaluating g which was constructed from interpolation of f
|
||||
ASSERT_EQ(fx, gx);
|
||||
f += f;
|
||||
// expecting view to be invalidated since f is modified
|
||||
EXPECT_EQ(d_coeff.isValid(), false);
|
||||
}
|
||||
|
||||
TEST_F(PolynomialTest, slicing)
|
||||
|
||||
@@ -3,12 +3,13 @@
|
||||
#include <iostream>
|
||||
|
||||
// include list of test files
|
||||
// Ensure the device_error_test.cu is last to prevent aborting mid-test run
|
||||
#include "field_test.cu"
|
||||
#ifdef CURVE_ID
|
||||
#include "curve_test.cu"
|
||||
#endif
|
||||
#include "error_handler_test.cu"
|
||||
|
||||
// Ensure the device_error_test.cu is last to prevent aborting mid-test run
|
||||
#include "device_error_test.cu"
|
||||
|
||||
int main(int argc, char** argv)
|
||||
|
||||
@@ -116,13 +116,13 @@ func MsmCheck(scalars HostOrDeviceSlice, points HostOrDeviceSlice, cfg *MSMConfi
|
||||
return scalars.AsUnsafePointer(), points.AsUnsafePointer(), results.AsUnsafePointer(), size, unsafe.Pointer(cfg)
|
||||
}
|
||||
|
||||
func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
|
||||
func PrecomputePointsCheck(points HostOrDeviceSlice, cfg *MSMConfig, outputBases DeviceSlice) (unsafe.Pointer, unsafe.Pointer) {
|
||||
outputBasesLength, pointsLength := outputBases.Len(), points.Len()
|
||||
if outputBasesLength != pointsLength*int(precomputeFactor) {
|
||||
if outputBasesLength != pointsLength*int(cfg.PrecomputeFactor) {
|
||||
errorString := fmt.Sprintf(
|
||||
"Precompute factor is probably incorrect: expected %d but got %d",
|
||||
outputBasesLength/pointsLength,
|
||||
precomputeFactor,
|
||||
cfg.PrecomputeFactor,
|
||||
)
|
||||
panic(errorString)
|
||||
}
|
||||
@@ -131,5 +131,8 @@ func PrecomputeBasesCheck(points HostOrDeviceSlice, precomputeFactor int32, outp
|
||||
points.(DeviceSlice).CheckDevice()
|
||||
}
|
||||
|
||||
cfg.pointsSize = int32(pointsLength)
|
||||
cfg.arePointsOnDevice = points.IsOnDevice()
|
||||
|
||||
return points.AsUnsafePointer(), outputBases.AsUnsafePointer()
|
||||
}
|
||||
|
||||
94
wrappers/golang/core/poseidon.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
|
||||
)
|
||||
|
||||
type PoseidonConfig struct {
|
||||
/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
|
||||
Ctx cr.DeviceContext
|
||||
areInputsOnDevice bool
|
||||
areOutputsOnDevice bool
|
||||
///If true, input is considered to be a states vector, holding the preimages in aligned or not aligned format.
|
||||
///Memory under the input pointer will be used for states. If false, fresh states memory will be allocated and input will be copied into it
|
||||
InputIsAState bool
|
||||
/// If true - input should be already aligned for poseidon permutation.
|
||||
/// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
/// Not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
|
||||
Aligned bool
|
||||
///If true, hash results will also be copied in the input pointer in aligned format
|
||||
LoopState bool
|
||||
///Whether to run the Poseidon asynchronously. If set to `true`, the poseidon_hash function will be
|
||||
///non-blocking and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
|
||||
///If set to false, the poseidon_hash function will block the current CPU thread.
|
||||
IsAsync bool
|
||||
}
|
||||
|
||||
type PoseidonConstants[T any] struct {
|
||||
Arity int32
|
||||
PartialRounds int32
|
||||
FullRoundsHalf int32
|
||||
RoundConstants unsafe.Pointer
|
||||
MdsMatrix unsafe.Pointer
|
||||
NonSparseMatrix unsafe.Pointer
|
||||
SparseMatrices unsafe.Pointer
|
||||
DomainTag T
|
||||
}
|
||||
|
||||
func GetDefaultPoseidonConfig() PoseidonConfig {
|
||||
ctx, _ := cr.GetDefaultDeviceContext()
|
||||
return PoseidonConfig{
|
||||
ctx, // Ctx
|
||||
false, // areInputsOnDevice
|
||||
false, // areOutputsOnDevice
|
||||
false, // inputIsAState
|
||||
false, // aligned
|
||||
false, // loopState
|
||||
false, // IsAsync
|
||||
}
|
||||
}
|
||||
|
||||
func PoseidonCheck[T any](input, output HostOrDeviceSlice, cfg *PoseidonConfig, constants *PoseidonConstants[T], numberOfStates int) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer) {
|
||||
inputLen, outputLen := input.Len(), output.Len()
|
||||
arity := int(constants.Arity)
|
||||
expectedInputLen := arity * numberOfStates
|
||||
if cfg.InputIsAState {
|
||||
expectedInputLen += numberOfStates
|
||||
}
|
||||
|
||||
if inputLen != expectedInputLen {
|
||||
errorString := fmt.Sprintf(
|
||||
"input is not the right length for the given parameters: %d, should be: %d",
|
||||
inputLen,
|
||||
expectedInputLen,
|
||||
)
|
||||
panic(errorString)
|
||||
}
|
||||
|
||||
if outputLen != numberOfStates {
|
||||
errorString := fmt.Sprintf(
|
||||
"output is not the right length for the given parameters: %d, should be: %d",
|
||||
outputLen,
|
||||
numberOfStates,
|
||||
)
|
||||
panic(errorString)
|
||||
}
|
||||
cfg.areInputsOnDevice = input.IsOnDevice()
|
||||
cfg.areOutputsOnDevice = output.IsOnDevice()
|
||||
|
||||
if input.IsOnDevice() {
|
||||
input.(DeviceSlice).CheckDevice()
|
||||
|
||||
}
|
||||
|
||||
if output.IsOnDevice() {
|
||||
output.(DeviceSlice).CheckDevice()
|
||||
}
|
||||
|
||||
cfgPointer := unsafe.Pointer(cfg)
|
||||
|
||||
return input.AsUnsafePointer(), output.AsUnsafePointer(), cfgPointer
|
||||
}
|
||||
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
|
||||
|
||||
cudaError_t bls12_377_g2_msm_cuda(const scalar_t* scalars,const g2_affine_t* points, int count, MSMConfig* config, g2_projective_t* out);
|
||||
cudaError_t bls12_377_g2_precompute_msm_bases_cuda(g2_affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, g2_affine_t* out);
|
||||
cudaError_t bls12_377_g2_precompute_msm_points_cuda(g2_affine_t* points, int msm_size, MSMConfig* config, g2_affine_t* out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -5,10 +5,9 @@ package g2
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
|
||||
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
func G2GetDefaultMSMConfig() core.MSMConfig {
|
||||
@@ -29,8 +28,13 @@ func G2Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *c
|
||||
return err
|
||||
}
|
||||
|
||||
// Deprecated: G2PrecomputeBases exists for backward compatibility.
|
||||
// It may cause issues if an MSM with a different `c` value is used with precomputed points and it will be removed in a future version.
|
||||
// G2PrecomputePoints should be used instead.
|
||||
func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
|
||||
pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
|
||||
cfg := G2GetDefaultMSMConfig()
|
||||
cfg.PrecomputeFactor = precomputeFactor
|
||||
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
|
||||
|
||||
cPoints := (*C.g2_affine_t)(pointsPointer)
|
||||
cPointsLen := (C.int)(points.Len())
|
||||
@@ -44,3 +48,16 @@ func G2PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c
|
||||
err := (cr.CudaError)(__ret)
|
||||
return err
|
||||
}
|
||||
|
||||
func G2PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
|
||||
pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
|
||||
|
||||
cPoints := (*C.g2_affine_t)(pointsPointer)
|
||||
cMsmSize := (C.int)(msmSize)
|
||||
cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
|
||||
cOutputBases := (*C.g2_affine_t)(outputBasesPointer)
|
||||
|
||||
__ret := C.bls12_377_g2_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
|
||||
err := (cr.CudaError)(__ret)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ typedef struct DeviceContext DeviceContext;
 
 cudaError_t bls12_377_msm_cuda(const scalar_t* scalars, const affine_t* points, int count, MSMConfig* config, projective_t* out);
 cudaError_t bls12_377_precompute_msm_bases_cuda(affine_t* points, int count, int precompute_factor, int _c, bool bases_on_device, DeviceContext* ctx, affine_t* out);
+cudaError_t bls12_377_precompute_msm_points_cuda(affine_t* points, int msm_size, MSMConfig* config, affine_t* out);
 
 #ifdef __cplusplus
 }
@@ -5,10 +5,9 @@ package msm
 import "C"
 
 import (
-	"unsafe"
-
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	"unsafe"
 )
 
 func GetDefaultMSMConfig() core.MSMConfig {
@@ -29,8 +28,13 @@ func Msm(scalars core.HostOrDeviceSlice, points core.HostOrDeviceSlice, cfg *cor
 	return err
 }
 
+// Deprecated: PrecomputeBases exists for backward compatibility.
+// It may cause issues if an MSM with a different `c` value is used with precomputed points, and it will be removed in a future version.
+// PrecomputePoints should be used instead.
 func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c int32, ctx *cr.DeviceContext, outputBases core.DeviceSlice) cr.CudaError {
-	pointsPointer, outputBasesPointer := core.PrecomputeBasesCheck(points, precomputeFactor, outputBases)
+	cfg := GetDefaultMSMConfig()
+	cfg.PrecomputeFactor = precomputeFactor
+	pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, &cfg, outputBases)
 
 	cPoints := (*C.affine_t)(pointsPointer)
 	cPointsLen := (C.int)(points.Len())
@@ -44,3 +48,16 @@ func PrecomputeBases(points core.HostOrDeviceSlice, precomputeFactor int32, c in
 	err := (cr.CudaError)(__ret)
 	return err
 }
+
+func PrecomputePoints(points core.HostOrDeviceSlice, msmSize int, cfg *core.MSMConfig, outputBases core.DeviceSlice) cr.CudaError {
+	pointsPointer, outputBasesPointer := core.PrecomputePointsCheck(points, cfg, outputBases)
+
+	cPoints := (*C.affine_t)(pointsPointer)
+	cMsmSize := (C.int)(msmSize)
+	cCfg := (*C.MSMConfig)(unsafe.Pointer(cfg))
+	cOutputBases := (*C.affine_t)(outputBasesPointer)
+
+	__ret := C.bls12_377_precompute_msm_points_cuda(cPoints, cMsmSize, cCfg, cOutputBases)
+	err := (cr.CudaError)(__ret)
+	return err
+}
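Note: the base-curve wrapper mirrors the G2 change exactly. One sizing detail worth calling out from the updated tests: the precompute output buffer must be allocated for precomputeFactor copies of every input point before PrecomputePoints is called. A fragment of that sizing logic (assumes points is a populated core.HostSlice of affine points and core/cr are imported as in the tests):

const precomputeFactor = 8
var precomputeOut core.DeviceSlice
// One affine point per input point per precompute step.
_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*precomputeFactor, points[0].Size())
if e != cr.CudaSuccess {
	panic("device allocation for precomputed points failed")
}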
@@ -5,13 +5,15 @@ package ntt
 import "C"
 
 import (
-	"unsafe"
-
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
 )
 
+import (
+	"unsafe"
+)
+
 func Ntt[T any](scalars core.HostOrDeviceSlice, dir core.NTTDir, cfg *core.NTTConfig[T], results core.HostOrDeviceSlice) core.IcicleError {
 	scalarsPointer, resultsPointer, size, cfgPointer := core.NttCheck[T](scalars, cfg, results)
25 wrappers/golang/curves/bls12377/poseidon/include/poseidon.h (new file)
@@ -0,0 +1,25 @@
+#include <cuda_runtime.h>
+#include <stdbool.h>
+
+#ifndef _BLS12_377_POSEIDON_H
+#define _BLS12_377_POSEIDON_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct scalar_t scalar_t;
+typedef struct PoseidonConfig PoseidonConfig;
+typedef struct DeviceContext DeviceContext;
+typedef struct PoseidonConstants PoseidonConstants;
+
+
+cudaError_t bls12_377_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
+cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
+cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
57 wrappers/golang/curves/bls12377/poseidon/poseidon.go (new file)
@@ -0,0 +1,57 @@
+package poseidon
+
+// #cgo CFLAGS: -I./include/
+// #include "poseidon.h"
+import "C"
+
+import (
+	"unsafe"
+
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+)
+
+func GetDefaultPoseidonConfig() core.PoseidonConfig {
+	return core.GetDefaultPoseidonConfig()
+}
+
+func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
+	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+
+	cScalars := (*C.scalar_t)(scalarsPointer)
+	cResults := (*C.scalar_t)(resultsPointer)
+	cNumberOfStates := (C.int)(numberOfStates)
+	cArity := (C.int)(constants.Arity)
+	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
+	cCfg := (*C.PoseidonConfig)(cfgPointer)
+
+	__ret := C.bls12_377_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+
+	err := (cr.CudaError)(__ret)
+	return core.FromCudaError(err)
+}
+
+func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
+
+	cArity := (C.int)(arity)
+	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
+	cPartialRounds := (C.int)(partialRounds)
+	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
+	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
+
+	__ret := C.bls12_377_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+	err := (cr.CudaError)(__ret)
+	return core.FromCudaError(err)
+}
+
+func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
+
+	cArity := (C.int)(arity)
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
+	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
+
+	__ret := C.bls12_377_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
+	err := (cr.CudaError)(__ret)
+	return core.FromCudaError(err)
+}
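Note: tying the three new functions together, a typical call sequence is: load the curve's optimized round constants for the chosen arity, take the default config, then hash a batch of states. A hedged sketch, not a definitive implementation; it assumes cr.GetDefaultDeviceContext and the bls12_377.ScalarField type from the existing wrappers, and elides error handling:

package example

import (
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/poseidon"
)

// hashStates is a hypothetical wrapper showing the intended call order.
// input must hold arity*numberOfStates scalars; output receives numberOfStates digests.
func hashStates(input, output core.HostOrDeviceSlice, numberOfStates, arity int) core.IcicleError {
	ctx, _ := cr.GetDefaultDeviceContext()

	var constants core.PoseidonConstants[bls12_377.ScalarField]
	// Load the library's precomputed optimized constants for this arity.
	err := poseidon.InitOptimizedPoseidonConstantsCuda[bls12_377.ScalarField](arity, ctx, &constants)
	_ = err // error handling elided in this sketch

	cfg := poseidon.GetDefaultPoseidonConfig()
	return poseidon.PoseidonHash(input, output, numberOfStates, &cfg, &constants)
}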
@@ -6,10 +6,9 @@ import "C"
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
|
||||
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
@@ -1,11 +1,10 @@
 package tests
 
 import (
-	"testing"
-
 	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/test_helpers"
 	"github.com/stretchr/testify/assert"
+	"testing"
 )
 
 const (
@@ -1,11 +1,10 @@
 package tests
 
 import (
-	"testing"
-
 	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/test_helpers"
 	"github.com/stretchr/testify/assert"
+	"testing"
 )
 
 func TestAffineZero(t *testing.T) {
@@ -1,11 +1,10 @@
 package tests
 
 import (
-	"testing"
-
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/g2"
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/test_helpers"
 	"github.com/stretchr/testify/assert"
+	"testing"
 )
 
 func TestG2AffineZero(t *testing.T) {
@@ -1,11 +1,10 @@
 package tests
 
 import (
-	"testing"
-
 	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/g2"
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/test_helpers"
 	"github.com/stretchr/testify/assert"
+	"testing"
 )
 
 const (
@@ -210,9 +210,11 @@ func TestMSMG2Batch(t *testing.T) {
 	}
 }
 
-func TestPrecomputeBaseG2(t *testing.T) {
+func TestPrecomputePointsG2(t *testing.T) {
 	cfg := g2.G2GetDefaultMSMConfig()
 	const precomputeFactor = 8
+	cfg.PrecomputeFactor = precomputeFactor
+
 	for _, power := range []int{10, 16} {
 		for _, batchSize := range []int{1, 3, 16} {
 			size := 1 << power

@@ -222,20 +224,18 @@ func TestPrecomputeBaseG2(t *testing.T) {
 
 			var precomputeOut core.DeviceSlice
 			_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
-			assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
+			assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
 
-			e = g2.G2PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
-			assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
+			e = g2.G2PrecomputePoints(points, size, &cfg, precomputeOut)
+			assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
 
 			var p g2.G2Projective
 			var out core.DeviceSlice
 			_, e = out.Malloc(batchSize*p.Size(), p.Size())
-			assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
-
-			cfg.PrecomputeFactor = precomputeFactor
+			assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
 
 			e = g2.G2Msm(scalars, precomputeOut, &cfg, out)
-			assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
+			assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
 			outHost := make(core.HostSlice[g2.G2Projective], batchSize)
 			outHost.CopyFromDevice(&out)
 			out.Free()
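Note: besides the rename, the assertions above were flipped to match testify's documented parameter order, assert.Equal(t, expected, actual, msgAndArgs...); the old calls passed the actual error first, which produces misleading expected/actual labels in failure output. For reference:

// testify: func Equal(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool
assert.Equal(t, cr.CudaSuccess, e, "Msm failed") // expected value first, observed error second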
@@ -170,9 +170,11 @@ func TestMSMBatch(t *testing.T) {
 	}
 }
 
-func TestPrecomputeBase(t *testing.T) {
+func TestPrecomputePoints(t *testing.T) {
 	cfg := msm.GetDefaultMSMConfig()
 	const precomputeFactor = 8
+	cfg.PrecomputeFactor = precomputeFactor
+
 	for _, power := range []int{10, 16} {
 		for _, batchSize := range []int{1, 3, 16} {
 			size := 1 << power

@@ -182,20 +184,18 @@ func TestPrecomputeBase(t *testing.T) {
 
 			var precomputeOut core.DeviceSlice
 			_, e := precomputeOut.Malloc(points[0].Size()*points.Len()*int(precomputeFactor), points[0].Size())
-			assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for PrecomputeBases results failed")
+			assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for PrecomputeBases results failed")
 
-			e = msm.PrecomputeBases(points, precomputeFactor, 0, &cfg.Ctx, precomputeOut)
-			assert.Equal(t, e, cr.CudaSuccess, "PrecomputeBases failed")
+			e = msm.PrecomputePoints(points, size, &cfg, precomputeOut)
+			assert.Equal(t, cr.CudaSuccess, e, "PrecomputeBases failed")
 
 			var p icicleBls12_377.Projective
 			var out core.DeviceSlice
 			_, e = out.Malloc(batchSize*p.Size(), p.Size())
-			assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
-
-			cfg.PrecomputeFactor = precomputeFactor
+			assert.Equal(t, cr.CudaSuccess, e, "Allocating bytes on device for Projective results failed")
 
 			e = msm.Msm(scalars, precomputeOut, &cfg, out)
-			assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
+			assert.Equal(t, cr.CudaSuccess, e, "Msm failed")
 			outHost := make(core.HostSlice[icicleBls12_377.Projective], batchSize)
 			outHost.CopyFromDevice(&out)
 			out.Free()
Some files were not shown because too many files have changed in this diff.