Added examples for Rust and C++

Jeremy Felder
2024-01-07 13:40:57 +02:00
parent 709009a96a
commit 79b86e0edf
35 changed files with 1654 additions and 4 deletions


@@ -41,7 +41,7 @@ ICICLE is a CUDA implementation of general functions widely used in ZKP. ICICLE
## Build and usage
> [!WARNING]
- > Stable versions of icicle and its bindings are released under versioned tags.
+ > Stable versions of ICICLE and its bindings are released under versioned tags.
> We will try our best to keep latest main stable as well but it should be considered unstable and may break at any time.
@@ -74,7 +74,7 @@ ICICLE has three build systems.
ICICLE core always needs to be built as part of the other build systems as it contains the core ICICLE primitives implemented in CUDA. Reference these guides for the different build systems, [ICICLE core guide][ICICLE-CORE-README], [ICICLE Rust guide][ICICLE-RUST-README] and [ICICLE Golang guide][ICICLE-GO-README].
- ## Docker
+ ### Docker
We offer a simple Docker container so you can run ICICLE without setting everything up locally.
@@ -83,9 +83,13 @@ docker build -t <name_of_your_choice> .
docker run --gpus all -it <name_of_your_choice> /bin/bash
```
## Examples
We've provided a few [examples](./examples/) using ICICLE with C++ and our Rust bindings.
## Contributions
- Join our [Discord Server][DISCORD] and find us on the icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
+ Join our [Discord Server][DISCORD] and find us on the #icicle channel. We will be happy to work together to support your use case and talk features, bugs and design.
### Development Contributions
@@ -95,7 +99,7 @@ If you are changing code, please make sure to change your [git hooks path][HOOKS
git config core.hooksPath ./scripts/hooks
```
In case `clang-format` is missing on your system, you can install it using the following command:
```sh
sudo apt install clang-format

examples/ZKContainer.md Normal file

@@ -0,0 +1,23 @@
# ZKContainer™
To run the ICICLE examples, we recommend using [ZKContainer™](https://ingonyama.com/blog/Immanuel-ZKDC), where we have already preinstalled all the required dependencies.
To use our containers you will need [Docker](https://www.docker.com/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
In each example directory, the ZKContainer™ files are located in the `.devcontainer` subdirectory.
- `Dockerfile` specifies how to build a ZKContainer™ image.
- `devcontainer.json` enables running the ZKContainer™ from Visual Studio Code.
## Running ZKContainer™ from a shell
First, build the container image:
```sh
docker build -t icicle-example-poseidon -f .devcontainer/Dockerfile .
```
To run the example interactively, start the container:
```sh
docker run -it --rm --gpus all -v .:/icicle-example icicle-example-poseidon
```
Inside the container, run the build commands for whichever [build system](../README.md#build-systems) you choose to use.


@@ -0,0 +1,40 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
libboost-all-dev \
python3-pip \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
#RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
#ENV PATH="/root/.cargo/bin:${PATH}"
# Install Golang
#ENV GOLANG_VERSION 1.21.1
#RUN curl -L https://golang.org/dl/go${GOLANG_VERSION}.linux-amd64.tar.gz | tar -xz -C /usr/local
#ENV PATH="/usr/local/go/bin:${PATH}"
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Install Python dependencies
RUN pip install poseidon-hash
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,21 @@
{
"name": "Icicle Examples",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python"
]
}
}
}


@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and no architecture is passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("/icicle/icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,85 @@
# Icicle example: build a Merkle tree using Poseidon hash
## Best-Practices
We recommend running our examples in a [ZKContainer™](../../ZKContainer.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template classes to accelerate Zero Knowledge (ZK) applications, for example the popular [Poseidon hash function](https://www.poseidon-hash.info/).
Use the `Poseidon` class to instantiate the hash function.
### Instantiate hash function
```c++
Poseidon<BLS12_381::scalar_t> poseidon(arity, stream);
```
**Parameters:**
- **data class:** Here the hash operates on `BLS12_381::scalar_t`, a scalar field of the curve `BLS12-381`.
You can think of the field's elements as 32-byte integers modulo `p`, where `p` is a prime number specific to this field.
- **arity:** The number of elements in a hashed block.
- **stream:** the CUDA stream to run on; streams let multiple hash calls overlap for higher throughput.
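Putting this together, a minimal instantiation could look as follows (a sketch: the stream creation and the binary `arity` are our choices here, not requirements; see `example.cu` below for the full flow):
```c++
// Sketch: instantiate a binary Poseidon hash on a fresh CUDA stream (assumed setup)
cudaStream_t stream;
cudaStreamCreate(&stream);
const uint32_t arity = 2; // each hash absorbs two child nodes
Poseidon<BLS12_381::scalar_t> poseidon(arity, stream);
```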
### Hash multiple blocks in parallel
```c++
poseidon.hash_blocks(inBlocks, nBlocks, outHashes, hashType, stream);
```
**Parameters:**
- **nBlocks:** number of blocks we hash in parallel.
- **inBlocks:** input array of size `arity*nBlocks`. The blocks are arranged sequentially in the array.
- **outHashes:** output array of size `nBlocks`.
- **hashType:** in this example we use `Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree`.
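For instance, with `arity = 2`, hashing `nBlocks = 4` blocks consumes 8 sequential input elements and produces 4 digests. A hedged sketch (the host-side buffers are our assumption; `example.cu` below shows the real usage):
```c++
const unsigned nBlocks = 4;
// block i occupies inBlocks[i*arity .. i*arity + arity - 1]
std::vector<BLS12_381::scalar_t> inBlocks(arity * nBlocks);
std::vector<BLS12_381::scalar_t> outHashes(nBlocks);
poseidon.hash_blocks(inBlocks.data(), nBlocks, outHashes.data(),
                     Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
```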
## What's in the example
1. Define the size of the example: the height of the full binary Merkle tree.
2. Hash blocks in parallel. The tree width determines the number of blocks to hash.
3. Build a Merkle tree from the hashes.
4. Use the tree to generate a membership proof for one of the computed hashes.
5. Validate the hash membership.
6. Tamper with the hash.
7. Invalidate the membership of the tampered hash.
## Details
### Merkle tree structure
Our Merkle tree is a **full binary tree** stored in a 1D array.
The tree nodes are stored following a level-first traversal of the binary tree.
For a given level, we use an offset to number the nodes from left to right. The node numbers in the figure below correspond to their locations in the array.
```
       Tree            Level
        0                0
       / \
      1   2              1
     / \ / \
    3  4 5  6            2

1D array representation: {0, 1, 2, 3, 4, 5, 6}
```
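In code, this level-first layout reduces to a one-line indexing helper (this one is taken from `example.cu` below): level `l` starts at index $2^l - 1$.
```c++
// location of a tree node in the array for a given level and offset
inline uint32_t tree_index(uint32_t level, uint32_t offset) {
  return (1 << level) - 1 + offset; // 2^level - 1 nodes precede this level
}
```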
### Membership proof structure
We use two arrays (named `proof_lr` and `proof_hash` in `example.cu` below):
- the position (left/right) of the node along the path toward the root
- the hash of the sibling node (the second node under the same parent)
For example, a proof for leaf 3 in the figure above stores the hashes of nodes 4 and 2: hashing 3 with 4 reproduces node 1, and hashing 1 with 2 reproduces the root 0.
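A sketch of how a verifier consumes the two arrays, mirroring `validate_proof` in `example.cu` below (`leaf_hash`, `tree_height`, `proof_lr`, `proof_hash`, and a binary `poseidon` instance are assumed to be in scope):
```c++
BLS12_381::scalar_t level_hash = leaf_hash;
for (uint32_t level = tree_height - 1; level > 0; level--) {
  BLS12_381::scalar_t in[2], out[1];
  in[proof_lr[level]] = level_hash;            // our running hash goes on its recorded side
  in[1 - proof_lr[level]] = proof_hash[level]; // the sibling hash goes on the other side
  poseidon.hash_blocks(in, 1, out, Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
  level_hash = out[0];
}
bool valid = (proof_hash[0] == level_hash); // proof_hash[0] holds the root
```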


@@ -0,0 +1,134 @@
#include <chrono>
#include <fstream>
#include <iostream>
// select the curve
#include "curves/bls12_381/curve_config.cuh"
// expose Poseidon classes
#include "curves/bls12_381/poseidon.cu"
// location of a tree node in the array for a given level and offset
inline uint32_t tree_index(uint32_t level, uint32_t offset) {
return (1 << level) - 1 + offset;
}
// Assuming the leaves are already set, compute all the other levels
void build_tree(const uint32_t tree_height, BLS12_381::scalar_t* tree, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
for (uint32_t level = tree_height-1; level>0 ; level-- ) {
const uint32_t next_level = level -1;
const uint32_t next_level_width = 1 << next_level;
poseidon.hash_blocks(&tree[tree_index(level,0)], next_level_width, &tree[tree_index(next_level,0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
}
}
// search leaves for a given hash, return offset
uint32_t query_membership(BLS12_381::scalar_t query, BLS12_381::scalar_t* tree, const uint32_t tree_height) {
const uint32_t tree_width = (1 << (tree_height-1));
for (uint32_t i=0; i<tree_width; i++) {
const BLS12_381::scalar_t leaf = tree[tree_index(tree_height - 1, i)];
if (leaf == query ) {
return i; // found the hash
}
}
return tree_height; // hash not found
}
void generate_proof(uint32_t position, BLS12_381::scalar_t* tree, const uint32_t tree_height, uint32_t* proof_lr, BLS12_381::scalar_t* proof_hash ) {
uint32_t level_index = position;
for(uint32_t level = tree_height - 1; level > 0; level--) {
uint32_t lr;
uint32_t neighbour_index;
lr = level_index % 2;
if (lr == 0) {
// left
neighbour_index = level_index + 1;
} else {
// right
neighbour_index = level_index - 1;
}
proof_lr[level] = lr;
proof_hash[level] = tree[tree_index(level,neighbour_index)];
level_index /= 2;
}
// store the root: a valid proof must reproduce it
proof_hash[0] = tree[tree_index(0,0)];
}
uint32_t validate_proof(const BLS12_381::scalar_t hash, const uint32_t tree_height, const uint32_t* proof_lr, const BLS12_381::scalar_t* proof_hash, Poseidon<BLS12_381::scalar_t> &poseidon, cudaStream_t stream) {
BLS12_381::scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
for(uint32_t level = tree_height - 1; level > 0; level --) {
if(proof_lr[level]==0) {
hashes_in[0] = level_hash;
hashes_in[1] = proof_hash[level];
} else {
hashes_in[0] = proof_hash[level];
hashes_in[1] = level_hash;
}
// next level hash
poseidon.hash_blocks(hashes_in, 1, hash_out, Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
}
int main(int argc, char* argv[])
{
std::cout << "1. Defining the size of the example: height of the full binary Merkle tree" << std::endl;
const uint32_t tree_height = 21;
std::cout << "Tree height: " << tree_height << std::endl;
const uint32_t tree_arity = 2;
const uint32_t leaf_level = tree_height-1;
const uint32_t tree_width = 1 << leaf_level;
std::cout << "Tree width: " << tree_width << std::endl;
const uint32_t tree_size = (1 << tree_height) - 1;
std::cout << "Tree size: " << tree_size << std::endl;
BLS12_381::scalar_t* tree = static_cast<BLS12_381::scalar_t*>(malloc(tree_size * sizeof(BLS12_381::scalar_t)));
std::cout << "2. Hashing blocks in parallel" << std::endl;
const uint32_t data_arity = 4;
std::cout << "Block size (arity): " << data_arity << std::endl;
std::cout << "Initializing blocks..." << std::endl;
BLS12_381::scalar_t d = BLS12_381::scalar_t::zero();
BLS12_381::scalar_t* data = static_cast<BLS12_381::scalar_t*>(malloc(tree_width * data_arity * sizeof(BLS12_381::scalar_t)));
for (uint32_t i = 0; i < tree_width * data_arity; i++) {
data[i] = d;
d = d + BLS12_381::scalar_t::one();
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
Poseidon<BLS12_381::scalar_t> data_poseidon(data_arity, stream);
data_poseidon.hash_blocks(data, tree_width, &tree[tree_index(leaf_level, 0)], Poseidon<BLS12_381::scalar_t>::HashType::MerkleTree, stream);
std::cout << "3. Building Merkle tree" << std::endl;
Poseidon<BLS12_381::scalar_t> tree_poseidon(tree_arity, stream);
build_tree(tree_height, tree, tree_poseidon, stream);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width-1;
std::cout << "Using the hash for block: " << position << std::endl;
BLS12_381::scalar_t query = tree[tree_index(leaf_level, position)];
uint32_t query_position = query_membership(query, tree, tree_height);
// allocate arrays for the proof
uint32_t* proof_lr = static_cast<uint32_t*>(malloc(tree_height * sizeof(uint32_t)));
BLS12_381::scalar_t* proof_hash = static_cast<BLS12_381::scalar_t*>(malloc(tree_height * sizeof(BLS12_381::scalar_t)));
generate_proof(query_position, tree, tree_height, proof_lr, proof_hash );
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const BLS12_381::scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const BLS12_381::scalar_t tampered_hash = hash + BLS12_381::scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, stream);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;
}


@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,21 @@
{
"name": "Icicle Examples: msm",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python"
]
}
}
}


@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and no architecture is passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("/icicle/icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,49 @@
# Icicle example: Multi-Scalar Multiplication (MSM)
## Best-Practices
We recommend running our examples in a [ZKContainer™](../../ZKContainer.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides CUDA C++ template function `large_msm` to accelerate [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Concise Usage Explanation
First include the MSM template, then select the curve, and finally supply the curve types to the template.
```c++
#include "appUtils/msm/msm.cu" // template
#include "curves/bn254/curve_config.cuh" // curve
using namespace BN254;
...
large_msm<scalar_t, projective_t, affine_t>(scalars,points,size,result,on_device,big_triangle,bucket_factor,stream)
```
In this example we use `BN254` curve. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` use affine coordinates, and `result` uses projective coordinates.
**Parameters:**
- `on_device`: `true` to execute on the GPU with device-resident inputs and result, otherwise on the host
- `big_triangle`: Deprecated. Always set to `false`.
- `bucket_factor`: distinguishes between large and normal bucket sizes. If the scalar distribution is heavily skewed toward a few values, those buckets can be processed separately from the rest. The ideal value varies by circuit (it depends on the distribution of scalars); start with 10 and adjust to see if performance improves.
- `stream`: CUDA streams enable parallel execution of multiple functions
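Putting the parameters together, a minimal device-side invocation could look like this (a sketch: `scalars_d`, `points_d`, and `result_d` are assumed to be device buffers already allocated and populated, as in `example.cu` below):
```c++
cudaStream_t stream;
cudaStreamCreate(&stream);
large_msm<scalar_t, projective_t, affine_t>(
  scalars_d, points_d, msm_size, result_d,
  true,   // on_device: inputs and result live in GPU memory
  false,  // big_triangle: deprecated, always false
  10,     // bucket_factor: a starting point, tune per circuit
  stream);
cudaStreamSynchronize(stream); // the call runs asynchronously on the stream
```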
## What's in the example
1. Define the parameters of MSM.
2. Generate random inputs on-host
3. Copy inputs on-device
4. Execute MSM on-device (GPU)
5. Copy results on-host


@@ -0,0 +1,59 @@
#include <chrono>
#include <fstream>
#include <iostream>
// include MSM template
#include "appUtils/msm/msm.cu"
// select the curve
#include "curves/bn254/curve_config.cuh"
using namespace BN254;
int main(int argc, char* argv[])
{
std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
std::cout << "Example parameters" << std::endl;
unsigned msm_size = 1048576;
std::cout << "msm_size: " << msm_size << std::endl;
unsigned bucket_factor = 10;
std::cout << "bucket_factor: " << bucket_factor << std::endl;
std::cout << "Generating random inputs on-host" << std::endl;
scalar_t* scalars = new scalar_t[msm_size];
affine_t* points = new affine_t[msm_size];
projective_t result;
for (unsigned i = 0; i < msm_size; i++) {
points[i] = (i % msm_size < 10) ? projective_t::to_affine(projective_t::rand_host()) : points[i - 10];
scalars[i] = scalar_t::rand_host();
}
std::cout << "Preparing inputs on-device" << std::endl;
scalar_t* scalars_d;
affine_t* points_d;
projective_t* result_d;
cudaMalloc(&scalars_d, sizeof(scalar_t) * msm_size);
cudaMalloc(&points_d, sizeof(affine_t) * msm_size);
cudaMalloc(&result_d, sizeof(projective_t));
cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * msm_size, cudaMemcpyHostToDevice);
cudaMemcpy(points_d, points, sizeof(affine_t) * msm_size, cudaMemcpyHostToDevice);
std::cout << "Running MSM on-device" << std::endl;
cudaStream_t stream1;
cudaStreamCreate(&stream1);
auto begin = std::chrono::high_resolution_clock::now();
large_msm<scalar_t, projective_t, affine_t>(scalars_d, points_d, msm_size, result_d, true, false, bucket_factor, stream1);
cudaStreamSynchronize(stream1); // the MSM runs asynchronously; wait for it before stopping the timer
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("On-device runtime: %.3f seconds.\n", elapsed.count() * 1e-9);
cudaStreamDestroy(stream1);
cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
std::cout << projective_t::to_affine(result) << std::endl;
cudaFree(scalars_d);
cudaFree(points_d);
cudaFree(result_d);
return 0;
}


@@ -0,0 +1,23 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use NVIDIA base image
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
nsight-systems-12.2 \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,24 @@
{
"name": "Icicle Examples - Multiply",
"build": {
"dockerfile": "Dockerfile"
},
"workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-example,type=bind",
"workspaceFolder": "/icicle-example",
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"ms-vscode.cpptools-extension-pack"
]
}
}
}


@@ -0,0 +1,24 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and no architecture is passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,34 @@
# Icicle example: Multiplication
## Best-Practices
We recommend running our examples in a [ZKContainer™](../../ZKContainer.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` accelerates the multiplication operation `*` using the [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm).
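As a refresher (the textbook recurrence, not a statement about `Icicle`'s exact internals): splitting each operand into high and low halves, $x = x_1 B + x_0$ and $y = y_1 B + y_0$, Karatsuba trades four half-size multiplications for three:

$$xy = x_1 y_1 B^2 + \big((x_1 + x_0)(y_1 + y_0) - x_1 y_1 - x_0 y_0\big) B + x_0 y_0$$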
## Concise Usage Explanation
Define a `CURVE_ID` and include the curve configuration header:
```c++
#define CURVE_ID 1
#include "icicle/curves/curve_config.cuh"
```
The values of `CURVE_ID` for the different curves are listed in the above header. Multiplication is accelerated both for scalar fields (`scalar_t`) and point fields (`point_field_t`).
```c++
using namespace curve_config;
scalar_t a;
point_field_t b;
```
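Because `*` is overloaded for these types, an element-wise product kernel reduces to one line per element; this is the core of `example.cu` below:
```c++
template <typename T>
__global__ void vectorMult(T* vec_a, T* vec_b, T* vec_r, size_t n_elements)
{
  size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < n_elements)
    vec_r[tid] = vec_a[tid] * vec_b[tid]; // modular multiplication via operator*
}
```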
## What's in the example
1. Define the parameters for the example such as vector size
2. Generate random vectors on-host
3. Copy them on-device
4. Execute element-wise vector multiplication on-device
5. Copy results on-host


@@ -0,0 +1,9 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


@@ -0,0 +1,196 @@
#include <iostream>
#include <iomanip>
#include <chrono>
#include <cuda_runtime.h>
#include <nvml.h>
#define CURVE_ID 1
#include "/icicle/icicle/curves/curve_config.cuh"
using namespace curve_config;
typedef scalar_t T;
// typedef point_field_t T;
const std::string curve = "BN254";
#define MAX_THREADS_PER_BLOCK 256
template <typename T>
__global__ void vectorMult(T *vec_a, T *vec_b, T *vec_r, size_t n_elements)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_elements)
{
vec_r[tid] = vec_a[tid] * vec_b[tid];
}
}
template <typename T>
int vector_mult(T *vec_a, T *vec_b, T *vec_result, size_t n_elements)
{
// Set the grid and block dimensions
int num_blocks = (int)ceil((float)n_elements / MAX_THREADS_PER_BLOCK);
int threads_per_block = MAX_THREADS_PER_BLOCK;
// Call the kernel to perform element-wise modular multiplication
vectorMult<T><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elements);
return 0;
}
int main(int argc, char** argv)
{
const unsigned vector_size = 1 << 20;
const unsigned repetitions = 1 << 20;
cudaError_t err;
nvmlInit();
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
std::cout << "Icicle-Examples: vector multiplications" << std::endl;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
std::cout << "GPU Model: " << name << std::endl;
} else {
std::cerr << "Failed to get GPU model name." << std::endl;
}
unsigned power_limit;
nvmlDeviceGetPowerManagementLimit(device, &power_limit);
std::cout << "Vector size: " << vector_size << std::endl;
std::cout << "Repetitions: " << repetitions << std::endl;
std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
unsigned int baseline_power;
nvmlDeviceGetPowerUsage(device, &baseline_power);
std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
unsigned baseline_temperature;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// host data
T* host_in1 = (T*)malloc(vector_size * sizeof(T));
T* host_in2 = (T*)malloc(vector_size * sizeof(T));
std::cout << "Initializing vectors with random data" << std::endl;
for (int i = 0; i < vector_size; i++) {
if ( (i>0) && i % (1<<20) == 0)
std::cout << "Elements: " << i << std::endl;
host_in1[i] = T::rand_host();
host_in2[i] = T::rand_host();
}
// device data
T* device_in1;
T* device_in2;
T* device_out;
err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
if (err != cudaSuccess) {
std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
return 0;
}
// copy from host to device
err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
return 0;
}
std::cout << "Starting warm-up" << std::endl;
// Warm-up loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size);
// err = lde::Mul(device_in1, device_in2, vector_size, is_on_device, is_montgomery, ctx, device_out);
// if (err != cudaSuccess) {
// std::cerr << "Failed to call lde::Mul" << cudaGetErrorString(err) << std::endl;
// return 0;
// }
}
std::cout << "Starting benchmarking" << std::endl;
unsigned power_before;
nvmlDeviceGetPowerUsage(device, &power_before);
std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_before / power_limit << " %" << std::endl;
unsigned temperature_before;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
auto start_time = std::chrono::high_resolution_clock::now();
// Benchmark loop
for (int i = 0; i < repetitions; i++) {
vector_mult(device_in1, device_in2, device_out, vector_size);
// err = lde::Mul(device_in1, device_in2, vector_size, is_on_device, is_montgomery, ctx, device_out);
// if (err != cudaSuccess) {
// std::cerr << "Failed to call lde::Mul" << cudaGetErrorString(err) << std::endl;
// return 0;
// }
}
cudaDeviceSynchronize(); // kernel launches are asynchronous; wait before stopping the timer
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
std::cout << "Elapsed time: " << duration.count() << " microseconds" << std::endl;
unsigned power_after;
nvmlDeviceGetPowerUsage(device, &power_after);
std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float) 100.0 * power_after / power_limit << " %" << std::endl;
unsigned temperature_after;
if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
} else {
std::cerr << "Failed to get GPU temperature." << std::endl;
}
// Report performance in GMPS: Giga Multiplications Per Second
double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()) ;
std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
// Optional: validate multiplication
T * host_out = (T*)malloc(vector_size * sizeof(T));
cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
// validate multiplication here...
free(host_in1);
free(host_in2);
free(host_out);
cudaFree(device_in1);
cudaFree(device_in2);
cudaFree(device_out);
nvmlShutdown();
return 0;
}

examples/c++/multiply/run.sh Executable file

@@ -0,0 +1,2 @@
#!/bin/bash
/icicle-example/build/example


@@ -0,0 +1,25 @@
# Make sure NVIDIA Container Toolkit is installed on your host
# Use the specified base image
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
curl \
build-essential \
git \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone Icicle from a GitHub repository
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle
# Set the working directory in the container
WORKDIR /icicle-example
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,22 @@
{
"name": "Icicle Examples: ntt",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-vscode.cpptools"
]
}
}
}


@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # supported on CMake 3.24+; earlier versions ignore it and no architecture is passed
endif ()
project(icicle LANGUAGES CUDA CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("/icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda-12.0/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


@@ -0,0 +1,36 @@
# Icicle example: Number Theoretic Transform (NTT)
## Best-Practices
We recommend running our examples in a [ZKContainer™](../../ZKContainer.md) to save your time and mental energy.
## Key-Takeaway
`Icicle` provides several CUDA C++ template functions for the [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md), the finite-field analogue of the Discrete Fourier Transform. The templates differ in ease of use vs. speed. In this example we look at the easiest one.
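Concretely (the standard definition; the notation is not taken from the `Icicle` API): for an input $(x_0, \dots, x_{n-1})$ over a field with a primitive $n$-th root of unity $\omega$, the forward transform computes

$$X_k = \sum_{j=0}^{n-1} x_j \, \omega^{jk}, \qquad k = 0, \dots, n-1.$$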
## Concise Usage Explanation
First include the NTT template, then select the curve, and finally supply the curve types to the template.
```c++
#include "icicle/appUtils/ntt/ntt.cuh" // template
#include "icicle/curves/bls12_381/curve_config.cuh" // curve
using namespace BLS12_381;
...
ntt_end2end_batch_template<scalar_t, scalar_t>(scalars, batch_size, ntt_size, inverse, stream);
```
In this example we use the `BLS12_381` curve. The function computes `batch_size / ntt_size` NTTs of size `ntt_size`, in place over the host array `scalars` (behaviour as exercised by `example.cu` below).
**Parameters:** (meanings inferred from the usage in `example.cu` below)
- `scalars`: host array of `batch_size` field elements, holding the inputs and overwritten with the outputs
- `batch_size`: total number of elements across all NTTs in the batch
- `ntt_size`: size of each individual NTT
- `inverse`: `false` for a forward transform, `true` for the inverse
- `stream`: the CUDA stream to execute on
## What's in the example
1. Define the parameters of the NTT.
2. Generate input data containing harmonics 0 and 1.
3. Run the easy-to-use end-to-end batch NTT and validate the output.
4. Run the faster in-place batch NTT with precomputed twiddle factors and validate the output.
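For the faster, lower-level path, `example.cu` below precomputes the twiddle factors on-device and transforms the batch in place. The gist (the trailing flags are copied verbatim from `example.cu`; their exact meanings are not documented here):
```c++
scalar_t* d_twiddles = fill_twiddle_factors_array(ntt_size, scalar_t::omega(log_ntt_size), stream);
scalar_t* d_elements;
cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
scalar_t* _null = nullptr;
ntt_inplace_batch_template(d_elements, d_twiddles, ntt_size, nof_ntts, inverse, false, _null, stream, false);
cudaMemcpyAsync(elements, d_elements, sizeof(scalar_t) * batch_size, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
```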


@@ -0,0 +1,11 @@
#!/bin/bash
# Exit immediately on error
set -e
rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build

examples/c++/ntt/example.cu Normal file

@@ -0,0 +1,119 @@
#include <chrono>
#include <iostream>
// include NTT template
#include "icicle/appUtils/ntt/ntt.cuh"
// select the curve
#include "icicle/curves/bls12_381/curve_config.cuh"
using namespace BLS12_381;
// Operate on scalars
typedef scalar_t S;
typedef scalar_t E;
// compute n * s by repeated addition
scalar_t smult(const unsigned n, scalar_t s) {
scalar_t r = scalar_t::zero();
for (unsigned i=0; i < n; i++) {
r = r + s;
}
return r;
}
void initialize_input(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
// Harmonics 0
for (unsigned i = 0; i < ntt_size; i=i+1) {
elements[i] = scalar_t::one();
}
// Harmonics 1
for (unsigned i = 1*ntt_size; i < 2*ntt_size; i=i+2) {
elements[i] = scalar_t::one();
elements[i+1] = scalar_t::neg(scalar_t::one());
}
}
int validate_output(const unsigned ntt_size, const unsigned nof_ntts, E * elements ) {
int nof_errors = 0;
E amplitude = smult(ntt_size,scalar_t::one());
// std::cout << "Amplitude: " << amplitude << std::endl;
// Harmonics 0
if (elements[0] != amplitude) {
++nof_errors;
std::cout << "Error in harmonics 0: " << elements[0] << std::endl;
} else {
std::cout << "Validated harmonics 0" << std::endl;
}
// Harmonics 1
if (elements[ntt_size+1] != amplitude) {
++nof_errors;
std::cout << "Error in harmonics 1: " << elements[ntt_size+1] << std::endl;
} else {
std::cout << "Validated harmonics 1" << std::endl;
}
// for (unsigned i = 0; i < nof_ntts * ntt_size; i++) {
// std::cout << elements[i] << std::endl;
// }
return nof_errors;
}
int main(int argc, char* argv[])
{
std::cout << "Icicle Examples: Number Theoretical Transform (NTT)" << std::endl;
std::cout << "Example parameters" << std::endl;
const unsigned log_ntt_size = 26;
std::cout << "Log2(NTT size): " << log_ntt_size << std::endl;
const unsigned ntt_size = 1 << log_ntt_size;
std::cout << "NTT size: " << ntt_size << std::endl;
const unsigned nof_ntts = 2;
std::cout << "Number of NTTs: " << nof_ntts << std::endl;
const unsigned batch_size = nof_ntts * ntt_size;
std::cout << "Generating input data for harmonics 0,1" << std::endl;
E* elements;
elements = (scalar_t*) malloc(sizeof(E) * batch_size);
initialize_input(ntt_size, nof_ntts, elements );
std::cout << "Running easy-to-use NTT" << std::endl;
cudaStream_t stream;
cudaStreamCreate(&stream);
bool inverse = false;
auto begin0 = std::chrono::high_resolution_clock::now();
ntt_end2end_batch_template<scalar_t, scalar_t>(elements, batch_size, ntt_size, inverse, stream);
auto end0 = std::chrono::high_resolution_clock::now();
auto elapsed0 = std::chrono::duration_cast<std::chrono::nanoseconds>(end0 - begin0);
printf("On-device runtime: %.3f seconds\n", elapsed0.count() * 1e-9);
validate_output(ntt_size, nof_ntts, elements );
cudaStreamSynchronize(stream);
std::cout << "Running not that easy-to-use but fast NTT" << std::endl;
uint32_t n_twiddles = ntt_size; // one twiddle factor per element of a size-ntt_size transform
// represent the transform matrix using twiddle factors
scalar_t * d_twiddles;
d_twiddles = fill_twiddle_factors_array(n_twiddles, scalar_t::omega(log_ntt_size), stream);
scalar_t* d_elements; // Element
cudaMallocAsync(&d_elements, sizeof(scalar_t) * batch_size, stream);
initialize_input(ntt_size, nof_ntts, elements );
cudaMemcpyAsync(d_elements, elements, sizeof(scalar_t) * batch_size, cudaMemcpyHostToDevice, stream);
S* _null = nullptr;
auto begin1 = std::chrono::high_resolution_clock::now();
cudaStreamSynchronize(stream);
ntt_inplace_batch_template(d_elements, d_twiddles, ntt_size, nof_ntts, inverse, false, _null, stream, false);
cudaStreamSynchronize(stream);
auto end1 = std::chrono::high_resolution_clock::now();
auto elapsed1 = std::chrono::duration_cast<std::chrono::nanoseconds>(end1 - begin1);
printf("Runtime: %.3e seconds\n", elapsed1.count() * 1e-9);
cudaMemcpyAsync(elements, d_elements, sizeof(E) * batch_size, cudaMemcpyDeviceToHost, stream);
validate_output(ntt_size, nof_ntts, elements );
cudaFreeAsync(d_elements, stream);
cudaFreeAsync(d_twiddles, stream);
cudaStreamDestroy(stream);
free(elements);
return 0;
}


@@ -0,0 +1,27 @@
# Use the specified base image
#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
llvm \
clang \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory in the container
WORKDIR /icicle-example
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,23 @@
{
"name": "Icicle Examples: rust msm",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb"
]
}
}
}


@@ -0,0 +1,19 @@
[package]
name = "msm"
version = "1.0.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
ark-bn254 = { version = "0.4.0", optional = true}
ark-bls12-377 = { version = "0.4.0", optional = true}
ark-ec = { version = "0.4.0", optional = true}
clap = { version = "4.4.12", features = ["derive"] }
[features]
arkworks = ["ark-bn254", "ark-bls12-377", "ark-ec", "icicle-core/arkworks", "icicle-bn254/arkworks", "icicle-bls12-377/arkworks"]
profile = []


@@ -0,0 +1,56 @@
# ICICLE example: Multi-Scalar Multiplication (MSM) in Rust
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Multi-Scalar Multiplication](https://github.com/ingonyama-zk/ingopedia/blob/master/src/msm.md).
## Best Practices
To save the time of setting up prerequisites manually, we recommend running this example in our [ZKContainer™](../../ZKContainer.md).
## Usage
```rust
msm(
/* Scalars input vector */ &scalars,
/* Points input vector */ &points,
/* MSMConfig reference */ &cfg,
/* Projective point result */ &mut msm_results.as_slice()
).unwrap();
```
In this example we use `BN254` curve. The function computes $result = \sum_{i=0}^{size-1} scalars[i] \cdot points[i]$, where input `points[]` uses affine coordinates, and `result` uses projective coordinates.
## What's in the example
1. Define the size of the MSM.
2. Generate random inputs on-host.
3. Configure the MSM.
4. Execute the MSM on-device.
5. Move the result to the host.
Running the example:
```sh
cargo run --release
```
You can add the `--features arkworks,profile` flag to measure the times of both ICICLE and arkworks.
> [!NOTE]
> The default sizes are 2^19 - 2^23. You can change this by passing the `--lower-bound-log-size <size>` and `--upper-bound-log-size <size>` options. To change the size range to 2^21 - 2^24, run the example like this:
> ```sh
> cargo run --release -- -l 21 -u 24
> ```
## Benchmarks
These benchmarks were run on a 16 core 24 thread i9-12900k CPU and an RTX 3090 Ti GPU.
### Single BN254 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 10 ms | 11 ms | 21 ms | 39 ms | 77 ms |
| Arkworks | 284 ms | 540 ms | 1,152 ms | 2,320 ms | 4,491 ms |
### Single BLS12377 MSM
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 9 ms | 14 ms | 25 ms | 48 ms | 93 ms |
| Arkworks | 490 ms | 918 ms | 1,861 ms | 3,624 ms | 7,191 ms |


@@ -0,0 +1,174 @@
use icicle_bn254::curve::{
CurveCfg,
ScalarCfg,
G1Projective
};
use icicle_bls12_377::curve::{
CurveCfg as BLS12377CurveCfg,
ScalarCfg as BLS12377ScalarCfg,
G1Projective as BLS12377G1Projective
};
use icicle_cuda_runtime::{
stream::CudaStream,
memory::DeviceSlice
};
use icicle_core::{
msm,
curve::Curve,
traits::GenerateRandom
};
#[cfg(feature = "arkworks")]
use icicle_core::traits::ArkConvertible;
#[cfg(feature = "arkworks")]
use ark_bn254::{
G1Projective as Bn254ArkG1Projective,
G1Affine as Bn254G1Affine,
Fr as Bn254Fr
};
#[cfg(feature = "arkworks")]
use ark_bls12_377::{
G1Projective as Bls12377ArkG1Projective,
G1Affine as Bls12377G1Affine,
Fr as Bls12377Fr
};
#[cfg(feature = "arkworks")]
use ark_ec::scalar_mul::variable_base::VariableBaseMSM;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Lower bound (inclusive) of MSM sizes to run for
#[arg(short, long, default_value_t = 19)]
lower_bound_log_size: u8,
/// Upper bound of MSM sizes to run for
#[arg(short, long, default_value_t = 23)]
upper_bound_log_size: u8,
}
fn main() {
let args = Args::parse();
let lower_bound = args.lower_bound_log_size;
let upper_bound = args.upper_bound_log_size;
println!("Running Icicle Examples: Rust MSM");
let upper_size = 1 << (upper_bound);
println!("Generating random inputs on host for bn254...");
let upper_points = CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars = ScalarCfg::generate_random(upper_size);
println!("Generating random inputs on host for bls12377...");
let upper_points_bls12377 = BLS12377CurveCfg::generate_random_affine_points(upper_size);
let upper_scalars_bls12377 = BLS12377ScalarCfg::generate_random(upper_size);
for i in lower_bound..=upper_bound {
let log_size = i;
let size = 1 << log_size;
println!("---------------------- MSM size 2^{}={} ------------------------", log_size, size);
// Setting Bn254 points and scalars
let points = &upper_points[..size];
let scalars = &upper_scalars[..size];
// Setting bls12377 points and scalars
let points_bls12377 = &upper_points_bls12377[..size];
let scalars_bls12377 = &upper_scalars_bls12377[..size];
println!("Configuring bn254 MSM...");
let mut msm_results: DeviceSlice<'_, G1Projective> = DeviceSlice::cuda_malloc(1).unwrap();
let stream = CudaStream::create().unwrap();
let mut cfg = msm::get_default_msm_config::<CurveCfg>();
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_results_on_device = true;
println!("Configuring bls12377 MSM...");
let mut msm_results_bls12377: DeviceSlice<'_, BLS12377G1Projective> = DeviceSlice::cuda_malloc(1).unwrap();
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = msm::get_default_msm_config::<BLS12377CurveCfg>();
cfg_bls12377.ctx.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
cfg_bls12377.are_results_on_device = true;
println!("Executing bn254 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(&scalars, &points, &cfg, &mut msm_results.as_slice()).unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
println!("Executing bls12377 MSM on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
msm::msm(&scalars_bls12377, &points_bls12377, &cfg_bls12377, &mut msm_results_bls12377.as_slice()).unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
println!("Moving results to host..");
let mut msm_host_result = vec![G1Projective::zero(); 1];
let mut msm_host_result_bls12377 = vec![BLS12377G1Projective::zero(); 1];
stream
.synchronize()
.unwrap();
msm_results
.copy_to_host(&mut msm_host_result[..])
.unwrap();
println!("bn254 result: {:#?}", msm_host_result);
stream_bls12377
.synchronize()
.unwrap();
msm_results_bls12377
.copy_to_host(&mut msm_host_result_bls12377[..])
.unwrap();
println!("bls12377 result: {:#?}", msm_host_result_bls12377);
#[cfg(feature = "arkworks")]
{
println!("Checking against arkworks...");
let ark_points: Vec<Bn254G1Affine> = points.iter().map(|&point| point.to_ark()).collect();
let ark_scalars: Vec<Bn254Fr> = scalars.iter().map(|scalar| scalar.to_ark()).collect();
let ark_points_bls12377: Vec<Bls12377G1Affine> = points_bls12377.iter().map(|point| point.to_ark()).collect();
let ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.iter().map(|scalar| scalar.to_ark()).collect();
#[cfg(feature = "profile")]
let start = Instant::now();
let bn254_ark_msm_res = Bn254ArkG1Projective::msm(&ark_points, &ark_scalars).unwrap();
println!("Arkworks Bn254 result: {:#?}", bn254_ark_msm_res);
#[cfg(feature = "profile")]
println!("Ark BN254 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
#[cfg(feature = "profile")]
let start = Instant::now();
let bls12377_ark_msm_res = Bls12377ArkG1Projective::msm(&ark_points_bls12377, &ark_scalars_bls12377).unwrap();
println!("Arkworks Bls12377 result: {:#?}", bls12377_ark_msm_res);
#[cfg(feature = "profile")]
println!("Ark BLS12377 MSM on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
let bn254_icicle_msm_res_as_ark = msm_host_result[0].to_ark();
let bls12377_icicle_msm_res_as_ark = msm_host_result_bls12377[0].to_ark();
println!("Bn254 MSM is correct: {}", bn254_ark_msm_res.eq(&bn254_icicle_msm_res_as_ark));
println!("Bls12377 MSM is correct: {}", bls12377_ark_msm_res.eq(&bls12377_icicle_msm_res_as_ark));
}
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()
.unwrap();
println!("");
}
}


@@ -0,0 +1,27 @@
# Use the specified base image
#FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.0.0-devel-ubuntu22.04
# Update and install dependencies
RUN apt-get update && apt-get install -y \
cmake \
protobuf-compiler \
curl \
build-essential \
git \
llvm \
clang \
&& rm -rf /var/lib/apt/lists/*
# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory in the container
WORKDIR /icicle-example
# Copy the content of the local directory to the working directory
COPY . .
# Specify the default command for the container
CMD ["/bin/bash"]


@@ -0,0 +1,23 @@
{
"name": "Icicle Examples: rust ntt",
"build": {
"dockerfile": "Dockerfile"
},
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-azuretools.vscode-docker",
"rust-lang.rust-analyzer",
"vadimcn.vscode-lldb"
]
}
}
}


@@ -0,0 +1,20 @@
[package]
name = "ntt"
version = "1.0.0"
edition = "2018"
[dependencies]
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev" }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
icicle-bls12-377 = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "dev", features = ["arkworks"] }
ark-ff = { version = "0.4.0" }
ark-poly = "0.4.0"
ark-std = "0.4.0"
ark-bn254 = { version = "0.4.0" }
ark-bls12-377 = { version = "0.4.0" }
clap = { version = "4.4.12", features = ["derive"] }
[features]
profile = []


@@ -0,0 +1,58 @@
# ICICLE example: Number Theoretic Transform (NTT) in Rust
`ICICLE` provides Rust bindings to CUDA-accelerated C++ implementation of [Number Theoretic Transform](https://github.com/ingonyama-zk/ingopedia/blob/master/src/fft.md).
## Best Practices
To save the time of setting up prerequisites manually, we recommend running this example in our [ZKContainer™](../../ZKContainer.md).
## Usage
```rust
ntt::ntt(
/* input slice */ scalars.as_slice(),
/* NTT Direction */ ntt::NTTDir::kForward,
/* NTT Configuration */ &cfg,
/* output slice */ ntt_results.as_slice()
).unwrap();
```
In this example we use the `BN254` and `BLS12377` fields.
## What's in this example
1. Define the size of the NTT.
2. Generate random inputs on-host.
3. Set up the domain.
4. Configure the NTT.
5. Execute the NTT on-device.
6. Move the result to the host.
7. Compare the results with arkworks.
Running the example:
```sh
cargo run --release
```
You can add the `--features profile` flag to measure the times of both ICICLE and arkworks.
> [!NOTE]
> The default size is 2^20. You can change this by passing the `--size <size>` option. To change the size to 2^23, run the example like this:
> ```sh
> cargo run --release -- -s 23
> ```
## Benchmarks
These benchmarks were run on a 16 core 24 thread i9-12900k CPU and an RTX 3090 Ti GPU.
### Single BN254 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.263 ms | 2.986 ms | 4.651 ms | 9.308 ms | 18.618 ms |
| Arkworks | 138 ms | 290 ms | 611 ms | 1,295 ms | 2,715 ms |
### Single BLS12377 NTT
| Library\Size | 2^19 | 2^20 | 2^21 | 2^22 | 2^23 |
|--------------|------|------|------|------|------|
| ICICLE | 1.272 ms | 2.893 ms | 4.728 ms | 9.211 ms | 18.319 ms |
| Arkworks | 135 ms | 286 ms | 605 ms | 1,279 ms | 2,682 ms |


@@ -0,0 +1,159 @@
use icicle_bn254::curve::{
ScalarCfg,
ScalarField,
};
use icicle_bls12_377::curve::{
ScalarCfg as BLS12377ScalarCfg,
ScalarField as BLS12377ScalarField
};
use icicle_cuda_runtime::{
stream::CudaStream,
memory::DeviceSlice,
device_context::get_default_device_context
};
use icicle_core::{
ntt::{self, NTT},
traits::{GenerateRandom, FieldImpl}
};
use icicle_core::traits::ArkConvertible;
use ark_bn254::Fr as Bn254Fr;
use ark_bls12_377::Fr as Bls12377Fr;
use ark_ff::FftField;
use ark_poly::{EvaluationDomain, Radix2EvaluationDomain};
use ark_std::cmp::{Ord, Ordering};
use std::convert::TryInto;
#[cfg(feature = "profile")]
use std::time::Instant;
use clap::Parser;
#[derive(Parser, Debug)]
struct Args {
/// Size of NTT to run (20 for 2^20)
#[arg(short, long, default_value_t = 20)]
size: u8,
}
fn main() {
let args = Args::parse();
println!("Running Icicle Examples: Rust NTT");
let log_size = args.size;
let size = 1 << log_size;
println!("---------------------- NTT size 2^{}={} ------------------------", log_size, size);
// Setting Bn254 points and scalars
println!("Generating random inputs on host for bn254...");
let scalars = ScalarCfg::generate_random(size);
let mut ntt_results: DeviceSlice<'_, ScalarField> = DeviceSlice::cuda_malloc(size).unwrap();
// Setting bls12377 points and scalars
println!("Generating random inputs on host for bls12377...");
let scalars_bls12377 = BLS12377ScalarCfg::generate_random(size);
let mut ntt_results_bls12377: DeviceSlice<'_, BLS12377ScalarField> = DeviceSlice::cuda_malloc(size).unwrap();
println!("Setting up bn254 Domain...");
let icicle_omega = <Bn254Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
let ctx = get_default_device_context();
ScalarCfg::initialize_domain(ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bn254 NTT...");
let stream = CudaStream::create().unwrap();
let mut cfg = ntt::get_default_ntt_config::<ScalarField>();
cfg.ctx.stream = &stream;
cfg.is_async = true;
cfg.are_outputs_on_device = true;
println!("Setting up bls12377 Domain...");
let icicle_omega = <Bls12377Fr as FftField>::get_root_of_unity(size.try_into().unwrap()).unwrap();
// reusing ctx from above
BLS12377ScalarCfg::initialize_domain(BLS12377ScalarField::from_ark(icicle_omega), &ctx).unwrap();
println!("Configuring bls12377 NTT...");
let stream_bls12377 = CudaStream::create().unwrap();
let mut cfg_bls12377 = ntt::get_default_ntt_config::<BLS12377ScalarField>();
cfg_bls12377.ctx.stream = &stream_bls12377;
cfg_bls12377.is_async = true;
cfg_bls12377.are_outputs_on_device = true;
println!("Executing bn254 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(scalars.as_slice(), ntt::NTTDir::kForward, &cfg, ntt_results.as_slice()).unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BN254 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
println!("Executing bls12377 NTT on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
ntt::ntt(scalars_bls12377.as_slice(), ntt::NTTDir::kForward, &cfg_bls12377, ntt_results_bls12377.as_slice()).unwrap();
#[cfg(feature = "profile")]
println!("ICICLE BLS12377 NTT on size 2^{log_size} took: {} μs", start.elapsed().as_micros());
println!("Moving results to host..");
stream
.synchronize()
.unwrap();
let mut host_bn254_results = vec![ScalarField::zero(); size];
ntt_results
.copy_to_host(&mut host_bn254_results[..])
.unwrap();
stream_bls12377
.synchronize()
.unwrap();
let mut host_bls12377_results = vec![BLS12377ScalarField::zero(); size];
ntt_results_bls12377
.copy_to_host(&mut host_bls12377_results[..])
.unwrap();
println!("Checking against arkworks...");
let mut ark_scalars: Vec<Bn254Fr> = scalars.iter().map(|scalar| scalar.to_ark()).collect();
let bn254_domain = <Radix2EvaluationDomain<Bn254Fr> as EvaluationDomain<Bn254Fr>>::new(size).unwrap();
let mut ark_scalars_bls12377: Vec<Bls12377Fr> = scalars_bls12377.iter().map(|scalar| scalar.to_ark()).collect();
let bls12_377_domain = <Radix2EvaluationDomain<Bls12377Fr> as EvaluationDomain<Bls12377Fr>>::new(size).unwrap();
#[cfg(feature = "profile")]
let start = Instant::now();
bn254_domain.fft_in_place(&mut ark_scalars);
#[cfg(feature = "profile")]
println!("Ark BN254 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
#[cfg(feature = "profile")]
let start = Instant::now();
bls12_377_domain.fft_in_place(&mut ark_scalars_bls12377);
#[cfg(feature = "profile")]
println!("Ark BLS12377 NTT on size 2^{log_size} took: {} ms", start.elapsed().as_millis());
host_bn254_results
.iter()
.zip(ark_scalars.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bn254 NTT is correct");
host_bls12377_results
.iter()
.zip(ark_scalars_bls12377.iter())
.for_each(|(icicle_scalar, &ark_scalar)| {
assert_eq!(ark_scalar.cmp(&icicle_scalar.to_ark()), Ordering::Equal);
});
println!("Bls12377 NTT is correct");
println!("Cleaning up bn254...");
stream
.destroy()
.unwrap();
println!("Cleaning up bls12377...");
stream_bls12377
.destroy()
.unwrap();
println!("");
}