Bump rust crates' version

icicle-babybear@2.8.0 icicle-bls12-377@2.8.0 icicle-bls12-381@2.8.0 icicle-bn254@2.8.0 icicle-bw6-761@2.8.0 icicle-core@2.8.0 icicle-cuda-runtime@2.8.0 icicle-grumpkin@2.8.0 icicle-hash@2.8.0 icicle-m31@2.8.0 icicle-stark252@2.8.0 Generated by cargo-workspaces
Feat/roman/hash docs (#556 )
2026-01-13 09:27:58 -05:00 · 2024-07-16 13:57:56 +00:00 · 2024-07-16 16:39:35 +03:00
17 changed files with 285 additions and 301 deletions
--- a/docs/docs/icicle/primitives/keccak.md
+++ b/docs/docs/icicle/primitives/keccak.md
@@ -14,9 +14,62 @@ At its core, Keccak consists of a permutation function operating on a state arra

 ## Using Keccak

-ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree.
+ICICLE Keccak supports batch hashing, which can be utilized for constructing a merkle tree or running multiple hashes in parallel.

 ### Supported Bindings

 - [Golang](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/golang/hash/keccak)
- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
+- [Rust](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-hash)
+
+### Example usage
+
+This is an example of running 1024 Keccak-256 hashes in parallel, where input strings are of size 136 bytes:
+
+```rust
+use icicle_core::hash::HashConfig;
+use icicle_cuda_runtime::memory::HostSlice;
+use icicle_hash::keccak::keccak256;
+
+let config = HashConfig::default();
+let input_block_len = 136;
+let number_of_hashes = 1024;
+
+let preimages = vec![1u8; number_of_hashes * input_block_len];
+let mut digests = vec![0u8; number_of_hashes * 64];
+
+let preimages_slice = HostSlice::from_slice(&preimages);
+let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+keccak256(
+    preimages_slice,
+    input_block_len as u32,
+    number_of_hashes as u32,
+    digests_slice,
+    &config,
+)
+.unwrap();
+```
+
+### Merkle Tree
+
+You can build a keccak merkle tree using the corresponding functions:
+
+```rust
+use icicle_core::tree::{merkle_tree_digests_len, TreeBuilderConfig};
+use icicle_cuda_runtime::memory::HostSlice;
+use icicle_hash::keccak::build_keccak256_merkle_tree;
+
+let mut config = TreeBuilderConfig::default();
+config.arity = 2;
+let height = 22;
+let input_block_len = 136;
+let leaves = vec![1u8; (1 << height) * input_block_len];
+let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
+
+let leaves_slice = HostSlice::from_slice(&leaves);
+let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+build_keccak256_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
+```
+
+In the example above, a binary tree of height 22 is being built. Each leaf is considered to be a 136 byte long array. The leaves and digests are aligned in a flat array. You can also use keccak512 in `build_keccak512_merkle_tree` function.
--- a/docs/docs/icicle/primitives/poseidon.md
+++ b/docs/docs/icicle/primitives/poseidon.md
@@ -53,6 +53,7 @@ So for Poseidon of arity 2 and input of size 1024 * 2, we would expect 1024 elem

 ### Supported Bindings

+[`Go`](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/golang/curves/bn254/poseidon/poseidon.go)
 [`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon)

 ### Constants
@@ -91,8 +92,6 @@ primitive_element = 7 # bls12-381
 # primitive_element = 15 # bw6-761
 ```

-We only support `alpha = 5` so if you want to use another alpha for S-box please reach out on discord or open a github issue.
-
 ### Rust API

 This is the most basic way to use the Poseidon API.
@@ -101,71 +100,58 @@ This is the most basic way to use the Poseidon API.
 let test_size = 1 << 10;
 let arity = 2u32;
 let ctx = get_default_device_context();
-let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
-let config = PoseidonConfig::default();
+let poseidon = Poseidon::load(arity, &ctx).unwrap();
+let config = HashConfig::default();

 let inputs = vec![F::one(); test_size * arity as usize];
 let outputs = vec![F::zero(); test_size];
 let mut input_slice = HostOrDeviceSlice::on_host(inputs);
 let mut output_slice = HostOrDeviceSlice::on_host(outputs);

-poseidon_hash_many::<F>(
+poseidon.hash_many::<F>(
    &mut input_slice,
    &mut output_slice,
    test_size as u32,
    arity as u32,
-    &constants,
+    1, // Output length
    &config,
 )
 .unwrap();
 ```

-The `PoseidonConfig::default()` can be modified, by default the inputs and outputs are set to be on `Host` for example.
+The `HashConfig` can be modified, by default the inputs and outputs are set to be on `Host` for example.

 ```rust
-impl<'a> Default for PoseidonConfig<'a> {
+impl<'a> Default for HashConfig<'a> {
    fn default() -> Self {
        let ctx = get_default_device_context();
        Self {
            ctx,
            are_inputs_on_device: false,
            are_outputs_on_device: false,
-            input_is_a_state: false,
-            aligned: false,
-            loop_state: false,
            is_async: false,
        }
    }
 }
 ```

-In the example above `load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.
+In the example above `Poseidon::load(arity, &ctx).unwrap();` is used which will load the correct constants based on arity and curve. Its possible to [generate](#constants) your own constants and load them.

 ```rust
 let ctx = get_default_device_context();
-    let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
-    let constants_file = PathBuf::from(cargo_manifest_dir)
-        .join("tests")
-        .join(format!("{}_constants.bin", field_prefix));
-    let mut constants_buf = vec![];
-    File::open(constants_file)
-        .unwrap()
-        .read_to_end(&mut constants_buf)
-        .unwrap();
-
-    let mut custom_constants = vec![];
-    for chunk in constants_buf.chunks(field_bytes) {
-        custom_constants.push(F::from_bytes_le(chunk));
-    }
-
-    let custom_constants = create_optimized_poseidon_constants::<F>(
-        arity as u32,
-        &ctx,
-        full_rounds_half,
-        partial_rounds,
-        &mut custom_constants,
-    )
-    .unwrap();
+let custom_poseidon = Poseidon::new(
+    arity, // The arity of poseidon hash. The width will be equal to arity + 1
+    alpha, // The S-box power
+    full_rounds_half,
+    partial_rounds,
+    round_constants,
+    mds_matrix, 
+    non_sparse_matrix,
+    sparse_matrices,
+    domain_tag,
+    ctx,
+)
+.unwrap();
 ```

 ## The Tree Builder
@@ -175,21 +161,34 @@ The tree builder allows you to build Merkle trees using Poseidon.
 You can define both the tree's `height` and its `arity`. The tree `height` determines the number of layers in the tree, including the root and the leaf layer. The `arity` determines how many children each internal node can have.

 ```rust
-let height = 20;
-let arity = 2;
-let leaves = vec![F::one(); 1 << (height - 1)];
-let mut digests = vec![F::zero(); merkle_tree_digests_len(height, arity)];
-
-let mut leaves_slice = HostOrDeviceSlice::on_host(leaves);
-
-let ctx = get_default_device_context();
-let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
+use icicle_bn254::tree::Bn254TreeBuilder;
+use icicle_bn254::poseidon::Poseidon;

 let mut config = TreeBuilderConfig::default();
-config.keep_rows = 1;
-build_poseidon_merkle_tree::<F>(&mut leaves_slice, &mut digests, height, arity, &constants, &config).unwrap();
+let arity = 2;
+config.arity = arity as u32;
+let input_block_len = arity;
+let leaves = vec![F::one(); (1 << height) * arity];
+let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];

-println!("Root: {:?}", digests[0..1][0]);
+let leaves_slice = HostSlice::from_slice(&leaves);
+let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+let ctx = device_context::DeviceContext::default();
+let hash = Poseidon::load(2, &ctx).unwrap();
+
+let mut config = TreeBuilderConfig::default();
+config.keep_rows = 5;
+Bn254TreeBuilder::build_merkle_tree(
+    leaves_slice,
+    digests_slice,
+    height,
+    input_block_len,
+    &hash,
+    &hash,
+    &config,
+)
+.unwrap();
 ```

 Similar to Poseidon, you can also configure the Tree Builder `TreeBuilderConfig::default()`
--- a/docs/docs/icicle/primitives/poseidon2.md
+++ b/docs/docs/icicle/primitives/poseidon2.md
@@ -0,0 +1,88 @@
+# Poseidon2
+
+[Poseidon2](https://eprint.iacr.org/2023/323) is a recently released optimized version of Poseidon1. The two versions differ in two crucial points. First, Poseidon is a sponge hash function, while Poseidon2 can be either a sponge or a compression function depending on the use case. Secondly, Poseidon2 is instantiated by new and more efficient linear layers with respect to Poseidon. These changes decrease the number of multiplications in the linear layer by up to 90% and the number of constraints in Plonk circuits by up to 70%. This makes Poseidon2 currently the fastest arithmetization-oriented hash function without lookups.
+
+
+## Using Poseidon2
+
+ICICLE Poseidon2 is implemented for GPU and parallelization is performed for each state.
+We calculate multiple hash-sums over multiple pre-images in parallel, rather than going block by block over the input vector.
+
+For example, for Poseidon2 of width 16, input rate 8, output elements 8 and input of size 1024 * 8, we would expect 1024 * 8 elements of output. Which means each input block would be of size 8, resulting in 1024 Poseidon2 hashes being performed.
+
+### Supported Bindings
+
+[`Rust`](https://github.com/ingonyama-zk/icicle/tree/main/wrappers/rust/icicle-core/src/poseidon2)
+
+### Constants
+
+Poseidon2 is also extremely customizable and using different constants will produce different hashes, security levels and performance results.
+
+We support pre-calculated constants for each of the [supported curves](../core#supported-curves-and-operations). The constants can be found [here](https://github.com/ingonyama-zk/icicle/tree/main/icicle/include/poseidon2/constants) and are labeled clearly per curve `<curve_name>_poseidon2.h`.
+
+You can also use your own set of constants as shown [here](https://github.com/ingonyama-zk/icicle/blob/main/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs#L290)
+
+### Rust API
+
+This is the most basic way to use the Poseidon2 API.
+
+```rust
+let test_size = 1 << 10;
+let width = 16;
+let rate = 8;
+let ctx = get_default_device_context();
+let poseidon = Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
+let config = HashConfig::default();
+
+let inputs = vec![F::one(); test_size * rate as usize];
+let outputs = vec![F::zero(); test_size];
+let mut input_slice = HostOrDeviceSlice::on_host(inputs);
+let mut output_slice = HostOrDeviceSlice::on_host(outputs);
+
+poseidon.hash_many::<F>(
+    &mut input_slice,
+    &mut output_slice,
+    test_size as u32,
+    rate as u32,
+    8, // Output length
+    &config,
+)
+.unwrap();
+```
+
+In the example above `Poseidon2::load(width, rate, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();` is used to load the correct constants based on width and curve. Here, the default MDS matrices and diffusion are used. If you want to get a Plonky3 compliant version, set them to `MdsType::Plonky` and `DiffusionStrategy::Montgomery` respectively.
+
+## The Tree Builder
+
+Similar to Poseidon1, you can use Poseidon2 in a tree builder.
+
+```rust
+use icicle_bn254::tree::Bn254TreeBuilder;
+use icicle_bn254::poseidon2::Poseidon2;
+
+let mut config = TreeBuilderConfig::default();
+let arity = 2;
+config.arity = arity as u32;
+let input_block_len = arity;
+let leaves = vec![F::one(); (1 << height) * arity];
+let mut digests = vec![F::zero(); merkle_tree_digests_len((height + 1) as u32, arity as u32, 1)];
+
+let leaves_slice = HostSlice::from_slice(&leaves);
+let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+let ctx = device_context::DeviceContext::default();
+let hash = Poseidon2::load(arity, arity, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
+
+let mut config = TreeBuilderConfig::default();
+config.keep_rows = 5;
+Bn254TreeBuilder::build_merkle_tree(
+    leaves_slice,
+    digests_slice,
+    height,
+    input_block_len,
+    &hash,
+    &hash,
+    &config,
+)
+.unwrap();
+```
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -3680,6 +3680,8 @@
      "version": "8.12.0",
      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
      "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
+      "optional": true,
+      "peer": true,
      "dependencies": {
        "fast-deep-equal": "^3.1.1",
        "json-schema-traverse": "^1.0.0",
@@ -3694,7 +3696,9 @@
    "node_modules/ajv-formats/node_modules/json-schema-traverse": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
+      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+      "optional": true,
+      "peer": true
    },
    "node_modules/ajv-keywords": {
      "version": "3.5.2",
@@ -16340,14 +16344,13 @@
      "version": "2.1.1",
      "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz",
      "integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==",
-      "requires": {
-        "ajv": "^8.0.0"
-      },
+      "requires": {},
      "dependencies": {
        "ajv": {
-          "version": "8.12.0",
-          "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
+          "version": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz",
          "integrity": "sha512-sRu1kpcO9yLtYxBKvqfTeh9KzZEwO3STyX1HT+4CaDzC6HpTGYhIhPIzj9XuKU7KYDwnaeh5hcOwjy1QuJzBPA==",
+          "optional": true,
+          "peer": true,
          "requires": {
            "fast-deep-equal": "^3.1.1",
            "json-schema-traverse": "^1.0.0",
@@ -16358,7 +16361,9 @@
        "json-schema-traverse": {
          "version": "1.0.0",
          "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
-          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="
+          "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+          "optional": true,
+          "peer": true
        }
      }
    },
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -53,6 +53,11 @@ module.exports = {
              label: "Poseidon Hash",
              id: "icicle/primitives/poseidon",
            },
+            {
+              type: "doc",
+              label: "Poseidon2 Hash",
+              id: "icicle/primitives/poseidon2",
+            },
          ],
        },
        {
--- a/examples/c++/msm/CMakeLists.txt
+++ b/examples/c++/msm/CMakeLists.txt
@@ -19,5 +19,5 @@ add_executable(
  example.cu
 )
 target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bls12_377.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_curve_bn254.a)
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/msm/compile.sh
+++ b/examples/c++/msm/compile.sh
@@ -10,9 +10,6 @@ mkdir -p build/icicle
 cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DG2=ON
 cmake --build build/icicle

-cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377 -DG2=ON
-cmake --build build/icicle
-
 # Configure and build the example application
 cmake -S . -B build/example
 cmake --build build/example
--- a/examples/c++/msm/example.cu
+++ b/examples/c++/msm/example.cu
@@ -3,23 +3,9 @@
 #include <iomanip>

 #include "api/bn254.h"
-#include "api/bls12_377.h"
+using namespace bn254;

-// using namespace bn254;
-typedef bn254::scalar_t scalar_bn254;
-typedef bn254::affine_t affine_bn254;
-typedef bn254::g2_affine_t g2_affine_bn254;
-typedef bn254::projective_t projective_bn254;
-typedef bn254::g2_projective_t g2_projective_bn254;
-
-typedef bls12_377::scalar_t scalar_bls12377;
-typedef bls12_377::affine_t affine_bls12377;
-typedef bls12_377::g2_affine_t g2_affine_bls12377;
-typedef bls12_377::projective_t projective_bls12377;
-typedef bls12_377::g2_projective_t g2_projective_bls12377;
-
-
-int msm_bn254(int argc, char* argv[])
+int main(int argc, char* argv[])
 {
  std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
  std::cout << "Example parameters" << std::endl;
@@ -32,11 +18,11 @@ int msm_bn254(int argc, char* argv[])
  std::cout << "Part I: use G1 points" << std::endl;

  std::cout << "Generating random inputs on-host" << std::endl;
-  scalar_bn254* scalars = new scalar_bn254[N];
-  affine_bn254* points = new affine_bn254[N];
-  projective_bn254 result;
-  scalar_bn254::rand_host_many(scalars, N);
-  projective_bn254::rand_host_many_affine(points, N);
+  scalar_t* scalars = new scalar_t[N];
+  affine_t* points = new affine_t[N];
+  projective_t result;
+  scalar_t::rand_host_many(scalars, N);
+  projective_t::rand_host_many_affine(points, N);

  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
  device_context::DeviceContext ctx = device_context::get_default_device_context();
@@ -62,17 +48,17 @@ int msm_bn254(int argc, char* argv[])
  cudaStream_t stream = config.ctx.stream;
  // Execute the MSM kernel
  bn254_msm_cuda(scalars, points, msm_size, config, &result);
-  std::cout << projective_bn254::to_affine(result) << std::endl;
+  std::cout << projective_t::to_affine(result) << std::endl;

  std::cout << "Copying inputs on-device" << std::endl;
-  scalar_bn254* scalars_d;
-  affine_bn254* points_d;
-  projective_bn254* result_d;
-  cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
-  cudaMalloc(&points_d, sizeof(affine_bn254) * N);
-  cudaMalloc(&result_d, sizeof(projective_bn254));
-  cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(points_d, points, sizeof(affine_bn254) * N, cudaMemcpyHostToDevice);
+  scalar_t* scalars_d;
+  affine_t* points_d;
+  projective_t* result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
+  cudaMalloc(&points_d, sizeof(affine_t) * N);
+  cudaMalloc(&result_d, sizeof(projective_t));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(points_d, points, sizeof(affine_t) * N, cudaMemcpyHostToDevice);

  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
  config.are_results_on_device = true;
@@ -84,9 +70,9 @@ int msm_bn254(int argc, char* argv[])
  bn254_msm_cuda(scalars_d, points_d, msm_size, config, result_d);

  // Copy the result back to the host
-  cudaMemcpy(&result, result_d, sizeof(projective_bn254), cudaMemcpyDeviceToHost);
+  cudaMemcpy(&result, result_d, sizeof(projective_t), cudaMemcpyDeviceToHost);
  // Print the result
-  std::cout << projective_bn254::to_affine(result) << std::endl;
+  std::cout << projective_t::to_affine(result) << std::endl;
  // Free the device memory
  cudaFree(scalars_d);
  cudaFree(points_d);
@@ -98,25 +84,25 @@ int msm_bn254(int argc, char* argv[])

  std::cout << "Generating random inputs on-host" << std::endl;
  // use the same scalars
-  g2_affine_bn254* g2_points = new g2_affine_bn254[N];
-  g2_projective_bn254::rand_host_many_affine(g2_points, N);
+  g2_affine_t* g2_points = new g2_affine_t[N];
+  g2_projective_t::rand_host_many_affine(g2_points, N);

  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
  config.are_results_on_device = false;
  config.are_scalars_on_device = false;
  config.are_points_on_device = false;
-  g2_projective_bn254 g2_result;
+  g2_projective_t g2_result;
  bn254_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
-  std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;

  std::cout << "Copying inputs on-device" << std::endl;
-  g2_affine_bn254* g2_points_d;
-  g2_projective_bn254* g2_result_d;
-  cudaMalloc(&scalars_d, sizeof(scalar_bn254) * N);
-  cudaMalloc(&g2_points_d, sizeof(g2_affine_bn254) * N);
-  cudaMalloc(&g2_result_d, sizeof(g2_projective_bn254));
-  cudaMemcpy(scalars_d, scalars, sizeof(scalar_bn254) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bn254) * N, cudaMemcpyHostToDevice);
+  g2_affine_t* g2_points_d;
+  g2_projective_t* g2_result_d;
+  cudaMalloc(&scalars_d, sizeof(scalar_t) * N);
+  cudaMalloc(&g2_points_d, sizeof(g2_affine_t) * N);
+  cudaMalloc(&g2_result_d, sizeof(g2_projective_t));
+  cudaMemcpy(scalars_d, scalars, sizeof(scalar_t) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_t) * N, cudaMemcpyHostToDevice);

  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
  config.are_results_on_device = true;
@@ -125,140 +111,14 @@ int msm_bn254(int argc, char* argv[])

  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
  bn254_g2_msm_cuda(scalars_d, g2_points_d, msm_size, config, g2_result_d);
-  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bn254), cudaMemcpyDeviceToHost);
-  std::cout << g2_projective_bn254::to_affine(g2_result) << std::endl;
+  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_t), cudaMemcpyDeviceToHost);
+  std::cout << g2_projective_t::to_affine(g2_result) << std::endl;

  cudaFree(scalars_d);
  cudaFree(g2_points_d);
  cudaFree(g2_result_d);
  delete[] g2_points;
  delete[] scalars;
-  return 0;
-}
-
-int msm_bls12_377(int argc, char* argv[])
-{
-  std::cout << "Icicle example: Muli-Scalar Multiplication (MSM)" << std::endl;
-  std::cout << "Example parameters" << std::endl;
-  int batch_size = 1;
-  std::cout << "Batch size: " << batch_size << std::endl;
-  unsigned msm_size = 1048576;
-  std::cout << "MSM size: " << msm_size << std::endl;
-  int N = batch_size * msm_size;
-
-  std::cout << "Part I: use G1 points" << std::endl;
-
-  std::cout << "Generating random inputs on-host" << std::endl;
-  scalar_bls12377* scalars = new scalar_bls12377[N];
-  affine_bls12377* points = new affine_bls12377[N];
-  projective_bls12377 result;
-  scalar_bls12377::rand_host_many(scalars, N);
-  projective_bls12377::rand_host_many_affine(points, N);
-
-  std::cout << "Using default MSM configuration with on-host inputs" << std::endl;
-  device_context::DeviceContext ctx = device_context::get_default_device_context();
-  msm::MSMConfig config = {
-    ctx,   // ctx
-    0,     // points_size
-    1,     // precompute_factor
-    0,     // c
-    0,     // bitsize
-    10,    // large_bucket_factor
-    1,     // batch_size
-    false, // are_scalars_on_device
-    false, // are_scalars_montgomery_form
-    false, // are_points_on_device
-    false, // are_points_montgomery_form
-    false, // are_results_on_device
-    false, // is_big_triangle
-    false, // is_async
-  };
-  config.batch_size = batch_size;
-
-  std::cout << "Running MSM kernel with on-host inputs" << std::endl;
-  cudaStream_t stream = config.ctx.stream;
-  cudaStreamCreate(&stream);
-  // Execute the MSM kernel
-  bls12_377_msm_cuda(scalars, points, msm_size, config, &result);
-  std::cout << projective_bls12377::to_affine(result) << std::endl;
-
-  std::cout << "Copying inputs on-device" << std::endl;
-  scalar_bls12377* scalars_d_bls;
-  affine_bls12377* points_d_bls;
-  projective_bls12377* result_d_bls;
-  cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
-  cudaMalloc(&points_d_bls, sizeof(affine_bls12377) * N);
-  cudaMalloc(&result_d_bls, sizeof(projective_bls12377));
-  cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(points_d_bls, points, sizeof(affine_bls12377) * N, cudaMemcpyHostToDevice);
-
-  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
-  config.are_results_on_device = true;
-  config.are_scalars_on_device = true;
-  config.are_points_on_device = true;
-
-  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
-  // Execute the MSM kernel
-  bls12_377_msm_cuda(scalars_d_bls, points_d_bls, msm_size, config, result_d_bls);
-
-  // Copy the result back to the host
-  cudaMemcpy(&result, result_d_bls, sizeof(projective_bls12377), cudaMemcpyDeviceToHost);
-  // Print the result
-  std::cout << projective_bls12377::to_affine(result) << std::endl;
-  // Free the device memory
-  cudaFree(scalars_d_bls);
-  cudaFree(points_d_bls);
-  cudaFree(result_d_bls);
-  // Free the host memory, keep scalars for G2 example
-  delete[] points;
-
-  std::cout << "Part II: use G2 points" << std::endl;
-
-  std::cout << "Generating random inputs on-host" << std::endl;
-  // use the same scalars
-  g2_affine_bls12377* g2_points = new g2_affine_bls12377[N];
-  g2_projective_bls12377::rand_host_many_affine(g2_points, N);
-
-  std::cout << "Reconfiguring MSM to use on-host inputs" << std::endl;
-  config.are_results_on_device = false;
-  config.are_scalars_on_device = false;
-  config.are_points_on_device = false;
-  g2_projective_bls12377 g2_result;
-  bls12_377_g2_msm_cuda(scalars, g2_points, msm_size, config, &g2_result);
-  std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
-
-  std::cout << "Copying inputs on-device" << std::endl;
-  g2_affine_bls12377* g2_points_d;
-  g2_projective_bls12377* g2_result_d;
-  cudaMalloc(&scalars_d_bls, sizeof(scalar_bls12377) * N);
-  cudaMalloc(&g2_points_d, sizeof(g2_affine_bls12377) * N);
-  cudaMalloc(&g2_result_d, sizeof(g2_projective_bls12377));
-  cudaMemcpy(scalars_d_bls, scalars, sizeof(scalar_bls12377) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(g2_points_d, g2_points, sizeof(g2_affine_bls12377) * N, cudaMemcpyHostToDevice);
-
-  std::cout << "Reconfiguring MSM to use on-device inputs" << std::endl;
-  config.are_results_on_device = true;
-  config.are_scalars_on_device = true;
-  config.are_points_on_device = true;
-
-  std::cout << "Running MSM kernel with on-device inputs" << std::endl;
-  bls12_377_g2_msm_cuda(scalars_d_bls, g2_points_d, msm_size, config, g2_result_d);
-  cudaMemcpy(&g2_result, g2_result_d, sizeof(g2_projective_bn254), cudaMemcpyDeviceToHost);
-  std::cout << g2_projective_bls12377::to_affine(g2_result) << std::endl;
-
-  cudaFree(scalars_d_bls);
-  cudaFree(g2_points_d);
-  cudaFree(g2_result_d);
-  delete[] g2_points;
-  delete[] scalars;
-  return 0;
-}
-
-int main(int argc, char* argv[])
-{ 
-  std::cout << "Starting BN254 MSM" << std::endl;
-  msm_bn254(argc, argv);
-  std::cout << "Starting BLS12-377 MSM" << std::endl;
-  msm_bls12_377(argc, argv);
+  cudaStreamDestroy(stream);
  return 0;
 }
--- a/examples/c++/multiply/CMakeLists.txt
+++ b/examples/c++/multiply/CMakeLists.txt
@@ -18,7 +18,7 @@ add_executable(
  example.cu
 )
 target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bls12_377.a)
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
 find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
 target_link_libraries(example ${NVML_LIBRARY})
 set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
--- a/examples/c++/multiply/compile.sh
+++ b/examples/c++/multiply/compile.sh
@@ -7,13 +7,9 @@ mkdir -p build/example
 mkdir -p build/icicle

 # Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bls12_377
-cmake --build build/icicle
-rm build/icicle/CMakeCache.txt 
 cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
 cmake --build build/icicle

-
 # Configure and build the example application
 cmake -S . -B build/example
 cmake --build build/example
--- a/examples/c++/multiply/example.cu
+++ b/examples/c++/multiply/example.cu
@@ -4,17 +4,14 @@
 #include <nvml.h>

 #include "api/bn254.h"
-#include "api/bls12_377.h"
 #include "vec_ops/vec_ops.cuh"

 using namespace vec_ops;
-// using namespace bn254;
-typedef bn254::scalar_t T;
+using namespace bn254;

-typedef  bls12_377::scalar_t T_bls;
+typedef scalar_t T;

-
-int vector_mult_bn254(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
+int vector_mult(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, device_context::DeviceContext ctx)
 {
  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
  config.is_a_on_device = true;
@@ -28,24 +25,10 @@ int vector_mult_bn254(T* vec_b, T* vec_a, T* vec_result, size_t n_elments, devic
  return 0;
 }

-int vector_mult_bls12377(T_bls* vec_b, T_bls* vec_a, T_bls* vec_result, size_t n_elments, device_context::DeviceContext ctx)
-{
-  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
-  config.is_a_on_device = true;
-  config.is_b_on_device = true;
-  config.is_result_on_device = true;
-  cudaError_t err = bls12_377_mul_cuda(vec_a, vec_b, n_elments, config, vec_result);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
-    return 0;
-  }
-  return 0;
-}
-
 int main(int argc, char** argv)
 {
  const unsigned vector_size = 1 << 15;
-  const unsigned repetitions = 1 ;
+  const unsigned repetitions = 1 << 15;

  cudaError_t err;
  nvmlInit();
@@ -78,53 +61,41 @@ int main(int argc, char** argv)
  // host data
  T* host_in1 = (T*)malloc(vector_size * sizeof(T));
  T* host_in2 = (T*)malloc(vector_size * sizeof(T));
-  T_bls* host_in1_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
-  T_bls* host_in2_bls12377 = (T_bls*)malloc(vector_size * sizeof(T_bls));
  std::cout << "Initializing vectors with random data" << std::endl;
  T::rand_host_many(host_in1, vector_size);
  T::rand_host_many(host_in2, vector_size);
-  T_bls::rand_host_many(host_in1_bls12377, vector_size);
-  T_bls::rand_host_many(host_in2_bls12377, vector_size);
  // device data
  device_context::DeviceContext ctx = device_context::get_default_device_context();
-  T* device_in1_bn254;
-  T* device_in2_bn254;
-  T* device_out_bn254;
-  T_bls* device_in1_bls12377;
-  T_bls* device_in2_bls12377;
-  T_bls* device_out_bls12377;
+  T* device_in1;
+  T* device_in2;
+  T* device_out;

-  err = cudaMalloc((void**)&device_in1_bn254, vector_size * sizeof(T));
-  err = cudaMalloc((void**)&device_in1_bls12377, vector_size * sizeof(T_bls));
+  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

-  err = cudaMalloc((void**)&device_in2_bn254, vector_size * sizeof(T));
-  err = cudaMalloc((void**)&device_in2_bls12377, vector_size * sizeof(T_bls));
+  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

-  err = cudaMalloc((void**)&device_out_bn254, vector_size * sizeof(T));
-  err = cudaMalloc((void**)&device_out_bls12377, vector_size * sizeof(T_bls));
+  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
  if (err != cudaSuccess) {
    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

  // copy from host to device
-  err = cudaMemcpy(device_in1_bn254, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
-  err = cudaMemcpy(device_in1_bls12377, host_in1_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
  }

-  err = cudaMemcpy(device_in2_bn254, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
-  err = cudaMemcpy(device_in2_bls12377, host_in2_bls12377, vector_size * sizeof(T_bls), cudaMemcpyHostToDevice);
+  err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
    return 0;
@@ -133,10 +104,7 @@ int main(int argc, char** argv)
  std::cout << "Starting warm-up" << std::endl;
  // Warm-up loop
  for (int i = 0; i < repetitions; i++) {
-    std::cout << "bn254 mult" << std::endl;
-    vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
-    std::cout << "bls12-377 mult" << std::endl;
-    vector_mult_bls12377(device_in1_bls12377, device_in2_bls12377, device_out_bls12377, vector_size, ctx);
+    vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
  }

  std::cout << "Starting benchmarking" << std::endl;
@@ -154,7 +122,7 @@ int main(int argc, char** argv)
  auto start_time = std::chrono::high_resolution_clock::now();
  // Benchmark loop
  for (int i = 0; i < repetitions; i++) {
-    vector_mult_bn254(device_in1_bn254, device_in2_bn254, device_out_bn254, vector_size, ctx);
+    vector_mult(device_in1, device_in2, device_out, vector_size, ctx);
  }
  auto end_time = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
@@ -178,7 +146,7 @@ int main(int argc, char** argv)
  // Optional: validate multiplication
  T* host_out = (T*)malloc(vector_size * sizeof(T));

-  cudaMemcpy(host_out, device_out_bn254, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+  cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);

  // validate multiplication here...

@@ -186,9 +154,9 @@ int main(int argc, char** argv)
  free(host_in1);
  free(host_in2);
  free(host_out);
-  cudaFree(device_in1_bn254);
-  cudaFree(device_in2_bn254);
-  cudaFree(device_out_bn254);
+  cudaFree(device_in1);
+  cudaFree(device_in2);
+  cudaFree(device_out);
  nvmlShutdown();
  return 0;
 }
--- a/icicle/include/hash/keccak/keccak.cuh
+++ b/icicle/include/hash/keccak/keccak.cuh
@@ -11,6 +11,17 @@
 using namespace hash;

 namespace keccak {
+  // Input rate in bytes
+  const int KECCAK_256_RATE = 136;
+  const int KECCAK_512_RATE = 72;
+
+  // Digest size in u64
+  const int KECCAK_256_DIGEST = 4;
+  const int KECCAK_512_DIGEST = 8;
+
+  // Number of state elements in u64
+  const int KECCAK_STATE_SIZE = 25;
+
  class Keccak : public Hasher<uint8_t, uint64_t>
  {
  public:
@@ -22,7 +33,7 @@ namespace keccak {
      unsigned int output_len,
      const device_context::DeviceContext& ctx) const override;

-    Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(25, 25, rate, 0) {}
+    Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(KECCAK_STATE_SIZE, KECCAK_STATE_SIZE, rate, 0) {}
  };
 } // namespace keccak

--- a/icicle/src/hash/keccak/extern.cu
+++ b/icicle/src/hash/keccak/extern.cu
@@ -11,13 +11,15 @@ namespace keccak {
  extern "C" cudaError_t
  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
  {
-    return Keccak(136).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 4, config);
+    return Keccak(KECCAK_256_RATE)
+      .hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_256_DIGEST, config);
  }

  extern "C" cudaError_t
  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
  {
-    return Keccak(72).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 8, config);
+    return Keccak(KECCAK_512_RATE)
+      .hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, KECCAK_512_DIGEST, config);
  }

  extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
@@ -27,7 +29,7 @@ namespace keccak {
    unsigned int input_block_len,
    const merkle_tree::TreeBuilderConfig& tree_config)
  {
-    Keccak keccak(136);
+    Keccak keccak(KECCAK_256_RATE);
    return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
      leaves, digests, height, input_block_len, keccak, keccak, tree_config);
  }
@@ -39,7 +41,7 @@ namespace keccak {
    unsigned int input_block_len,
    const merkle_tree::TreeBuilderConfig& tree_config)
  {
-    Keccak keccak(72);
+    Keccak keccak(KECCAK_512_RATE);
    return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
      leaves, digests, height, input_block_len, keccak, keccak, tree_config);
  }
--- a/icicle/src/hash/keccak/keccak.cu
+++ b/icicle/src/hash/keccak/keccak.cu
@@ -161,7 +161,7 @@ namespace keccak {
                                 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
                                 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};

-  __device__ void keccakf(u64 s[25])
+  __device__ void keccakf(u64 s[KECCAK_STATE_SIZE])
  {
    u64 t0, t1, t2, t3, t4;

@@ -188,7 +188,7 @@ namespace keccak {

    const uint8_t* b_input = input + sid * input_block_size;
    uint64_t* b_output = output + sid * output_len;
-    uint64_t state[25] = {}; // Initialize with zeroes
+    uint64_t state[KECCAK_STATE_SIZE] = {}; // Initialize with zeroes

    int input_len = input_block_size;

@@ -238,12 +238,12 @@ namespace keccak {
    int number_of_gpu_blocks = (number_of_states - 1) / number_of_threads + 1;

    switch (rate) {
-    case 136:
-      keccak_hash_blocks<136><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
+    case KECCAK_256_RATE:
+      keccak_hash_blocks<KECCAK_256_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
        input, input_len, output_len, number_of_states, output);
      break;
-    case 72:
-      keccak_hash_blocks<72><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
+    case KECCAK_512_RATE:
+      keccak_hash_blocks<KECCAK_512_RATE><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
        input, input_len, output_len, number_of_states, output);
      break;
    default:
--- a/wrappers/rust/Cargo.toml
+++ b/wrappers/rust/Cargo.toml
@@ -18,7 +18,7 @@ exclude = [
 ]

 [workspace.package]
-version = "2.7.1"
+version = "2.8.0"
 edition = "2021"
 authors = [ "Ingonyama" ]
 homepage = "https://www.ingonyama.com"
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/tree/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/tree/mod.rs
@@ -24,6 +24,6 @@ pub(crate) mod tests {
        let ctx = device_context::DeviceContext::default();
        let sponge = Poseidon::load(2, &ctx).unwrap();

-        check_build_field_merkle_tree::<_, _, Bls12_377TreeBuilder>(25, 2, &sponge, &sponge, ScalarField::zero());
+        check_build_field_merkle_tree::<_, _, Bls12_377TreeBuilder>(18, 2, &sponge, &sponge, ScalarField::zero());
    }
 }
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/tree/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/tree/mod.rs
@@ -24,6 +24,6 @@ pub(crate) mod tests {
        let ctx = device_context::DeviceContext::default();
        let sponge = Poseidon::load(2, &ctx).unwrap();

-        check_build_field_merkle_tree::<_, _, Bls12_381TreeBuilder>(25, 2, &sponge, &sponge, ScalarField::zero());
+        check_build_field_merkle_tree::<_, _, Bls12_381TreeBuilder>(18, 2, &sponge, &sponge, ScalarField::zero());
    }
 }