add keccak tree builder (#555)

2026-01-07 22:53:56 -05:00 · 2024-07-15 15:31:12 +07:00
parent 7fd9ed1b49
commit ea71faf1fa
74 changed files with 777 additions and 642 deletions
--- a/docs/docs/icicle/golang-bindings/keccak.md
+++ b/docs/docs/icicle/golang-bindings/keccak.md
@@ -25,7 +25,7 @@ func main() {
 	input := createHostSliceFromHexString("1725b6")
 	outHost256 := make(core.HostSlice[uint8], 32)

-	cfg := keccak.GetDefaultKeccakConfig()
+	cfg := keccak.GetDefaultHashConfig()
 	e := keccak.Keccak256(input, int32(input.Len()), 1, outHost256, &cfg)
 	if e.CudaErrorCode != cr.CudaSuccess {
 		panic("Keccak256 hashing failed")
@@ -49,8 +49,8 @@ func main() {
 ## Keccak Methods

 ```go
-func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
-func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError
+func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
+func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError
 ```

 ### Parameters
@@ -59,18 +59,18 @@ func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int3
 - **`inputBlockSize`**: An integer specifying the size of the input data for a single hash.
 - **`numberOfBlocks`**: An integer specifying the number of results in the hash batch.
 - **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
+- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.

 ### Return Value

 - **`CudaError`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.

-## KeccakConfig
+## HashConfig

-The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
+The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.

 ```go
-type KeccakConfig struct {
+type HashConfig struct {
 	Ctx                cr.DeviceContext
 	areInputsOnDevice  bool
 	areOutputsOnDevice bool
@@ -87,8 +87,8 @@ type KeccakConfig struct {

 ### Default Configuration

-Use `GetDefaultKeccakConfig` to obtain a default configuration, which can then be customized as needed.
+Use `GetDefaultHashConfig` to obtain a default configuration, which can then be customized as needed.

 ```go
-func GetDefaultKeccakConfig() KeccakConfig
+func GetDefaultHashConfig() HashConfig
 ```
--- a/docs/docs/icicle/rust-bindings/keccak.md
+++ b/docs/docs/icicle/rust-bindings/keccak.md
@@ -4,7 +4,7 @@

 ```rust
 use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
-use icicle_hash::keccak::{keccak256, KeccakConfig};
+use icicle_hash::keccak::{keccak256, HashConfig};
 use rand::{self, Rng};

 fn main() {
@@ -14,7 +14,7 @@ fn main() {
    let input = HostSlice::<u8>::from_slice(initial_data.as_slice());
    let mut output = DeviceVec::<u8>::cuda_malloc(32).unwrap();

-    let mut config = KeccakConfig::default();
+    let mut config = HashConfig::default();
    keccak256(input, initial_data.len() as i32, 1, &mut output[..], &mut config).expect("Failed to execute keccak256 hashing");

    let mut output_host = vec![0_u8; 32];
@@ -32,7 +32,7 @@ pub fn keccak256(
    input_block_size: i32,
    number_of_blocks: i32,
    output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
-    config: &mut KeccakConfig,
+    config: &mut HashConfig,
 ) -> IcicleResult<()>

 pub fn keccak512(
@@ -40,7 +40,7 @@ pub fn keccak512(
    input_block_size: i32,
    number_of_blocks: i32,
    output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
-    config: &mut KeccakConfig,
+    config: &mut HashConfig,
 ) -> IcicleResult<()> 
 ```

@@ -50,18 +50,18 @@ pub fn keccak512(
 - **`input_block_size`**: An integer specifying the size of the input data for a single hash.
 - **`number_of_blocks`**: An integer specifying the number of results in the hash batch.
 - **`output`**: A slice where the resulting hash will be stored. This slice can be in host or device memory.
- **`config`**: A pointer to a `KeccakConfig` object, which contains various configuration options for the Keccak256 operation.
+- **`config`**: A pointer to a `HashConfig` object, which contains various configuration options for the Keccak256 operation.

 ### Return Value

 - **`IcicleResult`**: Returns a CUDA error code indicating the success or failure of the Keccak256/Keccak512 operation.

-## KeccakConfig
+## HashConfig

-The `KeccakConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.
+The `HashConfig` structure holds configuration parameters for the Keccak256/Keccak512 operation, allowing customization of its behavior to optimize performance based on the specifics of the operation or the underlying hardware.

 ```rust
-pub struct KeccakConfig<'a> {
+pub struct HashConfig<'a> {
    pub ctx: DeviceContext<'a>,
    pub are_inputs_on_device: bool,
    pub are_outputs_on_device: bool,
@@ -81,7 +81,7 @@ pub struct KeccakConfig<'a> {
 Example initialization with default settings:

 ```rust
-let default_config = KeccakConfig::default();
+let default_config = HashConfig::default();
 ```

 Customizing the configuration:
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -35,7 +35,7 @@ void threadPoseidon(
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
-  SpongeConfig column_config = default_sponge_config(ctx);
+  HashConfig column_config = default_hash_config(ctx);
  cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t) size_partition, size_col, 1, column_config);
  checkCudaError(err);
 }
--- a/examples/c++/poseidon/example.cu
+++ b/examples/c++/poseidon/example.cu
@@ -16,7 +16,7 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level

 // We assume the tree has leaves already set, compute all other levels
 void build_tree(
-  const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, SpongeConfig &config)
+  const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, HashConfig &config)
 {
  for (uint32_t level = tree_height - 1; level > 0; level--) {
    const uint32_t next_level = level - 1;
@@ -67,7 +67,7 @@ uint32_t validate_proof(
  const uint32_t* proof_lr,
  const scalar_t* proof_hash,
  Poseidon<scalar_t> &poseidon,
-  SpongeConfig &config)
+  HashConfig &config)
 {
  scalar_t hashes_in[2], hash_out[1], level_hash;
  level_hash = hash;
@@ -112,12 +112,12 @@ int main(int argc, char* argv[])
  std::cout << "Hashing blocks into tree leaves..." << std::endl;

  Poseidon<scalar_t> poseidon(data_arity, ctx);
-  SpongeConfig config = default_sponge_config(ctx); 
+  HashConfig config = default_hash_config(ctx); 
  poseidon.hash_many(data, &tree[tree_index(leaf_level, 0)], tree_width, data_arity, 1, config);

  std::cout << "3. Building Merkle tree" << std::endl;
  Poseidon<scalar_t> tree_poseidon(tree_arity, ctx);
-  SpongeConfig tree_config = default_sponge_config(ctx);
+  HashConfig tree_config = default_hash_config(ctx);
  build_tree(tree_height, tree, tree_poseidon, tree_config);

  std::cout << "4. Generate membership proof" << std::endl;
--- a/examples/rust/poseidon/src/main.rs
+++ b/examples/rust/poseidon/src/main.rs
@@ -2,7 +2,7 @@ use icicle_bls12_381::curve::ScalarField as F;

 use icicle_cuda_runtime::device_context::DeviceContext;

-use icicle_core::hash::{SpongeHash, SpongeConfig};
+use icicle_core::hash::{SpongeHash, HashConfig};
 use icicle_core::poseidon::Poseidon;
 use icicle_core::traits::FieldImpl;
 use icicle_cuda_runtime::memory::HostSlice;
@@ -32,7 +32,7 @@ fn main() {
    );
    let ctx = DeviceContext::default();
    let poseidon = Poseidon::load(arity, &ctx).unwrap();
-    let config = SpongeConfig::default();
+    let config = HashConfig::default();

    println!(
        "---------------------- Input size 2^{}={} ------------------------",
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -49,7 +49,7 @@ extern "C" cudaError_t babybear_poseidon2_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  babybear_poseidon2_delete_cuda(poseidon2::Poseidon2<babybear::scalar_t>* poseidon, device_context::DeviceContext& ctx);
@@ -59,16 +59,16 @@ extern "C" cudaError_t babybear_build_merkle_tree(
  babybear::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
-  const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
+  const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
+  const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t babybear_mmcs_commit_cuda(
    const matrix::Matrix<babybear::scalar_t>* leaves,
    unsigned int number_of_inputs,
    babybear::scalar_t* digests,
-    const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* hasher,
-    const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
+    const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* hasher,
+    const hash::Hasher<babybear::scalar_t, babybear::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t babybear_mul_cuda(
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -71,16 +71,16 @@ extern "C" cudaError_t bls12_377_build_merkle_tree(
  bls12_377::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
-  const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
+  const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
+  const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t bls12_377_mmcs_commit_cuda(
    const matrix::Matrix<bls12_377::scalar_t>* leaves,
    unsigned int number_of_inputs,
    bls12_377::scalar_t* digests,
-    const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
-    const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
+    const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
+    const hash::Hasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t bls12_377_poseidon_create_cuda(
@@ -108,7 +108,7 @@ extern "C" cudaError_t bls12_377_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  bls12_377_poseidon_delete_cuda(poseidon::Poseidon<bls12_377::scalar_t>* poseidon);
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -71,16 +71,16 @@ extern "C" cudaError_t bls12_381_build_merkle_tree(
  bls12_381::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
-  const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
+  const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
+  const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t bls12_381_mmcs_commit_cuda(
    const matrix::Matrix<bls12_381::scalar_t>* leaves,
    unsigned int number_of_inputs,
    bls12_381::scalar_t* digests,
-    const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
-    const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
+    const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
+    const hash::Hasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t bls12_381_poseidon_create_cuda(
@@ -108,7 +108,7 @@ extern "C" cudaError_t bls12_381_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  bls12_381_poseidon_delete_cuda(poseidon::Poseidon<bls12_381::scalar_t>* poseidon);
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -97,7 +97,7 @@ extern "C" cudaError_t bn254_poseidon2_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  bn254_poseidon2_delete_cuda(poseidon2::Poseidon2<bn254::scalar_t>* poseidon, device_context::DeviceContext& ctx);
@@ -107,16 +107,16 @@ extern "C" cudaError_t bn254_build_merkle_tree(
  bn254::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
-  const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
+  const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
+  const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t bn254_mmcs_commit_cuda(
    const matrix::Matrix<bn254::scalar_t>* leaves,
    unsigned int number_of_inputs,
    bn254::scalar_t* digests,
-    const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* hasher,
-    const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
+    const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* hasher,
+    const hash::Hasher<bn254::scalar_t, bn254::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t bn254_poseidon_create_cuda(
@@ -144,7 +144,7 @@ extern "C" cudaError_t bn254_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  bn254_poseidon_delete_cuda(poseidon::Poseidon<bn254::scalar_t>* poseidon);
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -71,16 +71,16 @@ extern "C" cudaError_t bw6_761_build_merkle_tree(
  bw6_761::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
-  const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
+  const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
+  const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t bw6_761_mmcs_commit_cuda(
    const matrix::Matrix<bw6_761::scalar_t>* leaves,
    unsigned int number_of_inputs,
    bw6_761::scalar_t* digests,
-    const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
-    const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
+    const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
+    const hash::Hasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t bw6_761_poseidon_create_cuda(
@@ -108,7 +108,7 @@ extern "C" cudaError_t bw6_761_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  bw6_761_poseidon_delete_cuda(poseidon::Poseidon<bw6_761::scalar_t>* poseidon);
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -44,16 +44,16 @@ extern "C" cudaError_t grumpkin_build_merkle_tree(
  grumpkin::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
-  const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
+  const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
+  const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t grumpkin_mmcs_commit_cuda(
    const matrix::Matrix<grumpkin::scalar_t>* leaves,
    unsigned int number_of_inputs,
    grumpkin::scalar_t* digests,
-    const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
-    const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
+    const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
+    const hash::Hasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t grumpkin_poseidon_create_cuda(
@@ -81,7 +81,7 @@ extern "C" cudaError_t grumpkin_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  grumpkin_poseidon_delete_cuda(poseidon::Poseidon<grumpkin::scalar_t>* poseidon);
--- a/icicle/include/api/hash.h
+++ b/icicle/include/api/hash.h
@@ -6,11 +6,25 @@
 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
 #include "hash/keccak/keccak.cuh"
+#include "merkle-tree/merkle.cuh"

 extern "C" cudaError_t
-  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
+  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);

 extern "C" cudaError_t
-  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::KeccakConfig& config);
+  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);

+extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
+  const uint8_t* leaves,
+  uint64_t* digests,
+  unsigned int height,
+  unsigned int input_block_len,
+  const merkle_tree::TreeBuilderConfig& tree_config);
+
+extern "C" cudaError_t build_keccak512_merkle_tree_cuda(
+  const uint8_t* leaves,
+  uint64_t* digests,
+  unsigned int height,
+  unsigned int input_block_len,
+  const merkle_tree::TreeBuilderConfig& tree_config);
 #endif
--- a/icicle/include/api/m31.h
+++ b/icicle/include/api/m31.h
@@ -19,16 +19,16 @@ extern "C" cudaError_t m31_build_merkle_tree(
  m31::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
-  const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
+  const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
+  const hash::Hasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t m31_mmcs_commit_cuda(
    const matrix::Matrix<m31::scalar_t>* leaves,
    unsigned int number_of_inputs,
    m31::scalar_t* digests,
-    const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* hasher,
-    const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
+    const hash::Hasher<m31::scalar_t, m31::scalar_t>* hasher,
+    const hash::Hasher<m31::scalar_t, m31::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t m31_mul_cuda(
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -20,16 +20,16 @@ extern "C" cudaError_t stark252_build_merkle_tree(
  stark252::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
-  const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
+  const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
+  const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t stark252_mmcs_commit_cuda(
    const matrix::Matrix<stark252::scalar_t>* leaves,
    unsigned int number_of_inputs,
    stark252::scalar_t* digests,
-    const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* hasher,
-    const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
+    const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* hasher,
+    const hash::Hasher<stark252::scalar_t, stark252::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t stark252_mul_cuda(
--- a/icicle/include/api/templates/fields/poseidon.h
+++ b/icicle/include/api/templates/fields/poseidon.h
@@ -23,7 +23,7 @@ extern "C" cudaError_t ${FIELD}_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  ${FIELD}_poseidon_delete_cuda(poseidon::Poseidon<${FIELD}::scalar_t>* poseidon);
--- a/icicle/include/api/templates/fields/poseidon2.h
+++ b/icicle/include/api/templates/fields/poseidon2.h
@@ -28,7 +28,7 @@ extern "C" cudaError_t ${FIELD}_poseidon2_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  hash::SpongeConfig& cfg);
+  hash::HashConfig& cfg);

 extern "C" cudaError_t
  ${FIELD}_poseidon2_delete_cuda(poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon, device_context::DeviceContext& ctx);
--- a/icicle/include/api/templates/fields/tree.h
+++ b/icicle/include/api/templates/fields/tree.h
@@ -3,14 +3,14 @@ extern "C" cudaError_t ${FIELD}_build_merkle_tree(
  ${FIELD}::scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len, 
-  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
-  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
+  const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
+  const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
  const merkle_tree::TreeBuilderConfig& tree_config);

  extern "C" cudaError_t ${FIELD}_mmcs_commit_cuda(
    const matrix::Matrix<${FIELD}::scalar_t>* leaves,
    unsigned int number_of_inputs,
    ${FIELD}::scalar_t* digests,
-    const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
-    const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
+    const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
+    const hash::Hasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
    const merkle_tree::TreeBuilderConfig& tree_config);
--- a/icicle/include/hash/hash.cuh
+++ b/icicle/include/hash/hash.cuh
@@ -16,10 +16,10 @@ using matrix::Matrix;
 namespace hash {

  /**
-   * @struct SpongeConfig
-   * Encodes sponge hash operations parameters.
+   * @struct HashConfig
+   * Encodes hash operations parameters.
   */
-  struct SpongeConfig {
+  struct HashConfig {
    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
    bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
    bool
@@ -31,14 +31,14 @@ namespace hash {
  };

  /**
-   * A function that returns the default value of [SpongeConfig](@ref SpongeConfig) for the [SpongeHasher](@ref
-   * SpongeHasher) class.
-   * @return Default value of [SpongeConfig](@ref SpongeConfig).
+   * A function that returns the default value of [HashConfig](@ref HashConfig) for the [Hasher](@ref
+   * Hasher) class.
+   * @return Default value of [HashConfig](@ref HashConfig).
   */
-  static SpongeConfig
-  default_sponge_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  static HashConfig
+  default_hash_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
  {
-    SpongeConfig config = {
+    HashConfig config = {
      ctx,   // ctx
      false, // are_inputs_on_device
      false, // are_outputs_on_device
@@ -48,16 +48,15 @@ namespace hash {
  }

  /**
-   * @class SpongeHasher
+   * @class Hasher
   *
-   * Can be inherited by a cryptographic permutation function to create a
-   * [sponge](https://en.wikipedia.org/wiki/Sponge_function) construction out of it.
+   * An interface containing methods for hashing
   *
   * @tparam PreImage type of inputs elements
   * @tparam Image type of state elements. Also used to describe the type of hash output
   */
  template <typename PreImage, typename Image>
-  class SpongeHasher
+  class Hasher
  {
  public:
    /// @brief the width of permutation state
@@ -72,7 +71,7 @@ namespace hash {
    /// @brief start squeezing from this offset. Used with domain separation.
    const unsigned int offset;

-    SpongeHasher(unsigned int width, unsigned int preimage_max_length, unsigned int rate, unsigned int offset)
+    Hasher(unsigned int width, unsigned int preimage_max_length, unsigned int rate, unsigned int offset)
        : width(width), preimage_max_length(preimage_max_length), rate(rate), offset(offset)
    {
      assert(
@@ -105,7 +104,6 @@ namespace hash {
      return cudaError_t::cudaSuccess;
    }

-    /// @brief Permute aligned input and do squeeze
    /// @param input pointer to input allocated on-device
    /// @param out pointer to output allocated on-device
    cudaError_t compress_many(
@@ -113,7 +111,7 @@ namespace hash {
      Image* out,
      unsigned int number_of_states,
      unsigned int output_len,
-      const SpongeConfig& cfg) const
+      const HashConfig& cfg) const
    {
      return hash_many((const PreImage*)input, out, number_of_states, width, output_len, cfg);
    }
@@ -136,7 +134,7 @@ namespace hash {
      unsigned int number_of_states,
      unsigned int input_len,
      unsigned int output_len,
-      const SpongeConfig& cfg) const
+      const HashConfig& cfg) const
    {
      const PreImage* d_input;
      PreImage* d_alloc_input;
--- a/icicle/include/hash/keccak/keccak.cuh
+++ b/icicle/include/hash/keccak/keccak.cuh
@@ -11,31 +11,19 @@
 using namespace hash;

 namespace keccak {
-  /**
-   * @struct KeccakConfig
-   * Struct that encodes various Keccak parameters.
-   */
-  struct KeccakConfig {
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
-    bool are_inputs_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
-    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
-    bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
-                    *   non-blocking and you'd need to synchronize it explicitly by running
-                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
-                    *   function will block the current CPU thread. */
-  };
-
-  KeccakConfig default_keccak_config()
+  class Keccak : public Hasher<uint8_t, uint64_t>
  {
-    device_context::DeviceContext ctx = device_context::get_default_device_context();
-    KeccakConfig config = {
-      ctx,   // ctx
-      false, // are_inputes_on_device
-      false, // are_outputs_on_device
-      false, // is_async
-    };
-    return config;
-  }
+  public:
+    cudaError_t run_hash_many_kernel(
+      const uint8_t* input,
+      uint64_t* output,
+      unsigned int number_of_states,
+      unsigned int input_len,
+      unsigned int output_len,
+      const device_context::DeviceContext& ctx) const override;
+
+    Keccak(unsigned int rate) : Hasher<uint8_t, uint64_t>(25, 25, rate, 0) {}
+  };
 } // namespace keccak

 #endif
--- a/icicle/include/merkle-tree/merkle.cuh
+++ b/icicle/include/merkle-tree/merkle.cuh
@@ -111,8 +111,10 @@ namespace merkle_tree {
  cudaError_t build_merkle_tree(
    const Leaf* inputs,
    Digest* digests,
-    const SpongeHasher<Leaf, Digest>& compression,
-    const SpongeHasher<Leaf, Digest>& bottom_layer,
+    unsigned int height,
+    unsigned int input_block_len,
+    const Hasher<Leaf, Digest>& compression,
+    const Hasher<Leaf, Digest>& bottom_layer,
    const TreeBuilderConfig& config);

  template <typename Leaf, typename Digest>
@@ -120,8 +122,8 @@ namespace merkle_tree {
    const Matrix<Leaf>* inputs,
    const unsigned int number_of_inputs,
    Digest* digests,
-    const SpongeHasher<Leaf, Digest>& hasher,
-    const SpongeHasher<Leaf, Digest>& compression,
+    const Hasher<Leaf, Digest>& hasher,
+    const Hasher<Leaf, Digest>& compression,
    const TreeBuilderConfig& tree_config);
 } // namespace merkle_tree

--- a/icicle/include/poseidon/poseidon.cuh
+++ b/icicle/include/poseidon/poseidon.cuh
@@ -20,7 +20,7 @@ using namespace hash;
 */
 namespace poseidon {
  template <typename S>
-  class Poseidon : public SpongeHasher<S, S>
+  class Poseidon : public Hasher<S, S>
  {
  public:
    const std::size_t device_id;
@@ -65,7 +65,7 @@ namespace poseidon {
      const S* sparse_matrices,
      const S domain_tag,
      device_context::DeviceContext& ctx)
-        : SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
+        : Hasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
    {
      PoseidonConstants<S> constants;
      CHK_STICKY(create_optimized_poseidon_constants(
@@ -75,7 +75,7 @@ namespace poseidon {
    }

    Poseidon(int arity, device_context::DeviceContext& ctx)
-        : SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
+        : Hasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
    {
      PoseidonConstants<S> constants{};
      CHK_STICKY(init_optimized_poseidon_constants(arity, ctx, &constants));
--- a/icicle/include/poseidon2/poseidon2.cuh
+++ b/icicle/include/poseidon2/poseidon2.cuh
@@ -23,7 +23,7 @@ using matrix::Matrix;
 */
 namespace poseidon2 {
  template <typename S>
-  class Poseidon2 : public hash::SpongeHasher<S, S>
+  class Poseidon2 : public hash::Hasher<S, S>
  {
    static const int POSEIDON_BLOCK_SIZE = 32;

@@ -144,7 +144,7 @@ namespace poseidon2 {
      MdsType mds_type,
      DiffusionStrategy diffusion,
      device_context::DeviceContext& ctx)
-        : hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
+        : hash::Hasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
    {
      Poseidon2Constants<S> constants;
      CHK_STICKY(create_poseidon2_constants(
@@ -159,7 +159,7 @@ namespace poseidon2 {
      MdsType mds_type,
      DiffusionStrategy diffusion,
      device_context::DeviceContext& ctx)
-        : hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
+        : hash::Hasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
    {
      Poseidon2Constants<S> constants;
      CHK_STICKY(init_poseidon2_constants(width, mds_type, diffusion, ctx, &constants));
--- a/icicle/src/hash/keccak/.gitignore
+++ b/icicle/src/hash/keccak/.gitignore
@@ -1 +1,2 @@
-test_keccak
+test_keccak
+test_keccak_tree
--- a/icicle/src/hash/keccak/Makefile
+++ b/icicle/src/hash/keccak/Makefile
@@ -1,6 +1,10 @@
+test_keccak_tree: test_tree.cu keccak.cu ../../merkle-tree/merkle.cu
+	nvcc -DMERKLE_DEBUG -o test_keccak_tree -I../../../include test_tree.cu
+	./test_keccak_tree
+
 test_keccak: test.cu keccak.cu 
 	nvcc -o test_keccak -I../../../include test.cu
 	./test_keccak

 clear:
-	rm test_keccak
+	rm test_keccak test_keccak_tree
--- a/icicle/src/hash/keccak/extern.cu
+++ b/icicle/src/hash/keccak/extern.cu
@@ -4,17 +4,44 @@
 #include "hash/hash.cuh"
 #include "hash/keccak/keccak.cuh"
 #include "keccak.cu"
+#include "../../merkle-tree/merkle.cu"
+#include "merkle-tree/merkle.cuh"

 namespace keccak {
  extern "C" cudaError_t
-  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
+  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
  {
-    return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
+    return Keccak(136).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 4, config);
  }

  extern "C" cudaError_t
-  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
+  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig& config)
  {
-    return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
+    return Keccak(72).hash_many(input, (uint64_t*)output, number_of_blocks, input_block_size, 8, config);
  }
+
+  extern "C" cudaError_t build_keccak256_merkle_tree_cuda(
+    const uint8_t* leaves,
+    uint64_t* digests,
+    unsigned int height,
+    unsigned int input_block_len,
+    const merkle_tree::TreeBuilderConfig& tree_config)
+  {
+    Keccak keccak(136);
+    return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
+      leaves, digests, height, input_block_len, keccak, keccak, tree_config);
+  }
+
+  extern "C" cudaError_t build_keccak512_merkle_tree_cuda(
+    const uint8_t* leaves,
+    uint64_t* digests,
+    unsigned int height,
+    unsigned int input_block_len,
+    const merkle_tree::TreeBuilderConfig& tree_config)
+  {
+    Keccak keccak(72);
+    return merkle_tree::build_merkle_tree<uint8_t, uint64_t>(
+      leaves, digests, height, input_block_len, keccak, keccak, tree_config);
+  }
+
 } // namespace keccak
--- a/icicle/src/hash/keccak/keccak.cu
+++ b/icicle/src/hash/keccak/keccak.cu
@@ -1,50 +1,256 @@
 #include <cstdint>
 #include "gpu-utils/device_context.cuh"
 #include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/modifiers.cuh"

 #include "hash/hash.cuh"
 #include "hash/keccak/keccak.cuh"
-#include "kernels.cu"

 using namespace hash;

 namespace keccak {
-  template <int C, int D>
-  cudaError_t
-  keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
+  using u64 = uint64_t;
+
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)                                                              \
+  {                                                                                                                    \
+    t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4);                                                \
+  }
+
+#define THETA(                                                                                                         \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14);                                                      \
+    TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24);                                                      \
+    TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34);                                                      \
+    TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44);                                                      \
+    TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04);                                                      \
+    s00 ^= t0;                                                                                                         \
+    s01 ^= t0;                                                                                                         \
+    s02 ^= t0;                                                                                                         \
+    s03 ^= t0;                                                                                                         \
+    s04 ^= t0;                                                                                                         \
+                                                                                                                       \
+    s10 ^= t1;                                                                                                         \
+    s11 ^= t1;                                                                                                         \
+    s12 ^= t1;                                                                                                         \
+    s13 ^= t1;                                                                                                         \
+    s14 ^= t1;                                                                                                         \
+                                                                                                                       \
+    s20 ^= t2;                                                                                                         \
+    s21 ^= t2;                                                                                                         \
+    s22 ^= t2;                                                                                                         \
+    s23 ^= t2;                                                                                                         \
+    s24 ^= t2;                                                                                                         \
+                                                                                                                       \
+    s30 ^= t3;                                                                                                         \
+    s31 ^= t3;                                                                                                         \
+    s32 ^= t3;                                                                                                         \
+    s33 ^= t3;                                                                                                         \
+    s34 ^= t3;                                                                                                         \
+                                                                                                                       \
+    s40 ^= t4;                                                                                                         \
+    s41 ^= t4;                                                                                                         \
+    s42 ^= t4;                                                                                                         \
+    s43 ^= t4;                                                                                                         \
+    s44 ^= t4;                                                                                                         \
+  }
+
+#define RHOPI(                                                                                                         \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    t0 = ROTL64(s10, (uint64_t)1);                                                                                     \
+    s10 = ROTL64(s11, (uint64_t)44);                                                                                   \
+    s11 = ROTL64(s41, (uint64_t)20);                                                                                   \
+    s41 = ROTL64(s24, (uint64_t)61);                                                                                   \
+    s24 = ROTL64(s42, (uint64_t)39);                                                                                   \
+    s42 = ROTL64(s04, (uint64_t)18);                                                                                   \
+    s04 = ROTL64(s20, (uint64_t)62);                                                                                   \
+    s20 = ROTL64(s22, (uint64_t)43);                                                                                   \
+    s22 = ROTL64(s32, (uint64_t)25);                                                                                   \
+    s32 = ROTL64(s43, (uint64_t)8);                                                                                    \
+    s43 = ROTL64(s34, (uint64_t)56);                                                                                   \
+    s34 = ROTL64(s03, (uint64_t)41);                                                                                   \
+    s03 = ROTL64(s40, (uint64_t)27);                                                                                   \
+    s40 = ROTL64(s44, (uint64_t)14);                                                                                   \
+    s44 = ROTL64(s14, (uint64_t)2);                                                                                    \
+    s14 = ROTL64(s31, (uint64_t)55);                                                                                   \
+    s31 = ROTL64(s13, (uint64_t)45);                                                                                   \
+    s13 = ROTL64(s01, (uint64_t)36);                                                                                   \
+    s01 = ROTL64(s30, (uint64_t)28);                                                                                   \
+    s30 = ROTL64(s33, (uint64_t)21);                                                                                   \
+    s33 = ROTL64(s23, (uint64_t)15);                                                                                   \
+    s23 = ROTL64(s12, (uint64_t)10);                                                                                   \
+    s12 = ROTL64(s21, (uint64_t)6);                                                                                    \
+    s21 = ROTL64(s02, (uint64_t)3);                                                                                    \
+    s02 = t0;                                                                                                          \
+  }
+
+#define KHI(                                                                                                           \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    t0 = s00 ^ (~s10 & s20);                                                                                           \
+    t1 = s10 ^ (~s20 & s30);                                                                                           \
+    t2 = s20 ^ (~s30 & s40);                                                                                           \
+    t3 = s30 ^ (~s40 & s00);                                                                                           \
+    t4 = s40 ^ (~s00 & s10);                                                                                           \
+    s00 = t0;                                                                                                          \
+    s10 = t1;                                                                                                          \
+    s20 = t2;                                                                                                          \
+    s30 = t3;                                                                                                          \
+    s40 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s01 ^ (~s11 & s21);                                                                                           \
+    t1 = s11 ^ (~s21 & s31);                                                                                           \
+    t2 = s21 ^ (~s31 & s41);                                                                                           \
+    t3 = s31 ^ (~s41 & s01);                                                                                           \
+    t4 = s41 ^ (~s01 & s11);                                                                                           \
+    s01 = t0;                                                                                                          \
+    s11 = t1;                                                                                                          \
+    s21 = t2;                                                                                                          \
+    s31 = t3;                                                                                                          \
+    s41 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s02 ^ (~s12 & s22);                                                                                           \
+    t1 = s12 ^ (~s22 & s32);                                                                                           \
+    t2 = s22 ^ (~s32 & s42);                                                                                           \
+    t3 = s32 ^ (~s42 & s02);                                                                                           \
+    t4 = s42 ^ (~s02 & s12);                                                                                           \
+    s02 = t0;                                                                                                          \
+    s12 = t1;                                                                                                          \
+    s22 = t2;                                                                                                          \
+    s32 = t3;                                                                                                          \
+    s42 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s03 ^ (~s13 & s23);                                                                                           \
+    t1 = s13 ^ (~s23 & s33);                                                                                           \
+    t2 = s23 ^ (~s33 & s43);                                                                                           \
+    t3 = s33 ^ (~s43 & s03);                                                                                           \
+    t4 = s43 ^ (~s03 & s13);                                                                                           \
+    s03 = t0;                                                                                                          \
+    s13 = t1;                                                                                                          \
+    s23 = t2;                                                                                                          \
+    s33 = t3;                                                                                                          \
+    s43 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s04 ^ (~s14 & s24);                                                                                           \
+    t1 = s14 ^ (~s24 & s34);                                                                                           \
+    t2 = s24 ^ (~s34 & s44);                                                                                           \
+    t3 = s34 ^ (~s44 & s04);                                                                                           \
+    t4 = s44 ^ (~s04 & s14);                                                                                           \
+    s04 = t0;                                                                                                          \
+    s14 = t1;                                                                                                          \
+    s24 = t2;                                                                                                          \
+    s34 = t3;                                                                                                          \
+    s44 = t4;                                                                                                          \
+  }
+
+#define IOTA(element, rc)                                                                                              \
+  {                                                                                                                    \
+    element ^= rc;                                                                                                     \
+  }
+
+  __device__ const u64 RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
+                                 0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
+                                 0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+                                 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
+                                 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
+                                 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
+
+  __device__ void keccakf(u64 s[25])
  {
-    CHK_INIT_IF_RETURN();
-    cudaStream_t& stream = config.ctx.stream;
+    u64 t0, t1, t2, t3, t4;

-    uint8_t* input_device;
-    if (config.are_inputs_on_device) {
-      input_device = input;
-    } else {
-      CHK_IF_RETURN(cudaMallocAsync(&input_device, number_of_blocks * input_block_size, stream));
-      CHK_IF_RETURN(
-        cudaMemcpyAsync(input_device, input, number_of_blocks * input_block_size, cudaMemcpyHostToDevice, stream));
+    for (int i = 0; i < 24; i++) {
+      THETA(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      RHOPI(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      KHI(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      IOTA(s[0], RC[i]);
+    }
+  }
+
+  template <const int R>
+  __global__ void
+  keccak_hash_blocks(const uint8_t* input, int input_block_size, int output_len, int number_of_blocks, uint64_t* output)
+  {
+    int sid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (sid >= number_of_blocks) { return; }
+
+    const uint8_t* b_input = input + sid * input_block_size;
+    uint64_t* b_output = output + sid * output_len;
+    uint64_t state[25] = {}; // Initialize with zeroes
+
+    int input_len = input_block_size;
+
+    // absorb
+    while (input_len >= R) {
+      for (int i = 0; i < R; i += 8) {
+        state[i / 8] ^= *(uint64_t*)(b_input + i);
+      }
+      keccakf(state);
+      b_input += R;
+      input_len -= R;
    }

-    uint8_t* output_device;
-    if (config.are_outputs_on_device) {
-      output_device = output;
-    } else {
-      CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_blocks * (D / 8), stream));
+    // last block (if any)
+    uint8_t last_block[R];
+    for (int i = 0; i < input_len; i++) {
+      last_block[i] = b_input[i];
    }

-    int number_of_threads = 512;
-    int number_of_gpu_blocks = (number_of_blocks - 1) / number_of_threads + 1;
-    keccak_hash_blocks<C, D><<<number_of_gpu_blocks, number_of_threads, 0, stream>>>(
-      input_device, input_block_size, number_of_blocks, output_device);
+    // pad 10*1
+    last_block[input_len] = 1;
+    for (int i = 0; i < R - input_len - 1; i++) {
+      last_block[input_len + i + 1] = 0;
+    }
+    // last bit
+    last_block[R - 1] |= 0x80;

-    if (!config.are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(input_device, stream));
+    for (int i = 0; i < R; i += 8) {
+      state[i / 8] ^= *(uint64_t*)(last_block + i);
+    }
+    keccakf(state);

-    if (!config.are_outputs_on_device) {
-      CHK_IF_RETURN(cudaMemcpyAsync(output, output_device, number_of_blocks * (D / 8), cudaMemcpyDeviceToHost, stream));
-      CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
+    for (int i = 0; i < output_len; i++) {
+      b_output[i] = state[i];
+    }
+  }
+
+  cudaError_t Keccak::run_hash_many_kernel(
+    const uint8_t* input,
+    uint64_t* output,
+    unsigned int number_of_states,
+    unsigned int input_len,
+    unsigned int output_len,
+    const device_context::DeviceContext& ctx) const
+  {
+    int number_of_threads = 256;
+    int number_of_gpu_blocks = (number_of_states - 1) / number_of_threads + 1;
+
+    switch (rate) {
+    case 136:
+      keccak_hash_blocks<136><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
+        input, input_len, output_len, number_of_states, output);
+      break;
+    case 72:
+      keccak_hash_blocks<72><<<number_of_gpu_blocks, number_of_threads, 0, ctx.stream>>>(
+        input, input_len, output_len, number_of_states, output);
+      break;
+    default:
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "KeccakHash: #rate must be one of [136, 72]");
    }

-    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
+    CHK_IF_RETURN(cudaPeekAtLastError());
    return CHK_LAST();
  }
 } // namespace keccak
--- a/icicle/src/hash/keccak/kernels.cu
+++ b/icicle/src/hash/keccak/kernels.cu
@@ -1,233 +0,0 @@
-#pragma once
-#ifndef KECCAK_KERNELS_H
-#define KECCAK_KERNELS_H
-
-#include <cstdint>
-#include "gpu-utils/modifiers.cuh"
-
-namespace keccak {
-  using u64 = uint64_t;
-
-#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
-
-#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)                                                              \
-  {                                                                                                                    \
-    t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4);                                                \
-  }
-
-#define THETA(                                                                                                         \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14);                                                      \
-    TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24);                                                      \
-    TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34);                                                      \
-    TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44);                                                      \
-    TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04);                                                      \
-    s00 ^= t0;                                                                                                         \
-    s01 ^= t0;                                                                                                         \
-    s02 ^= t0;                                                                                                         \
-    s03 ^= t0;                                                                                                         \
-    s04 ^= t0;                                                                                                         \
-                                                                                                                       \
-    s10 ^= t1;                                                                                                         \
-    s11 ^= t1;                                                                                                         \
-    s12 ^= t1;                                                                                                         \
-    s13 ^= t1;                                                                                                         \
-    s14 ^= t1;                                                                                                         \
-                                                                                                                       \
-    s20 ^= t2;                                                                                                         \
-    s21 ^= t2;                                                                                                         \
-    s22 ^= t2;                                                                                                         \
-    s23 ^= t2;                                                                                                         \
-    s24 ^= t2;                                                                                                         \
-                                                                                                                       \
-    s30 ^= t3;                                                                                                         \
-    s31 ^= t3;                                                                                                         \
-    s32 ^= t3;                                                                                                         \
-    s33 ^= t3;                                                                                                         \
-    s34 ^= t3;                                                                                                         \
-                                                                                                                       \
-    s40 ^= t4;                                                                                                         \
-    s41 ^= t4;                                                                                                         \
-    s42 ^= t4;                                                                                                         \
-    s43 ^= t4;                                                                                                         \
-    s44 ^= t4;                                                                                                         \
-  }
-
-#define RHOPI(                                                                                                         \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    t0 = ROTL64(s10, (uint64_t)1);                                                                                     \
-    s10 = ROTL64(s11, (uint64_t)44);                                                                                   \
-    s11 = ROTL64(s41, (uint64_t)20);                                                                                   \
-    s41 = ROTL64(s24, (uint64_t)61);                                                                                   \
-    s24 = ROTL64(s42, (uint64_t)39);                                                                                   \
-    s42 = ROTL64(s04, (uint64_t)18);                                                                                   \
-    s04 = ROTL64(s20, (uint64_t)62);                                                                                   \
-    s20 = ROTL64(s22, (uint64_t)43);                                                                                   \
-    s22 = ROTL64(s32, (uint64_t)25);                                                                                   \
-    s32 = ROTL64(s43, (uint64_t)8);                                                                                    \
-    s43 = ROTL64(s34, (uint64_t)56);                                                                                   \
-    s34 = ROTL64(s03, (uint64_t)41);                                                                                   \
-    s03 = ROTL64(s40, (uint64_t)27);                                                                                   \
-    s40 = ROTL64(s44, (uint64_t)14);                                                                                   \
-    s44 = ROTL64(s14, (uint64_t)2);                                                                                    \
-    s14 = ROTL64(s31, (uint64_t)55);                                                                                   \
-    s31 = ROTL64(s13, (uint64_t)45);                                                                                   \
-    s13 = ROTL64(s01, (uint64_t)36);                                                                                   \
-    s01 = ROTL64(s30, (uint64_t)28);                                                                                   \
-    s30 = ROTL64(s33, (uint64_t)21);                                                                                   \
-    s33 = ROTL64(s23, (uint64_t)15);                                                                                   \
-    s23 = ROTL64(s12, (uint64_t)10);                                                                                   \
-    s12 = ROTL64(s21, (uint64_t)6);                                                                                    \
-    s21 = ROTL64(s02, (uint64_t)3);                                                                                    \
-    s02 = t0;                                                                                                          \
-  }
-
-#define KHI(                                                                                                           \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    t0 = s00 ^ (~s10 & s20);                                                                                           \
-    t1 = s10 ^ (~s20 & s30);                                                                                           \
-    t2 = s20 ^ (~s30 & s40);                                                                                           \
-    t3 = s30 ^ (~s40 & s00);                                                                                           \
-    t4 = s40 ^ (~s00 & s10);                                                                                           \
-    s00 = t0;                                                                                                          \
-    s10 = t1;                                                                                                          \
-    s20 = t2;                                                                                                          \
-    s30 = t3;                                                                                                          \
-    s40 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s01 ^ (~s11 & s21);                                                                                           \
-    t1 = s11 ^ (~s21 & s31);                                                                                           \
-    t2 = s21 ^ (~s31 & s41);                                                                                           \
-    t3 = s31 ^ (~s41 & s01);                                                                                           \
-    t4 = s41 ^ (~s01 & s11);                                                                                           \
-    s01 = t0;                                                                                                          \
-    s11 = t1;                                                                                                          \
-    s21 = t2;                                                                                                          \
-    s31 = t3;                                                                                                          \
-    s41 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s02 ^ (~s12 & s22);                                                                                           \
-    t1 = s12 ^ (~s22 & s32);                                                                                           \
-    t2 = s22 ^ (~s32 & s42);                                                                                           \
-    t3 = s32 ^ (~s42 & s02);                                                                                           \
-    t4 = s42 ^ (~s02 & s12);                                                                                           \
-    s02 = t0;                                                                                                          \
-    s12 = t1;                                                                                                          \
-    s22 = t2;                                                                                                          \
-    s32 = t3;                                                                                                          \
-    s42 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s03 ^ (~s13 & s23);                                                                                           \
-    t1 = s13 ^ (~s23 & s33);                                                                                           \
-    t2 = s23 ^ (~s33 & s43);                                                                                           \
-    t3 = s33 ^ (~s43 & s03);                                                                                           \
-    t4 = s43 ^ (~s03 & s13);                                                                                           \
-    s03 = t0;                                                                                                          \
-    s13 = t1;                                                                                                          \
-    s23 = t2;                                                                                                          \
-    s33 = t3;                                                                                                          \
-    s43 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s04 ^ (~s14 & s24);                                                                                           \
-    t1 = s14 ^ (~s24 & s34);                                                                                           \
-    t2 = s24 ^ (~s34 & s44);                                                                                           \
-    t3 = s34 ^ (~s44 & s04);                                                                                           \
-    t4 = s44 ^ (~s04 & s14);                                                                                           \
-    s04 = t0;                                                                                                          \
-    s14 = t1;                                                                                                          \
-    s24 = t2;                                                                                                          \
-    s34 = t3;                                                                                                          \
-    s44 = t4;                                                                                                          \
-  }
-
-#define IOTA(element, rc)                                                                                              \
-  {                                                                                                                    \
-    element ^= rc;                                                                                                     \
-  }
-
-  __device__ const u64 RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
-                                 0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
-                                 0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-                                 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
-                                 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
-                                 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
-
-  __device__ void keccakf(u64 s[25])
-  {
-    u64 t0, t1, t2, t3, t4;
-
-    for (int i = 0; i < 24; i++) {
-      THETA(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      RHOPI(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      KHI(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      IOTA(s[0], RC[i]);
-    }
-  }
-
-  template <int C, int D>
-  __global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
-  {
-    int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (bid >= number_of_blocks) { return; }
-
-    const int r_bits = 1600 - C;
-    const int r_bytes = r_bits / 8;
-    const int d_bytes = D / 8;
-
-    uint8_t* b_input = input + bid * input_block_size;
-    uint8_t* b_output = output + bid * d_bytes;
-    uint64_t state[25] = {}; // Initialize with zeroes
-
-    int input_len = input_block_size;
-
-    // absorb
-    while (input_len >= r_bytes) {
-      // #pragma unroll
-      for (int i = 0; i < r_bytes; i += 8) {
-        state[i / 8] ^= *(uint64_t*)(b_input + i);
-      }
-      keccakf(state);
-      b_input += r_bytes;
-      input_len -= r_bytes;
-    }
-
-    // last block (if any)
-    uint8_t last_block[r_bytes];
-    for (int i = 0; i < input_len; i++) {
-      last_block[i] = b_input[i];
-    }
-
-    // pad 10*1
-    last_block[input_len] = 1;
-    for (int i = 0; i < r_bytes - input_len - 1; i++) {
-      last_block[input_len + i + 1] = 0;
-    }
-    // last bit
-    last_block[r_bytes - 1] |= 0x80;
-
-    // #pragma unroll
-    for (int i = 0; i < r_bytes; i += 8) {
-      state[i / 8] ^= *(uint64_t*)(last_block + i);
-    }
-    keccakf(state);
-
-#pragma unroll
-    for (int i = 0; i < d_bytes; i += 8) {
-      *(uint64_t*)(b_output + i) = state[i / 8];
-    }
-  }
-} // namespace keccak
-
-#endif
--- a/icicle/src/hash/keccak/test.cu
+++ b/icicle/src/hash/keccak/test.cu
@@ -50,7 +50,7 @@ int main(int argc, char* argv[])
  uint8_t* out_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * (D / 8)));

  START_TIMER(keccak_timer);
-  KeccakConfig config = default_keccak_config();
+  HashConfig config = default_hash_config();
  keccak256_cuda(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
  END_TIMER(keccak_timer, "Keccak")

--- a/icicle/src/hash/keccak/test_tree.cu
+++ b/icicle/src/hash/keccak/test_tree.cu
@@ -0,0 +1,91 @@
+#include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "extern.cu"
+
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+
+using namespace keccak;
+
+#define D 256
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+void uint8_to_hex_string(const uint8_t* values, int size)
+{
+  std::stringstream ss;
+
+  for (int i = 0; i < size; ++i) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)values[i];
+  }
+
+  std::string hexString = ss.str();
+  std::cout << hexString << std::endl;
+}
+
+#define A 2
+
+int main(int argc, char* argv[])
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  /// Tree of height N and arity A contains \sum{A^i} for i in 0..N-1 elements
+  uint32_t input_block_len = 136;
+  uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 10;
+  uint32_t number_of_leaves = pow(A, tree_height);
+  uint32_t total_number_of_leaves = number_of_leaves * input_block_len;
+
+  /// Use keep_rows to specify how many rows do you want to store
+  int keep_rows = argc > 2 ? atoi(argv[2]) : 7;
+  size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, A, 1);
+
+  /// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
+  START_TIMER(timer_allocation);
+  uint8_t input = 0;
+  uint8_t* leaves = static_cast<uint8_t*>(malloc(total_number_of_leaves));
+  for (uint64_t i = 0; i < total_number_of_leaves; i++) {
+    leaves[i] = (uint8_t)i;
+  }
+  END_TIMER(timer_allocation, "Allocated memory for leaves: ");
+
+  /// Allocate memory for digests of {keep_rows} rows of a tree
+  START_TIMER(timer_digests);
+  size_t digests_mem = digests_len * sizeof(uint64_t);
+  uint64_t* digests = static_cast<uint64_t*>(malloc(digests_mem));
+  END_TIMER(timer_digests, "Allocated memory for digests");
+
+  std::cout << "Memory for leaves = " << total_number_of_leaves / 1024 / 1024 << " MB; "
+            << total_number_of_leaves / 1024 / 1024 / 1024 << " GB" << std::endl;
+  std::cout << "Number of leaves = " << number_of_leaves << std::endl;
+  std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
+  std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
+            << " GB" << std::endl;
+  std::cout << "Number of digest elements = " << digests_len << std::endl;
+
+  std::cout << "Total RAM consumption = " << (digests_mem + total_number_of_leaves) / 1024 / 1024 << " MB; "
+            << (digests_mem + total_number_of_leaves) / 1024 / 1024 / 1024 << " GB" << std::endl;
+
+  merkle_tree::TreeBuilderConfig config = merkle_tree::default_merkle_config();
+  config.arity = A;
+  config.keep_rows = keep_rows;
+  START_TIMER(keccak_timer);
+  build_keccak256_merkle_tree_cuda(leaves, digests, tree_height, input_block_len, config);
+  END_TIMER(keccak_timer, "Keccak")
+
+  for (int i = 0; i < digests_len; i++) {
+    uint64_t root = digests[i];
+    std::cout << root << std::endl;
+    // assert(root == expected[i]);
+  }
+  free(digests);
+  free(leaves);
+}
+
+#endif
--- a/icicle/src/merkle-tree/extern.cu
+++ b/icicle/src/merkle-tree/extern.cu
@@ -15,8 +15,8 @@ namespace merkle_tree {
    scalar_t* digests,
    unsigned int height,
    unsigned int input_block_len,
-    const hash::SpongeHasher<scalar_t, scalar_t>* compression,
-    const hash::SpongeHasher<scalar_t, scalar_t>* bottom_layer,
+    const hash::Hasher<scalar_t, scalar_t>* compression,
+    const hash::Hasher<scalar_t, scalar_t>* bottom_layer,
    const TreeBuilderConfig& tree_config)
  {
    return build_merkle_tree<scalar_t, scalar_t>(
--- a/icicle/src/merkle-tree/extern_mmcs.cu
+++ b/icicle/src/merkle-tree/extern_mmcs.cu
@@ -17,8 +17,8 @@ namespace merkle_tree {
    const Matrix<scalar_t>* leaves,
    unsigned int number_of_inputs,
    scalar_t* digests,
-    const hash::SpongeHasher<scalar_t, scalar_t>* hasher,
-    const hash::SpongeHasher<scalar_t, scalar_t>* compression,
+    const hash::Hasher<scalar_t, scalar_t>* hasher,
+    const hash::Hasher<scalar_t, scalar_t>* compression,
    const TreeBuilderConfig& tree_config)
  {
    return mmcs_commit<scalar_t, scalar_t>(leaves, number_of_inputs, digests, *hasher, *compression, tree_config);
--- a/icicle/src/merkle-tree/merkle.cu
+++ b/icicle/src/merkle-tree/merkle.cu
@@ -78,37 +78,37 @@ namespace merkle_tree {
  template <typename L, typename D>
  cudaError_t build_merkle_subtree(
    const L* leaves,
-    D* states,
+    L* d_leaves,
    D* digests,
    size_t subtree_idx,
    size_t subtree_height,
-    L* big_tree_digests,
+    D* big_tree_digests,
    size_t start_segment_size,
    size_t start_segment_offset,
    uint64_t keep_rows,
    uint64_t input_block_len,
-    const SpongeHasher<L, D>& bottom_layer,
-    const SpongeHasher<L, D>& compression,
+    const Hasher<L, D>& bottom_layer,
+    const Hasher<L, D>& compression,
    const TreeBuilderConfig& tree_config,
    device_context::DeviceContext& ctx)
  {
    uint64_t arity = tree_config.arity;

-    SpongeConfig sponge_config = default_sponge_config(ctx);
-    sponge_config.are_inputs_on_device = true;
-    sponge_config.are_outputs_on_device = true;
-    sponge_config.is_async = true;
+    HashConfig hash_config = default_hash_config(ctx);
+    hash_config.are_inputs_on_device = true;
+    hash_config.are_outputs_on_device = true;
+    hash_config.is_async = true;

    size_t bottom_layer_states = pow(arity, subtree_height);

    if (!tree_config.are_inputs_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(
-        states, leaves, bottom_layer_states * input_block_len * sizeof(L), cudaMemcpyHostToDevice, ctx.stream));
+        d_leaves, leaves, bottom_layer_states * input_block_len * sizeof(L), cudaMemcpyHostToDevice, ctx.stream));
    }

    bottom_layer.hash_many(
-      tree_config.are_inputs_on_device ? leaves : states, digests, bottom_layer_states, input_block_len,
-      tree_config.digest_elements, sponge_config);
+      tree_config.are_inputs_on_device ? leaves : d_leaves, digests, bottom_layer_states, input_block_len,
+      tree_config.digest_elements, hash_config);

    uint64_t number_of_states = bottom_layer_states / arity;
    size_t segment_size = start_segment_size;
@@ -123,21 +123,24 @@ namespace merkle_tree {
    }
    segment_size /= arity;
    subtree_height--;
-    swap<D>(&digests, &states);
+
+    D* prev_layer = digests;
+    D* next_layer = (D*)d_leaves;

    while (number_of_states > 0) {
-      CHK_IF_RETURN(
-        compression.compress_many(states, digests, number_of_states, tree_config.digest_elements, sponge_config));
+      CHK_IF_RETURN(compression.run_hash_many_kernel(
+        (L*)prev_layer, next_layer, number_of_states, tree_config.digest_elements * tree_config.arity,
+        tree_config.digest_elements, hash_config.ctx));

      if (!keep_rows || subtree_height < keep_rows) {
        D* digests_with_offset =
          big_tree_digests + segment_offset + subtree_idx * number_of_states * tree_config.digest_elements;
        CHK_IF_RETURN(cudaMemcpyAsync(
-          digests_with_offset, digests, number_of_states * tree_config.digest_elements * sizeof(D),
+          digests_with_offset, next_layer, number_of_states * tree_config.digest_elements * sizeof(D),
          cudaMemcpyDeviceToHost, ctx.stream));
        segment_offset += segment_size;
      }
-      if (number_of_states > 1) { swap<D>(&digests, &states); }
+      swap<D>(&prev_layer, &next_layer);
      segment_size /= arity;
      subtree_height--;
      number_of_states /= arity;
@@ -152,17 +155,13 @@ namespace merkle_tree {
    D* digests,
    unsigned int height,
    unsigned int input_block_len,
-    const SpongeHasher<L, D>& compression,
-    const SpongeHasher<L, D>& bottom_layer,
+    const Hasher<L, D>& compression,
+    const Hasher<L, D>& bottom_layer,
    const TreeBuilderConfig& tree_config)
  {
    CHK_INIT_IF_RETURN();
    cudaStream_t& stream = tree_config.ctx.stream;

-    if (input_block_len * sizeof(L) > bottom_layer.rate * sizeof(D))
-      THROW_ICICLE_ERR(
-        IcicleError_t::InvalidArgument,
-        "Sponge construction at the bottom of the tree doesn't support inputs bigger than hash rate");
    if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
      THROW_ICICLE_ERR(
        IcicleError_t::InvalidArgument,
@@ -176,24 +175,21 @@ namespace merkle_tree {
    uint64_t number_of_subtrees = 1;
    uint64_t subtree_height = height;
    uint64_t subtree_bottom_layer_states = number_of_bottom_layer_states;
-    uint64_t subtree_states_size = subtree_bottom_layer_states * bottom_layer.width;
+    uint64_t subtree_leaves_size = subtree_bottom_layer_states * input_block_len;
+    uint64_t subtree_digests_size = subtree_bottom_layer_states * tree_config.digest_elements;

-    uint64_t subtree_digests_size;
-    if (compression.width != compression.preimage_max_length) {
-      // In that case, the states on layer 1 will require extending the states by (width / preimage_max_len) factor
-      subtree_digests_size =
-        subtree_states_size * bottom_layer.preimage_max_length / bottom_layer.width * tree_config.digest_elements;
-    } else {
-      subtree_digests_size = subtree_states_size / bottom_layer.width * tree_config.digest_elements;
-    }
-    size_t subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
+    size_t subtree_d_leaves_memory = std::max(
+      tree_config.are_inputs_on_device ? 0 : (sizeof(L) * subtree_leaves_size),
+      subtree_digests_size * sizeof(D) / tree_config.arity);
+    size_t subtree_memory_required = sizeof(D) * subtree_digests_size + subtree_d_leaves_memory;
    while (subtree_memory_required > STREAM_CHUNK_SIZE) {
      number_of_subtrees *= tree_config.arity;
      subtree_height--;
      subtree_bottom_layer_states /= tree_config.arity;
-      subtree_states_size /= tree_config.arity;
      subtree_digests_size /= tree_config.arity;
-      subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
+      subtree_leaves_size /= tree_config.arity;
+      subtree_d_leaves_memory /= tree_config.arity;
+      subtree_memory_required = sizeof(D) * subtree_digests_size + subtree_d_leaves_memory;
    }
    int cap_height = height - subtree_height;
    size_t caps_len = pow(tree_config.arity, cap_height) * tree_config.digest_elements;
@@ -221,19 +217,18 @@ namespace merkle_tree {
    std::cout << "Height of a subtree = " << subtree_height << std::endl;
    std::cout << "Cutoff height = " << height - subtree_height << std::endl;
    std::cout << "Number of leaves in a subtree = " << subtree_bottom_layer_states << std::endl;
-    std::cout << "State of a subtree = " << subtree_states_size << std::endl;
    std::cout << "Digest elements for a subtree = " << subtree_digests_size << std::endl;
-    std::cout << "Size of 1 subtree states = " << subtree_states_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
    std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
    std::cout << "Cap height = " << cap_height << std::endl;
    std::cout << "Enabling caps mode? " << caps_mode << std::endl;
+    std::cout << "Allocated " << subtree_d_leaves_memory << " bytes for d_leaves" << std::endl;
 #endif

    // Allocate memory for the leaves and digests
    // These are shared by streams in a pool
-    D *states_ptr, *digests_ptr;
-    CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_states_size * number_of_streams * sizeof(D), stream));
-    CHK_IF_RETURN(cudaMemsetAsync(states_ptr, 0, subtree_states_size * number_of_streams * sizeof(D), stream));
+    L* d_leaves_ptr;
+    D* digests_ptr;
+    CHK_IF_RETURN(cudaMallocAsync(&d_leaves_ptr, subtree_d_leaves_memory * number_of_streams, stream));
    CHK_IF_RETURN(cudaMallocAsync(&digests_ptr, subtree_digests_size * number_of_streams * sizeof(D), stream));
    // Wait for these allocations to finish
    CHK_IF_RETURN(cudaStreamSynchronize(stream));
@@ -244,7 +239,7 @@ namespace merkle_tree {
      cudaStream_t subtree_stream = streams[stream_idx];

      const L* subtree_leaves = leaves + subtree_idx * subtree_bottom_layer_states * input_block_len;
-      D* subtree_state = states_ptr + stream_idx * subtree_states_size;
+      L* subtree_d_leaves = (L*)((unsigned char*)d_leaves_ptr + stream_idx * subtree_d_leaves_memory);
      D* subtree_digests = digests_ptr + stream_idx * subtree_digests_size;

      int subtree_keep_rows = 0;
@@ -257,7 +252,7 @@ namespace merkle_tree {
      uint64_t start_segment_size = number_of_bottom_layer_states * tree_config.digest_elements;
      cudaError_t subtree_result = build_merkle_subtree<L, D>(
        subtree_leaves,             // leaves
-        subtree_state,              // state
+        subtree_d_leaves,           // d_leves
        subtree_digests,            // digests
        subtree_idx,                // subtree_idx
        subtree_height,             // subtree_height
@@ -278,10 +273,6 @@ namespace merkle_tree {
      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
    }

-    SpongeConfig sponge_config = default_sponge_config(tree_config.ctx);
-    sponge_config.are_inputs_on_device = tree_config.are_inputs_on_device;
-    sponge_config.are_outputs_on_device = true;
-    sponge_config.is_async = true;
    // Finish the top-level tree if any
    if (cap_height > 0) {
      size_t start_segment_size = caps_len / tree_config.arity;
@@ -295,25 +286,29 @@ namespace merkle_tree {
        }
      }
      CHK_IF_RETURN(cudaMemcpyAsync(
-        states_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
+        d_leaves_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
        (caps_mode || !tree_config.are_outputs_on_device) ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, stream));

      uint64_t number_of_states = caps_len / tree_config.arity / tree_config.digest_elements;

+      D* prev_layer = (D*)d_leaves_ptr;
+      D* next_layer = digests_ptr;
+
      size_t segment_size = start_segment_size;
      size_t segment_offset = start_segment_offset;
      while (number_of_states > 0) {
-        CHK_IF_RETURN(compression.compress_many(
-          states_ptr, digests_ptr, number_of_states, tree_config.digest_elements, sponge_config));
+        CHK_IF_RETURN(compression.run_hash_many_kernel(
+          (L*)prev_layer, next_layer, number_of_states, tree_config.digest_elements * tree_config.arity,
+          tree_config.digest_elements, tree_config.ctx));
        if (!tree_config.keep_rows || cap_height < tree_config.keep_rows + (int)caps_mode) {
          D* digests_with_offset = digests + segment_offset;
          CHK_IF_RETURN(cudaMemcpyAsync(
-            digests_with_offset, digests_ptr, number_of_states * tree_config.digest_elements * sizeof(D),
+            digests_with_offset, next_layer, number_of_states * tree_config.digest_elements * sizeof(D),
            cudaMemcpyDeviceToHost, stream));
          segment_offset += segment_size;
        }

-        if (number_of_states > 1) { swap<D>(&digests_ptr, &states_ptr); }
+        swap<D>(&prev_layer, &next_layer);

        segment_size /= tree_config.arity;
        cap_height--;
@@ -322,7 +317,7 @@ namespace merkle_tree {
      if (caps_mode) { free(caps); }
    }

-    CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
+    CHK_IF_RETURN(cudaFreeAsync(d_leaves_ptr, stream));
    CHK_IF_RETURN(cudaFreeAsync(digests_ptr, stream));
    if (!tree_config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < number_of_streams; i++) {
--- a/icicle/src/merkle-tree/mmcs.cu
+++ b/icicle/src/merkle-tree/mmcs.cu
@@ -16,10 +16,10 @@ namespace merkle_tree {
    uint64_t number_of_rows,
    D* digests,
    unsigned int digest_elements,
-    const SpongeHasher<L, D>& hasher,
+    const Hasher<L, D>& hasher,
    const device_context::DeviceContext& ctx)
  {
-    SpongeConfig sponge_config = default_sponge_config(ctx);
+    HashConfig sponge_config = default_hash_config(ctx);
    sponge_config.are_inputs_on_device = true;
    sponge_config.are_outputs_on_device = true;
    sponge_config.is_async = true;
@@ -57,8 +57,8 @@ namespace merkle_tree {
    unsigned int keep_rows;                  // Number of rows to keep
    bool are_inputs_on_device;
    bool caps_mode;
-    const SpongeHasher<L, D>* hasher = nullptr;
-    const SpongeHasher<L, D>* compression = nullptr;
+    const Hasher<L, D>* hasher = nullptr;
+    const Hasher<L, D>* compression = nullptr;
    const device_context::DeviceContext* ctx = nullptr;
  };

@@ -189,8 +189,8 @@ namespace merkle_tree {
    const Matrix<L>* inputs,
    const unsigned int number_of_inputs,
    D* digests,
-    const SpongeHasher<L, D>& hasher,
-    const SpongeHasher<L, D>& compression,
+    const Hasher<L, D>& hasher,
+    const Hasher<L, D>& compression,
    const TreeBuilderConfig& tree_config)
  {
    CHK_INIT_IF_RETURN();
--- a/icicle/src/merkle-tree/tests/merkle/Makefile
+++ b/icicle/src/merkle-tree/tests/merkle/Makefile
@@ -8,8 +8,7 @@ merkle_bls.o: ../../extern.cu ../../merkle.cu
 poseidon.o: ../../../poseidon/extern.cu
 	nvcc -o poseidon.o -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -c ../../../poseidon/extern.cu

-
-test_merkle: poseidon2.o merkle.o 
+test_merkle: test_poseidon2.cu poseidon2.o merkle.o 
 	nvcc -o test_merkle -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG poseidon2.o merkle.o test_poseidon2.cu
 	./test_merkle

--- a/icicle/src/merkle-tree/tests/merkle/test.cu
+++ b/icicle/src/merkle-tree/tests/merkle/test.cu
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <math.h>

+#define DEBUG
 #include "merkle-tree/merkle.cuh"

 #include "poseidon/poseidon.cuh"
--- a/icicle/src/merkle-tree/tests/merkle/test_poseidon2.cu
+++ b/icicle/src/merkle-tree/tests/merkle/test_poseidon2.cu
@@ -89,16 +89,17 @@ int main(int argc, char* argv[])
  // }

  scalar_t expected[64] = {
-    {1198029810}, {1114813365}, {241588005},  {735332587},  {201392606},  {623383436},  {60086186},   {1225304654},
-    {1501472115}, {891216097},  {184481194},  {855632748},  {1503541944}, {1483537725}, {1023563730}, {698957505},
-    {1322038939}, {1132881200}, {104782797},  {68847168},   {420051722},  {126069919},  {1350263697}, {1711085395},
-    {1322038939}, {1132881200}, {104782797},  {68847168},   {420051722},  {126069919},  {1350263697}, {1711085395},
-    {1019525203}, {127215304},  {1199733491}, {1473997036}, {548538385},  {364347137},  {570748364},  {426431873},
-    {926562920},  {6278762},    {1894248581}, {1304248433}, {1635020421}, {719342960},  {1373719279}, {700539301},
-    {708916911},  {925660920},  {994927540},  {1925434995}, {208534303},  {69614512},   {1701199215}, {1825115630}};
+    {876845485},  {1982055884}, {1232961929}, {1502814326}, {1731913687}, {351564698},  {449044700},  {656218013},
+    {1616800877}, {1324365320}, {651075613},  {1679193452}, {218302636},  {283697394},  {1141456517}, {253630808},
+    {936036237},  {1020969125}, {597252945},  {32839064},   {957901845},  {1137914369}, {155933167},  {986924657},
+    {1553746264}, {1007314324}, {1208763331}, {110389244},  {118704360},  {607471513},  {834479233},  {914998571},
+    {1086906039}, {1673233108}, {431115765},  {233068973},  {1974449092}, {1296268875}, {538093590},  {104288129},
+    {1011605567}, {53314351},   {1461404090}, {870754513},  {1212389386}, {1363519118}, {799527383},  {1258384762},
+    {678820782},  {1940801563}, {887764924},  {1006362075}, {2003940909}, {1213396717}, {1332793191}, {440259232}};

  for (int i = 0; i < digests_len; i++) {
    scalar_t root = digests[i];
+    // std::cout << root << std::endl;
    assert(root == expected[i]);
  }
  free(digests);
--- a/icicle/src/poseidon/extern.cu
+++ b/icicle/src/poseidon/extern.cu
@@ -52,7 +52,7 @@ namespace poseidon {
    unsigned int number_of_states,
    unsigned int input_block_len,
    unsigned int output_len,
-    const SpongeConfig& cfg)
+    const HashConfig& cfg)
  {
    return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
  }
--- a/icicle/src/poseidon/test.cu
+++ b/icicle/src/poseidon/test.cu
@@ -48,7 +48,7 @@ int main(int argc, char* argv[])

  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));

-  SpongeConfig cfg = default_sponge_config();
+  HashConfig cfg = default_hash_config();

  START_TIMER(poseidon_timer);
  poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, A, 1, cfg);
--- a/icicle/src/poseidon2/extern.cu
+++ b/icicle/src/poseidon2/extern.cu
@@ -56,7 +56,7 @@ namespace poseidon2 {
    unsigned int number_of_states,
    unsigned int input_block_len,
    unsigned int output_len,
-    hash::SpongeConfig& cfg)
+    hash::HashConfig& cfg)
  {
    return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
  }
--- a/icicle/src/poseidon2/test.cu
+++ b/icicle/src/poseidon2/test.cu
@@ -42,7 +42,7 @@ int main(int argc, char* argv[])

  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));

-  SpongeConfig cfg = default_sponge_config();
+  HashConfig cfg = default_hash_config();

  START_TIMER(poseidon_timer);
  poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, T, 1, cfg);
--- a/icicle/src/poseidon2/test_m31.cu
+++ b/icicle/src/poseidon2/test_m31.cu
@@ -36,7 +36,7 @@ int main(int argc, char* argv[])
  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
  scalar_t input = scalar_t::zero();

-  hash::SpongeConfig cfg = hash::default_sponge_config();
+  hash::HashConfig cfg = hash::default_hash_config();

  size_t number_of_repetitions = argc > 2 ? 1 << atoi(argv[2]) : 32;

--- a/wrappers/golang/core/sponge.go
+++ b/wrappers/golang/core/sponge.go
@@ -6,7 +6,7 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 )

-type SpongeConfig struct {
+type HashConfig struct {
 	/// Details related to the device such as its id and stream.
 	Ctx cr.DeviceContext

@@ -31,9 +31,9 @@ type SpongeConfig struct {
 	IsAsync bool
 }

-func GetDefaultSpongeConfig() SpongeConfig {
+func GetDefaultHashConfig() HashConfig {
 	ctx, _ := cr.GetDefaultDeviceContext()
-	return SpongeConfig{
+	return HashConfig{
 		ctx,
 		false,
 		false,
--- a/wrappers/golang/curves/bls12377/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bls12377/poseidon/include/poseidon.h
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t bls12_377_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t bls12_377_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t bls12_377_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/curves/bls12377/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bls12377/poseidon/poseidon.go
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.bls12_377_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/curves/bls12377/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bls12377/tests/poseidon_test.go
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := bls12_377.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/golang/curves/bls12381/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bls12381/poseidon/include/poseidon.h
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t bls12_381_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t bls12_381_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t bls12_381_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/curves/bls12381/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bls12381/poseidon/poseidon.go
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.bls12_381_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/curves/bls12381/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bls12381/tests/poseidon_test.go
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := bls12_381.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/golang/curves/bn254/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bn254/poseidon/include/poseidon.h
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t bn254_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t bn254_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t bn254_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/curves/bn254/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bn254/poseidon/poseidon.go
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.bn254_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/curves/bn254/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bn254/tests/poseidon_test.go
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := bn254.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/golang/curves/bw6761/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bw6761/poseidon/include/poseidon.h
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t bw6_761_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t bw6_761_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t bw6_761_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/curves/bw6761/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bw6761/poseidon/poseidon.go
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.bw6_761_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/curves/bw6761/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bw6761/tests/poseidon_test.go
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := bw6_761.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/golang/curves/grumpkin/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/grumpkin/poseidon/include/poseidon.h
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t grumpkin_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t grumpkin_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t grumpkin_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/curves/grumpkin/poseidon/poseidon.go
+++ b/wrappers/golang/curves/grumpkin/poseidon/poseidon.go
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.grumpkin_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/curves/grumpkin/tests/poseidon_test.go
+++ b/wrappers/golang/curves/grumpkin/tests/poseidon_test.go
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := grumpkin.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/golang/hash/keccak/hasher.go
+++ b/wrappers/golang/hash/keccak/hasher.go
@@ -19,16 +19,16 @@ const (
 	Hash512 HashSize = 512
 )

-type KeccakConfig struct {
+type HashConfig struct {
 	Ctx                cr.DeviceContext
 	areInputsOnDevice  bool
 	areOutputsOnDevice bool
 	IsAsync            bool
 }

-func GetDefaultKeccakConfig() KeccakConfig {
+func GetDefaultHashConfig() HashConfig {
 	ctx, _ := cr.GetDefaultDeviceContext()
-	return KeccakConfig{
+	return HashConfig{
 		ctx,
 		false,
 		false,
@@ -36,7 +36,7 @@ func GetDefaultKeccakConfig() KeccakConfig {
 	}
 }

-func keccakCheck(input core.HostOrDeviceSlice, output core.HostOrDeviceSlice, cfg *KeccakConfig, hashSize HashSize, numberOfBlocks int32) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer) {
+func keccakCheck(input core.HostOrDeviceSlice, output core.HostOrDeviceSlice, cfg *HashConfig, hashSize HashSize, numberOfBlocks int32) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer) {
 	cfg.areInputsOnDevice = input.IsOnDevice()
 	cfg.areOutputsOnDevice = output.IsOnDevice()

@@ -61,13 +61,13 @@ func keccakCheck(input core.HostOrDeviceSlice, output core.HostOrDeviceSlice, cf
 	return input.AsUnsafePointer(), output.AsUnsafePointer(), unsafe.Pointer(cfg)
 }

-func keccak(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig, hashSize HashSize) (ret core.IcicleError) {
+func keccak(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig, hashSize HashSize) (ret core.IcicleError) {
 	inputPointer, outputPointer, cfgPointer := keccakCheck(input, output, config, hashSize, numberOfBlocks)
 	cInput := (*C.uint8_t)(inputPointer)
 	cOutput := (*C.uint8_t)(outputPointer)
 	cInputBlockSize := (C.int)(inputBlockSize)
 	cNumberOfBlocks := (C.int)(numberOfBlocks)
-	cConfig := (*C.KeccakConfig)(cfgPointer)
+	cConfig := (*C.HashConfig)(cfgPointer)

 	switch hashSize {
 	case Hash256:
@@ -79,10 +79,10 @@ func keccak(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32,
 	return ret
 }

-func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError {
+func Keccak256(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError {
 	return keccak(input, inputBlockSize, numberOfBlocks, output, config, Hash256)
 }

-func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *KeccakConfig) core.IcicleError {
+func Keccak512(input core.HostOrDeviceSlice, inputBlockSize, numberOfBlocks int32, output core.HostOrDeviceSlice, config *HashConfig) core.IcicleError {
 	return keccak(input, inputBlockSize, numberOfBlocks, output, config, Hash512)
 }
--- a/wrappers/golang/hash/keccak/include/keccak.h
+++ b/wrappers/golang/hash/keccak/include/keccak.h
@@ -8,10 +8,10 @@
 extern "C" {
 #endif

-typedef struct KeccakConfig KeccakConfig;
+typedef struct HashConfig HashConfig;

-cudaError_t keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig* config);
-cudaError_t keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig* config);
+cudaError_t keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig* config);
+cudaError_t keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, HashConfig* config);

 #ifdef __cplusplus
 }
--- a/wrappers/golang/hash/keccak/tests/hasher_test.go
+++ b/wrappers/golang/hash/keccak/tests/hasher_test.go
@@ -23,7 +23,7 @@ func TestSimpleHash256(t *testing.T) {
 	input := createHostSliceFromHexString("1725b6679911bfe75ad7e248fbeec4a01034feace99aca43cd95d338a37db8d04b4aa5d83c8f8f5bdb8f7f98cec9a658f7f8061a6de07dcfd61db169cc7e666e1971adb4c7e97c43361c9a9eed8bb7b5c33cfe576a383a7440803996fd28148b")
 	outHost := make(core.HostSlice[uint8], 32)

-	cfg := keccak.GetDefaultKeccakConfig()
+	cfg := keccak.GetDefaultHashConfig()
 	e := keccak.Keccak256(input, int32(input.Len()), 1, outHost, &cfg)
 	assert.Equal(t, e.CudaErrorCode, cr.CudaSuccess, "Hashing failed")
 	t.Log(outHost)
@@ -34,7 +34,7 @@ func TestBatchHash256(t *testing.T) {
 	input := createHostSliceFromHexString("1725b6679911bfe75ad7e248fbeec4a01034feace99aca43cd95d338a37db8d04b4aa5d83c8f8f5bdb8f7f98cec9a658f7f8061a6de07dcfd61db169cc7e666e1971adb4c7e97c43361c9a9eed8bb7b5c33cfe576a383a7440803996fd28148b")
 	outHost := make(core.HostSlice[uint8], 32*2)

-	cfg := keccak.GetDefaultKeccakConfig()
+	cfg := keccak.GetDefaultHashConfig()
 	e := keccak.Keccak256(input, int32(input.Len()/2), 2, outHost, &cfg)
 	assert.Equal(t, e.CudaErrorCode, cr.CudaSuccess, "Hashing failed")
 	t.Log(outHost)
@@ -46,7 +46,7 @@ func TestSimpleHash512(t *testing.T) {
 	input := createHostSliceFromHexString("1725b6679911bfe75ad7e248fbeec4a01034feace99aca43cd95d338a37db8d04b4aa5d83c8f8f5bdb8f7f98cec9a658f7f8061a6de07dcfd61db169cc7e666e1971adb4c7e97c43361c9a9eed8bb7b5c33cfe576a383a7440803996fd28148b")
 	outHost := make(core.HostSlice[uint8], 64)

-	cfg := keccak.GetDefaultKeccakConfig()
+	cfg := keccak.GetDefaultHashConfig()
 	e := keccak.Keccak512(input, int32(input.Len()), 1, outHost, &cfg)
 	assert.Equal(t, e.CudaErrorCode, cr.CudaSuccess, "Hashing failed")
 	t.Log(outHost)
@@ -57,7 +57,7 @@ func TestBatchHash512(t *testing.T) {
 	input := createHostSliceFromHexString("1725b6679911bfe75ad7e248fbeec4a01034feace99aca43cd95d338a37db8d04b4aa5d83c8f8f5bdb8f7f98cec9a658f7f8061a6de07dcfd61db169cc7e666e1971adb4c7e97c43361c9a9eed8bb7b5c33cfe576a383a7440803996fd28148b")
 	outHost := make(core.HostSlice[uint8], 64*2)

-	cfg := keccak.GetDefaultKeccakConfig()
+	cfg := keccak.GetDefaultHashConfig()
 	e := keccak.Keccak512(input, int32(input.Len()/2), 2, outHost, &cfg)
 	assert.Equal(t, e.CudaErrorCode, cr.CudaSuccess, "Hashing failed")
 	t.Log(outHost)
--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon.go.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon.go.tmpl
@@ -58,7 +58,7 @@ func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
 	return &p, err
 }

-func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.HashConfig) core.IcicleError {
 	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
 	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

@@ -67,7 +67,7 @@ func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.Ho
 	cNumberOfStates := (C.uint)(numberOfStates)
 	cInputBlockLen := (C.uint)(inputBlockLen)
 	cOutputLen := (C.uint)(outputLen)
-	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	cCfg := (*C.HashConfig)(unsafe.Pointer(cfg))
 	__ret := C.{{.Field}}_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
@@ -79,8 +79,8 @@ func (poseidon *Poseidon) Delete() core.IcicleError {
 	return core.FromCudaError(err)
 }

-func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
-	cfg := core.GetDefaultSpongeConfig()
+func (poseidon *Poseidon) GetDefaultHashConfig() core.HashConfig {
+	cfg := core.GetDefaultHashConfig()
 	cfg.InputRate = poseidon.width - 1
 	cfg.OutputRate = poseidon.width
 	return cfg
--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon.h.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon.h.tmpl
@@ -12,7 +12,7 @@ typedef struct scalar_t scalar_t;
 typedef struct DeviceContext DeviceContext;
 typedef struct TreeBuilderConfig TreeBuilderConfig;
 typedef struct PoseidonInst PoseidonInst;
-typedef struct SpongeConfig SpongeConfig;
+typedef struct HashConfig HashConfig;


 cudaError_t {{.Field}}_poseidon_create_cuda(
@@ -40,7 +40,7 @@ cudaError_t {{.Field}}_poseidon_hash_many_cuda(
  unsigned int number_of_states,
  unsigned int input_block_len,
  unsigned int output_len,
-  SpongeConfig* cfg);
+  HashConfig* cfg);

 cudaError_t {{.Field}}_poseidon_delete_cuda(PoseidonInst* poseidon);

--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon_test.go.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon_test.go.tmpl
@@ -19,7 +19,7 @@ func TestPoseidon(t *testing.T) {
 	p, err := poseidon.Load(uint32(arity), &ctx)
 	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	cfg := p.GetDefaultSpongeConfig()
+	cfg := p.GetDefaultHashConfig()

 	scalars := {{.Field}}.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
--- a/wrappers/rust/icicle-core/src/hash.rs
+++ b/wrappers/rust/icicle-core/src/hash.rs
@@ -11,44 +11,28 @@ use crate::ntt::IcicleResult;
 /// Struct that encodes Sponge hash parameters.
 #[repr(C)]
 #[derive(Debug, Clone)]
-pub struct SpongeConfig<'a> {
+pub struct HashConfig<'a> {
    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
    pub ctx: DeviceContext<'a>,
-    pub(crate) are_inputs_on_device: bool,
-    pub(crate) are_outputs_on_device: bool,
-    pub input_rate: u32,
-    pub output_rate: u32,
-    pub offset: u32,
-
-    /// If true - input should be already aligned for poseidon permutation.
-    /// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
-    /// not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
-    pub recursive_squeeze: bool,
-
-    /// If true, hash results will also be copied in the input pointer in aligned format
-    pub aligned: bool,
+    pub are_inputs_on_device: bool,
+    pub are_outputs_on_device: bool,
    /// Whether to run the sponge operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
    /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
    pub is_async: bool,
 }

-impl<'a> Default for SpongeConfig<'a> {
+impl<'a> Default for HashConfig<'a> {
    fn default() -> Self {
        Self::default_for_device(DEFAULT_DEVICE_ID)
    }
 }

-impl<'a> SpongeConfig<'a> {
+impl<'a> HashConfig<'a> {
    pub(crate) fn default_for_device(device_id: usize) -> Self {
-        SpongeConfig {
+        HashConfig {
            ctx: DeviceContext::default_for_device(device_id),
            are_inputs_on_device: false,
            are_outputs_on_device: false,
-            input_rate: 0,
-            output_rate: 0,
-            offset: 0,
-            recursive_squeeze: false,
-            aligned: false,
            is_async: false,
        }
    }
@@ -62,10 +46,10 @@ pub trait SpongeHash<PreImage, Image> {
        number_of_states: usize,
        input_block_len: usize,
        output_len: usize,
-        cfg: &SpongeConfig,
+        cfg: &HashConfig,
    ) -> IcicleResult<()>;

-    fn default_config<'a>(&self) -> SpongeConfig<'a>;
+    fn default_config<'a>(&self) -> HashConfig<'a>;

    fn get_handle(&self) -> *const c_void;
 }
--- a/wrappers/rust/icicle-core/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon/mod.rs
@@ -7,7 +7,7 @@ use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSli

 use crate::{
    error::IcicleResult,
-    hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
+    hash::{sponge_check_input, sponge_check_outputs, HashConfig, SpongeHash},
    traits::FieldImpl,
 };

@@ -87,7 +87,7 @@ where
        number_of_states: usize,
        input_block_len: usize,
        output_len: usize,
-        cfg: &SpongeConfig,
+        cfg: &HashConfig,
    ) -> IcicleResult<()> {
        sponge_check_input(inputs, number_of_states, input_block_len, self.width - 1, &cfg.ctx);
        sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);
@@ -107,11 +107,8 @@ where
        )
    }

-    fn default_config<'a>(&self) -> SpongeConfig<'a> {
-        let mut cfg = SpongeConfig::default();
-        cfg.input_rate = self.width as u32 - 1;
-        cfg.output_rate = self.width as u32;
-        cfg
+    fn default_config<'a>(&self) -> HashConfig<'a> {
+        HashConfig::default()
    }
 }

@@ -148,7 +145,7 @@ pub trait PoseidonImpl<F: FieldImpl> {
        input_block_len: u32,
        output_len: u32,
        poseidon: PoseidonHandle,
-        cfg: &SpongeConfig,
+        cfg: &HashConfig,
    ) -> IcicleResult<()>;

    fn delete(poseidon: PoseidonHandle) -> IcicleResult<()>;
@@ -163,7 +160,7 @@ macro_rules! impl_poseidon {
      $field_config:ident
    ) => {
        mod $field_prefix_ident {
-            use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, PoseidonHandle, SpongeConfig};
+            use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, HashConfig, PoseidonHandle};
            extern "C" {
                #[link_name = concat!($field_prefix, "_poseidon_create_cuda")]
                pub(crate) fn create(
@@ -194,7 +191,7 @@ macro_rules! impl_poseidon {
                    number_of_states: u32,
                    input_block_len: u32,
                    output_len: u32,
-                    cfg: &SpongeConfig,
+                    cfg: &HashConfig,
                ) -> CudaError;
            }
        }
@@ -248,7 +245,7 @@ macro_rules! impl_poseidon {
                input_block_len: u32,
                output_len: u32,
                poseidon: PoseidonHandle,
-                cfg: &SpongeConfig,
+                cfg: &HashConfig,
            ) -> IcicleResult<()> {
                unsafe {
                    $field_prefix_ident::hash_many(
--- a/wrappers/rust/icicle-core/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon2/mod.rs
@@ -7,7 +7,7 @@ use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSli

 use crate::{
    error::IcicleResult,
-    hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
+    hash::{sponge_check_input, sponge_check_outputs, HashConfig, SpongeHash},
    traits::FieldImpl,
 };

@@ -32,6 +32,7 @@ where
    <F as FieldImpl>::Config: Poseidon2Impl<F>,
 {
    width: usize,
+    rate: usize,
    handle: Poseidon2Handle,
    phantom: PhantomData<F>,
 }
@@ -52,6 +53,7 @@ where
            .and_then(|handle| {
                Ok(Self {
                    width,
+                    rate,
                    handle,
                    phantom: PhantomData,
                })
@@ -85,6 +87,7 @@ where
        .and_then(|handle| {
            Ok(Self {
                width,
+                rate,
                handle,
                phantom: PhantomData,
            })
@@ -108,15 +111,9 @@ where
        number_of_states: usize,
        input_block_len: usize,
        output_len: usize,
-        cfg: &SpongeConfig,
+        cfg: &HashConfig,
    ) -> IcicleResult<()> {
-        sponge_check_input(
-            inputs,
-            number_of_states,
-            input_block_len,
-            cfg.input_rate as usize,
-            &cfg.ctx,
-        );
+        sponge_check_input(inputs, number_of_states, input_block_len, self.rate, &cfg.ctx);
        sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);

        let mut local_cfg = cfg.clone();
@@ -134,11 +131,8 @@ where
        )
    }

-    fn default_config<'a>(&self) -> SpongeConfig<'a> {
-        let mut cfg = SpongeConfig::default();
-        cfg.input_rate = self.width as u32;
-        cfg.output_rate = self.width as u32;
-        cfg
+    fn default_config<'a>(&self) -> HashConfig<'a> {
+        HashConfig::default()
    }
 }

@@ -181,7 +175,7 @@ pub trait Poseidon2Impl<F: FieldImpl> {
        input_block_len: u32,
        output_len: u32,
        poseidon: Poseidon2Handle,
-        cfg: &SpongeConfig,
+        cfg: &HashConfig,
    ) -> IcicleResult<()>;

    fn delete(poseidon: Poseidon2Handle) -> IcicleResult<()>;
@@ -197,8 +191,8 @@ macro_rules! impl_poseidon2 {
    ) => {
        mod $field_prefix_ident {
            use crate::poseidon2::{
-                $field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Handle,
-                SpongeConfig,
+                $field, $field_config, CudaError, DeviceContext, DiffusionStrategy, HashConfig, MdsType,
+                Poseidon2Handle,
            };
            use icicle_core::error::IcicleError;
            extern "C" {
@@ -238,7 +232,7 @@ macro_rules! impl_poseidon2 {
                    number_of_states: u32,
                    input_block_len: u32,
                    output_len: u32,
-                    cfg: &SpongeConfig,
+                    cfg: &HashConfig,
                ) -> CudaError;
            }
        }
@@ -298,7 +292,7 @@ macro_rules! impl_poseidon2 {
                input_block_len: u32,
                output_len: u32,
                poseidon: Poseidon2Handle,
-                cfg: &SpongeConfig,
+                cfg: &HashConfig,
            ) -> IcicleResult<()> {
                unsafe {
                    $field_prefix_ident::hash_many(
--- a/wrappers/rust/icicle-curves/icicle-bls12-377/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-377/src/poseidon/mod.rs
@@ -3,7 +3,7 @@ use crate::curve::{BaseCfg, BaseField};
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon;
 use icicle_core::poseidon::{PoseidonHandle, PoseidonImpl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-curves/icicle-bls12-381/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bls12-381/src/poseidon/mod.rs
@@ -1,7 +1,7 @@
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon;
 use icicle_core::poseidon::{PoseidonHandle, PoseidonImpl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon/mod.rs
@@ -1,7 +1,7 @@
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon;
 use icicle_core::poseidon::{PoseidonHandle, PoseidonImpl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon2/mod.rs
@@ -1,7 +1,7 @@
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon2;
 use icicle_core::poseidon2::{DiffusionStrategy, MdsType, Poseidon2Handle, Poseidon2Impl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-curves/icicle-grumpkin/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-grumpkin/src/poseidon/mod.rs
@@ -1,7 +1,7 @@
 use crate::curve::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon;
 use icicle_core::poseidon::{PoseidonHandle, PoseidonImpl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs
@@ -1,7 +1,7 @@
 use crate::field::{ScalarCfg, ScalarField};

 use icicle_core::error::IcicleResult;
-use icicle_core::hash::SpongeConfig;
+use icicle_core::hash::HashConfig;
 use icicle_core::impl_poseidon2;
 use icicle_core::poseidon2::{DiffusionStrategy, MdsType, Poseidon2Handle, Poseidon2Impl};
 use icicle_core::traits::IcicleResultWrap;
--- a/wrappers/rust/icicle-hash/src/keccak/mod.rs
+++ b/wrappers/rust/icicle-hash/src/keccak/mod.rs
@@ -1,84 +1,64 @@
+use icicle_core::hash::HashConfig;
+use icicle_core::tree::TreeBuilderConfig;
 use icicle_cuda_runtime::error::CudaError;
-use icicle_cuda_runtime::{
-    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
-    memory::HostOrDeviceSlice,
-};
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;

 use icicle_core::error::IcicleResult;
 use icicle_core::traits::IcicleResultWrap;

 pub mod tests;

-#[repr(C)]
-#[derive(Debug, Clone)]
-pub struct KeccakConfig<'a> {
-    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-    pub ctx: DeviceContext<'a>,
-
-    /// True if inputs are on device and false if they're on host. Default value: false.
-    pub are_inputs_on_device: bool,
-
-    /// If true, output is preserved on device, otherwise on host. Default value: false.
-    pub are_outputs_on_device: bool,
-
-    /// Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
-    /// non-blocking and you'd need to synchronize it explicitly by running
-    /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
-    /// function will block the current CPU thread.
-    pub is_async: bool,
-}
-
-impl<'a> Default for KeccakConfig<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
-    }
-}
-
-impl<'a> KeccakConfig<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        KeccakConfig {
-            ctx: DeviceContext::default_for_device(device_id),
-            are_inputs_on_device: false,
-            are_outputs_on_device: false,
-            is_async: false,
-        }
-    }
-}
-
 extern "C" {
    pub(crate) fn keccak256_cuda(
        input: *const u8,
-        input_block_size: i32,
-        number_of_blocks: i32,
+        input_block_size: u32,
+        number_of_blocks: u32,
        output: *mut u8,
-        config: &KeccakConfig,
+        config: &HashConfig,
    ) -> CudaError;

    pub(crate) fn keccak512_cuda(
        input: *const u8,
-        input_block_size: i32,
-        number_of_blocks: i32,
+        input_block_size: u32,
+        number_of_blocks: u32,
        output: *mut u8,
-        config: &KeccakConfig,
+        config: &HashConfig,
+    ) -> CudaError;
+
+    pub(crate) fn build_keccak256_merkle_tree_cuda(
+        leaves: *const u8,
+        digests: *mut u64,
+        height: u32,
+        input_block_len: u32,
+        config: &TreeBuilderConfig,
+    ) -> CudaError;
+
+    pub(crate) fn build_keccak512_merkle_tree_cuda(
+        leaves: *const u8,
+        digests: *mut u64,
+        height: u32,
+        input_block_len: u32,
+        config: &TreeBuilderConfig,
    ) -> CudaError;
 }

 pub fn keccak256(
    input: &(impl HostOrDeviceSlice<u8> + ?Sized),
-    input_block_size: i32,
-    number_of_blocks: i32,
+    input_block_size: u32,
+    number_of_blocks: u32,
    output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
-    config: &mut KeccakConfig,
+    config: &HashConfig,
 ) -> IcicleResult<()> {
-    config.are_inputs_on_device = input.is_on_device();
-    config.are_outputs_on_device = output.is_on_device();
+    let mut local_cfg = config.clone();
+    local_cfg.are_inputs_on_device = input.is_on_device();
+    local_cfg.are_outputs_on_device = output.is_on_device();
    unsafe {
        keccak256_cuda(
            input.as_ptr(),
            input_block_size,
            number_of_blocks,
            output.as_mut_ptr(),
-            config,
+            &local_cfg,
        )
        .wrap()
    }
@@ -86,19 +66,58 @@ pub fn keccak256(

 pub fn keccak512(
    input: &(impl HostOrDeviceSlice<u8> + ?Sized),
-    input_block_size: i32,
-    number_of_blocks: i32,
+    input_block_size: u32,
+    number_of_blocks: u32,
    output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
-    config: &mut KeccakConfig,
+    config: &HashConfig,
 ) -> IcicleResult<()> {
-    config.are_inputs_on_device = input.is_on_device();
-    config.are_outputs_on_device = output.is_on_device();
+    let mut local_cfg = config.clone();
+    local_cfg.are_inputs_on_device = input.is_on_device();
+    local_cfg.are_outputs_on_device = output.is_on_device();
    unsafe {
        keccak512_cuda(
            input.as_ptr(),
            input_block_size,
            number_of_blocks,
            output.as_mut_ptr(),
+            &local_cfg,
+        )
+        .wrap()
+    }
+}
+
+pub fn build_keccak256_merkle_tree(
+    leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
+    digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
+    height: usize,
+    input_block_len: usize,
+    config: &TreeBuilderConfig,
+) -> IcicleResult<()> {
+    unsafe {
+        build_keccak256_merkle_tree_cuda(
+            leaves.as_ptr(),
+            digests.as_mut_ptr(),
+            height as u32,
+            input_block_len as u32,
+            config,
+        )
+        .wrap()
+    }
+}
+
+pub fn build_keccak512_merkle_tree(
+    leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
+    digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
+    height: usize,
+    input_block_len: usize,
+    config: &TreeBuilderConfig,
+) -> IcicleResult<()> {
+    unsafe {
+        build_keccak512_merkle_tree_cuda(
+            leaves.as_ptr(),
+            digests.as_mut_ptr(),
+            height as u32,
+            input_block_len as u32,
            config,
        )
        .wrap()
--- a/wrappers/rust/icicle-hash/src/keccak/tests.rs
+++ b/wrappers/rust/icicle-hash/src/keccak/tests.rs
@@ -1 +1,48 @@
+#[cfg(test)]
+pub(crate) mod tests {
+    use icicle_core::{
+        hash::HashConfig,
+        tree::{merkle_tree_digests_len, TreeBuilderConfig},
+    };
+    use icicle_cuda_runtime::memory::HostSlice;

+    use crate::keccak::{build_keccak256_merkle_tree, keccak256};
+
+    #[test]
+    fn keccak_hash_test() {
+        let config = HashConfig::default();
+        let input_block_len = 136;
+        let number_of_hashes = 1024;
+
+        let preimages = vec![1u8; number_of_hashes * input_block_len];
+        let mut digests = vec![0u8; number_of_hashes * 64];
+
+        let preimages_slice = HostSlice::from_slice(&preimages);
+        let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+        keccak256(
+            preimages_slice,
+            input_block_len as u32,
+            number_of_hashes as u32,
+            digests_slice,
+            &config,
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn keccak_merkle_tree_test() {
+        let mut config = TreeBuilderConfig::default();
+        config.arity = 2;
+        let height = 22;
+        let input_block_len = 136;
+        let leaves = vec![1u8; (1 << height) * input_block_len];
+        let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
+
+        let leaves_slice = HostSlice::from_slice(&leaves);
+        let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+        build_keccak256_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
+        println!("Root: {:?}", digests_slice[0]);
+    }
+}