Feat/roman/tree builder (#525)

# Updates

## Hashing

- Added the SpongeHasher class
- It can accept any hash function as an argument
- Absorb and squeeze are now separate steps
- Memory management is now mostly handled by the SpongeHasher class; each hash function only describes its permutation kernels

## Tree builder

- The tree builder is now hash-agnostic
- The tree builder now supports 2D input (matrices)
- The tree builder can now use two different hash functions: one for layer 0 and one for the compression layers

## Poseidon1

- Interface changed to classes
- Now allows any alpha
- Constants no longer have to be passed in a single vector
- Now allows any domain tag
- Constants are now released when they go out of scope
- Rust wrappers changed to the Poseidon struct

## Poseidon2

- Interface changed to classes
- Constants are now released when they go out of scope
- Rust wrappers changed to the Poseidon2 struct

## Keccak

- Added the Keccak class, which inherits from SpongeHasher
- No longer uses GPU registers for storing states

To do:

- [x] Update Poseidon1 Golang bindings
- [x] Update Poseidon1 examples
- [x] Fix Poseidon2 CUDA test
- [x] Fix Poseidon2 Merkle tree builder test
- [x] Update Keccak class with the new design
- [x] Update Keccak test
- [x] Check Keccak correctness
- [x] Update tree builder Rust wrappers
- [x] Leave doc comments

Future work:

- [ ] Add Keccak Merkle tree builder externs
- [ ] Add Keccak Rust tree builder wrappers
- [ ] Write docs
- [ ] Add example
- [ ] Fix device output for tree builder

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com>
2026-01-06 22:24:06 -05:00 · 2024-07-11 13:46:25 +07:00
parent 2d4059c61f
commit 7fd9ed1b49
125 changed files with 8002 additions and 4097 deletions
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -6,6 +6,9 @@
 #include "api/bn254.h"
 #include "gpu-utils/error_handler.cuh"

+#include "poseidon/poseidon.cuh"
+#include "hash/hash.cuh"
+
 using namespace poseidon;
 using namespace bn254;

@@ -20,31 +23,20 @@ void checkCudaError(cudaError_t error)
 // these global constants go into template calls
 const int size_col = 11;

-// this function executes the Poseidon thread
 void threadPoseidon(
  device_context::DeviceContext ctx,
  unsigned size_partition,
  scalar_t* layers,
  scalar_t* column_hashes,
-  PoseidonConstants<scalar_t>* constants)
+  Poseidon<scalar_t> * poseidon)
 {
  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
  if (err_result != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
    return;
  }
-  // CHK_IF_RETURN(); I can't use it in a standard thread function
-  PoseidonConfig column_config = {
-    ctx,   // ctx
-    false, // are_inputes_on_device
-    false, // are_outputs_on_device
-    false, // input_is_a_state
-    false, // aligned
-    false, // loop_state
-    false, // is_async
-  };
-  cudaError_t err =
-    bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
+  SpongeConfig column_config = default_sponge_config(ctx);
+  cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t) size_partition, size_col, 1, column_config);
  checkCudaError(err);
 }

@@ -59,6 +51,11 @@ using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::p
    exit(EXIT_FAILURE);                                                                                                \
  }

+#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
+    exit(EXIT_FAILURE); \
+}
+
 int main()
 {
  const unsigned size_row = (1 << 30);
@@ -116,19 +113,18 @@ int main()
  scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
  CHECK_ALLOC(column_hash1);

-  PoseidonConstants<scalar_t> column_constants0, column_constants1;
-  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
-  cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
-  if (err_result != cudaSuccess) {
-    std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
-    return;
-  }
-  bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
+    Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        return; 
+    }
+    Poseidon<scalar_t> column_poseidon1(size_col, ctx1);

  std::cout << "Parallel execution of Poseidon threads" << std::endl;
  START_TIMER(parallel);
-  std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
-  std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
+  std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
+  std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);

  // Wait for the threads to finish
  thread0.join();
@@ -141,9 +137,9 @@ int main()

  std::cout << "Sequential execution of Poseidon threads" << std::endl;
  START_TIMER(sequential);
-  std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+  std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
  thread2.join();
-  std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+  std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
  thread3.join();
  END_TIMER(sequential, "1 GPU");
  std::cout << "Output Data from Thread 2: ";
--- a/examples/c++/polynomial-api/example.cu
+++ b/examples/c++/polynomial-api/example.cu
@@ -3,13 +3,11 @@
 #include "polynomials/polynomials.h"
 #include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
 #include "ntt/ntt.cuh"
-#include "poseidon/tree/merkle.cuh"
+
 #include "api/bn254.h"
 #include <chrono>

-// using namespace field_config;
 using namespace polynomials;
-using namespace merkle;
 using namespace bn254;

 // define the polynomial type
--- a/examples/c++/poseidon/example.cu
+++ b/examples/c++/poseidon/example.cu
@@ -4,6 +4,8 @@

 #include "api/bn254.h"
 #include "curves/params/bn254.cuh"
+#include "poseidon/poseidon.cuh"
+#include "hash/hash.cuh"
 using namespace poseidon;
 using namespace bn254;

@@ -14,13 +16,12 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level

 // We assume the tree has leaves already set, compute all other levels
 void build_tree(
-  const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
+  const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, SpongeConfig &config)
 {
  for (uint32_t level = tree_height - 1; level > 0; level--) {
    const uint32_t next_level = level - 1;
    const uint32_t next_level_width = 1 << next_level;
-    bn254_poseidon_hash_cuda(
-      &tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
+    poseidon.hash_many(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, 1, config);
  }
 }

@@ -65,8 +66,8 @@ uint32_t validate_proof(
  const uint32_t tree_height,
  const uint32_t* proof_lr,
  const scalar_t* proof_hash,
-  PoseidonConstants<scalar_t>* constants,
-  PoseidonConfig config)
+  Poseidon<scalar_t> &poseidon,
+  SpongeConfig &config)
 {
  scalar_t hashes_in[2], hash_out[1], level_hash;
  level_hash = hash;
@@ -79,7 +80,7 @@ uint32_t validate_proof(
      hashes_in[1] = level_hash;
    }
    // next level hash
-    bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
+    poseidon.hash_many(hashes_in, hash_out, 1, 2, 1, config);
    level_hash = hash_out[0];
  }
  return proof_hash[0] == level_hash;
@@ -109,16 +110,15 @@ int main(int argc, char* argv[])
    d = d + scalar_t::one();
  }
  std::cout << "Hashing blocks into tree leaves..." << std::endl;
-  PoseidonConstants<scalar_t> constants;
-  bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
-  PoseidonConfig config = default_poseidon_config(data_arity + 1);
-  bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
+
+  Poseidon<scalar_t> poseidon(data_arity, ctx);
+  SpongeConfig config = default_sponge_config(ctx); 
+  poseidon.hash_many(data, &tree[tree_index(leaf_level, 0)], tree_width, data_arity, 1, config);

  std::cout << "3. Building Merkle tree" << std::endl;
-  PoseidonConstants<scalar_t> tree_constants;
-  bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
-  PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
-  build_tree(tree_height, tree, &tree_constants, tree_config);
+  Poseidon<scalar_t> tree_poseidon(tree_arity, ctx);
+  SpongeConfig tree_config = default_sponge_config(ctx);
+  build_tree(tree_height, tree, tree_poseidon, tree_config);

  std::cout << "4. Generate membership proof" << std::endl;
  uint32_t position = tree_width - 1;
@@ -133,13 +133,13 @@ int main(int argc, char* argv[])
  std::cout << "5. Validate the hash membership" << std::endl;
  uint32_t validated;
  const scalar_t hash = tree[tree_index(leaf_level, query_position)];
-  validated = validate_proof(hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
+  validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
  std::cout << "Validated: " << validated << std::endl;

  std::cout << "6. Tamper the hash" << std::endl;
  const scalar_t tampered_hash = hash + scalar_t::one();
-  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
-
+  validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
+  
  std::cout << "7. Invalidate tamper hash membership" << std::endl;
  std::cout << "Validated: " << validated << std::endl;
  return 0;
--- a/examples/rust/poseidon/src/main.rs
+++ b/examples/rust/poseidon/src/main.rs
@@ -2,7 +2,8 @@ use icicle_bls12_381::curve::ScalarField as F;

 use icicle_cuda_runtime::device_context::DeviceContext;

-use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
+use icicle_core::hash::{SpongeHash, SpongeConfig};
+use icicle_core::poseidon::Poseidon;
 use icicle_core::traits::FieldImpl;
 use icicle_cuda_runtime::memory::HostSlice;

@@ -24,14 +25,14 @@ fn main() {
    let test_size = 1 << size;

    println!("Running Icicle Examples: Rust Poseidon Hash");
-    let arity = 2u32;
+    let arity = 2;
    println!(
        "---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
        arity
    );
    let ctx = DeviceContext::default();
-    let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
-    let config = PoseidonConfig::default();
+    let poseidon = Poseidon::load(arity, &ctx).unwrap();
+    let config = SpongeConfig::default();

    println!(
        "---------------------- Input size 2^{}={} ------------------------",
@@ -45,12 +46,12 @@ fn main() {
    println!("Executing BLS12-381 Poseidon Hash on device...");
    #[cfg(feature = "profile")]
    let start = Instant::now();
-    poseidon_hash_many::<F>(
+    poseidon.hash_many(
        input_slice,
        output_slice,
-        test_size as u32,
-        arity as u32,
-        &constants,
+        test_size,
+        arity,
+        1,
        &config,
    )
    .unwrap();
--- a/icicle/include/api/babybear.h
+++ b/icicle/include/api/babybear.h
@@ -9,58 +9,67 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "fields/stark_fields/babybear.cuh"
 #include "ntt/ntt.cuh"
 #include "vec_ops/vec_ops.cuh"
-#include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"
 #include "poseidon2/poseidon2.cuh"

 extern "C" cudaError_t babybear_extension_ntt_cuda(
  const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);

-extern "C" cudaError_t babybear_initialize_domain(
-  babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t babybear_poseidon2_create_cuda(
+  poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
+  unsigned int alpha,
+  unsigned int internal_rounds,
+  unsigned int external_rounds,
+  const babybear::scalar_t* round_constants,
+  const babybear::scalar_t* internal_matrix_diag,
+  poseidon2::MdsType mds_type,
+  poseidon2::DiffusionStrategy diffusion,
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t babybear_ntt_cuda(
-  const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::scalar_t* output);
+extern "C" cudaError_t babybear_poseidon2_load_cuda(
+  poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
+  poseidon2::MdsType mds_type,
+  poseidon2::DiffusionStrategy diffusion,
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx);
+extern "C" cudaError_t babybear_poseidon2_hash_many_cuda(
+  const poseidon2::Poseidon2<babybear::scalar_t>* poseidon,
+  const babybear::scalar_t* inputs,
+  babybear::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);

-extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
+extern "C" cudaError_t
+  babybear_poseidon2_delete_cuda(poseidon2::Poseidon2<babybear::scalar_t>* poseidon, device_context::DeviceContext& ctx);

-extern "C" cudaError_t babybear_scalar_convert_montgomery(
-  babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+extern "C" cudaError_t babybear_build_merkle_tree(
+  const babybear::scalar_t* leaves,
+  babybear::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
+  const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t babybear_extension_mul_cuda(
-  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
-
-extern "C" cudaError_t babybear_extension_add_cuda(
-  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
-
-extern "C" cudaError_t babybear_extension_accumulate_cuda(
-  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
-
-extern "C" cudaError_t babybear_extension_sub_cuda(
-  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
-
-extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
-  const babybear::extension_t* input,
-  uint32_t row_size,
-  uint32_t column_size,
-  babybear::extension_t* output,
-  device_context::DeviceContext& ctx,
-  bool on_device,
-  bool is_async);
-
-extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
-  const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);
-
-
-extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size);
-
-extern "C" cudaError_t babybear_extension_scalar_convert_montgomery(
-  babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+  extern "C" cudaError_t babybear_mmcs_commit_cuda(
+    const matrix::Matrix<babybear::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    babybear::scalar_t* digests,
+    const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* hasher,
+    const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t babybear_mul_cuda(
  babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
@@ -87,35 +96,47 @@ extern "C" cudaError_t babybear_bit_reverse_cuda(
  const babybear::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output);


-extern "C" cudaError_t babybear_create_poseidon2_constants_cuda(
-  int width,
-  int alpha,
-  int internal_rounds,
-  int external_rounds,
-  const babybear::scalar_t* round_constants,
-  const babybear::scalar_t* internal_matrix_diag,
-  poseidon2::MdsType mds_type,
-  poseidon2::DiffusionStrategy diffusion,
+extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
+
+extern "C" cudaError_t babybear_scalar_convert_montgomery(
+  babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t babybear_initialize_domain(
+  babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t babybear_ntt_cuda(
+  const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::scalar_t* output);
+
+extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx);
+
+extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size);
+
+extern "C" cudaError_t babybear_extension_scalar_convert_montgomery(
+  babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t babybear_extension_mul_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_add_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_accumulate_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
+extern "C" cudaError_t babybear_extension_sub_cuda(
+  babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
+
+extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
+  const babybear::extension_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  babybear::extension_t* output,
  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
+  bool on_device,
+  bool is_async);

-extern "C" cudaError_t babybear_init_poseidon2_constants_cuda(
-  int width,
-  poseidon2::MdsType mds_type,
-  poseidon2::DiffusionStrategy diffusion,
-  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
+extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
+  const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);

-extern "C" cudaError_t babybear_poseidon2_hash_cuda(
-  const babybear::scalar_t* input,
-  babybear::scalar_t* output,
-  int number_of_states,
-  int width,
-  const poseidon2::Poseidon2Constants<babybear::scalar_t>& constants,
-  poseidon2::Poseidon2Config& config);
-
-extern "C" cudaError_t babybear_release_poseidon2_constants_cuda(
-  poseidon2::Poseidon2Constants<babybear::scalar_t>* constants,
-  device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/bls12_377.h
+++ b/icicle/include/api/bls12_377.h
@@ -9,26 +9,13 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "curves/params/bls12_377.cuh"
 #include "ntt/ntt.cuh"
 #include "msm/msm.cuh"
 #include "vec_ops/vec_ops.cuh"
 #include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"
-
-extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2);
-
-extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out);
-
-extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size);
-
-extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size);
-
-extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery(
-  bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
-extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery(
-  bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

 extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
  bls12_377::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
 extern "C" cudaError_t bls12_377_msm_cuda(
  const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_377::projective_t* out);

+extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2);
+
+extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out);
+
+extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size);
+
+extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery(
+  bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery(
+  bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
 extern "C" cudaError_t bls12_377_ecntt_cuda(
  const bls12_377::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::projective_t* output);

@@ -65,18 +66,52 @@ extern "C" cudaError_t bls12_377_affine_convert_montgomery(
 extern "C" cudaError_t bls12_377_projective_convert_montgomery(
  bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_377_initialize_domain(
-  bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t bls12_377_build_merkle_tree(
+  const bls12_377::scalar_t* leaves,
+  bls12_377::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
+  const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bls12_377_ntt_cuda(
-  const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::scalar_t* output);
+  extern "C" cudaError_t bls12_377_mmcs_commit_cuda(
+    const matrix::Matrix<bls12_377::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    bls12_377::scalar_t* digests,
+    const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
+    const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx);
+extern "C" cudaError_t bls12_377_poseidon_create_cuda(
+  poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const bls12_377::scalar_t* round_constants,
+  const bls12_377::scalar_t* mds_matrix,
+  const bls12_377::scalar_t* non_sparse_matrix,
+  const bls12_377::scalar_t* sparse_matrices,
+  const bls12_377::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);

-extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
+extern "C" cudaError_t bls12_377_poseidon_load_cuda(
+  poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
-  bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+extern "C" cudaError_t bls12_377_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<bls12_377::scalar_t>* poseidon,
+  const bls12_377::scalar_t* inputs,
+  bls12_377::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);
+
+extern "C" cudaError_t
+  bls12_377_poseidon_delete_cuda(poseidon::Poseidon<bls12_377::scalar_t>* poseidon);

 extern "C" cudaError_t bls12_377_mul_cuda(
  bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda(
  const bls12_377::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output);


-extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const bls12_377::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<bls12_377::scalar_t>* poseidon_constants);
+extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);

-extern "C" cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_377::scalar_t>* constants);
+extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
+  bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_377_poseidon_hash_cuda(
-  bls12_377::scalar_t* input,
-  bls12_377::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
+extern "C" cudaError_t bls12_377_initialize_domain(
+  bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);

-extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree(
-  const bls12_377::scalar_t* leaves,
-  bls12_377::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t bls12_377_ntt_cuda(
+  const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::scalar_t* output);
+
+extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/bls12_381.h
+++ b/icicle/include/api/bls12_381.h
@@ -9,26 +9,13 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "curves/params/bls12_381.cuh"
 #include "ntt/ntt.cuh"
 #include "msm/msm.cuh"
 #include "vec_ops/vec_ops.cuh"
 #include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"
-
-extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2);
-
-extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out);
-
-extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size);
-
-extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size);
-
-extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery(
-  bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
-extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery(
-  bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

 extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
  bls12_381::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
 extern "C" cudaError_t bls12_381_msm_cuda(
  const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_381::projective_t* out);

+extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2);
+
+extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out);
+
+extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size);
+
+extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery(
+  bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery(
+  bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
 extern "C" cudaError_t bls12_381_ecntt_cuda(
  const bls12_381::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::projective_t* output);

@@ -65,18 +66,52 @@ extern "C" cudaError_t bls12_381_affine_convert_montgomery(
 extern "C" cudaError_t bls12_381_projective_convert_montgomery(
  bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_381_initialize_domain(
-  bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t bls12_381_build_merkle_tree(
+  const bls12_381::scalar_t* leaves,
+  bls12_381::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
+  const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bls12_381_ntt_cuda(
-  const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::scalar_t* output);
+  extern "C" cudaError_t bls12_381_mmcs_commit_cuda(
+    const matrix::Matrix<bls12_381::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    bls12_381::scalar_t* digests,
+    const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
+    const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx);
+extern "C" cudaError_t bls12_381_poseidon_create_cuda(
+  poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const bls12_381::scalar_t* round_constants,
+  const bls12_381::scalar_t* mds_matrix,
+  const bls12_381::scalar_t* non_sparse_matrix,
+  const bls12_381::scalar_t* sparse_matrices,
+  const bls12_381::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);

-extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
+extern "C" cudaError_t bls12_381_poseidon_load_cuda(
+  poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
-  bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+extern "C" cudaError_t bls12_381_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<bls12_381::scalar_t>* poseidon,
+  const bls12_381::scalar_t* inputs,
+  bls12_381::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);
+
+extern "C" cudaError_t
+  bls12_381_poseidon_delete_cuda(poseidon::Poseidon<bls12_381::scalar_t>* poseidon);

 extern "C" cudaError_t bls12_381_mul_cuda(
  bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda(
  const bls12_381::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output);


-extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const bls12_381::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<bls12_381::scalar_t>* poseidon_constants);
+extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);

-extern "C" cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_381::scalar_t>* constants);
+extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
+  bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bls12_381_poseidon_hash_cuda(
-  bls12_381::scalar_t* input,
-  bls12_381::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
+extern "C" cudaError_t bls12_381_initialize_domain(
+  bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);

-extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree(
-  const bls12_381::scalar_t* leaves,
-  bls12_381::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t bls12_381_ntt_cuda(
+  const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::scalar_t* output);
+
+extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/bn254.h
+++ b/icicle/include/api/bn254.h
@@ -9,28 +9,15 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "curves/params/bn254.cuh"
 #include "ntt/ntt.cuh"
 #include "msm/msm.cuh"
 #include "vec_ops/vec_ops.cuh"
 #include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"
 #include "poseidon2/poseidon2.cuh"

-extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2);
-
-extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out);
-
-extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size);
-
-extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size);
-
-extern "C" cudaError_t bn254_g2_affine_convert_montgomery(
-  bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
-extern "C" cudaError_t bn254_g2_projective_convert_montgomery(
-  bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
 extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
  bn254::g2_affine_t* bases,
  int msm_size,
@@ -49,6 +36,20 @@ extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
 extern "C" cudaError_t bn254_msm_cuda(
  const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, msm::MSMConfig& config, bn254::projective_t* out);

+extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2);
+
+extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out);
+
+extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size);
+
+extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bn254_g2_affine_convert_montgomery(
+  bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_g2_projective_convert_montgomery(
+  bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
 extern "C" cudaError_t bn254_ecntt_cuda(
  const bn254::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::projective_t* output);

@@ -66,18 +67,87 @@ extern "C" cudaError_t bn254_affine_convert_montgomery(
 extern "C" cudaError_t bn254_projective_convert_montgomery(
  bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bn254_initialize_domain(
-  bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t bn254_poseidon2_create_cuda(
+  poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
+  unsigned int alpha,
+  unsigned int internal_rounds,
+  unsigned int external_rounds,
+  const bn254::scalar_t* round_constants,
+  const bn254::scalar_t* internal_matrix_diag,
+  poseidon2::MdsType mds_type,
+  poseidon2::DiffusionStrategy diffusion,
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t bn254_ntt_cuda(
-  const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::scalar_t* output);
+extern "C" cudaError_t bn254_poseidon2_load_cuda(
+  poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
+  poseidon2::MdsType mds_type,
+  poseidon2::DiffusionStrategy diffusion,
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx);
+extern "C" cudaError_t bn254_poseidon2_hash_many_cuda(
+  const poseidon2::Poseidon2<bn254::scalar_t>* poseidon,
+  const bn254::scalar_t* inputs,
+  bn254::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);

-extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
+extern "C" cudaError_t
+  bn254_poseidon2_delete_cuda(poseidon2::Poseidon2<bn254::scalar_t>* poseidon, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bn254_scalar_convert_montgomery(
-  bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+extern "C" cudaError_t bn254_build_merkle_tree(
+  const bn254::scalar_t* leaves,
+  bn254::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
+  const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);
+
+  extern "C" cudaError_t bn254_mmcs_commit_cuda(
+    const matrix::Matrix<bn254::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    bn254::scalar_t* digests,
+    const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* hasher,
+    const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);
+
+extern "C" cudaError_t bn254_poseidon_create_cuda(
+  poseidon::Poseidon<bn254::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const bn254::scalar_t* round_constants,
+  const bn254::scalar_t* mds_matrix,
+  const bn254::scalar_t* non_sparse_matrix,
+  const bn254::scalar_t* sparse_matrices,
+  const bn254::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_poseidon_load_cuda(
+  poseidon::Poseidon<bn254::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bn254_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<bn254::scalar_t>* poseidon,
+  const bn254::scalar_t* inputs,
+  bn254::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);
+
+extern "C" cudaError_t
+  bn254_poseidon_delete_cuda(poseidon::Poseidon<bn254::scalar_t>* poseidon);

 extern "C" cudaError_t bn254_mul_cuda(
  bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -104,62 +174,17 @@ extern "C" cudaError_t bn254_bit_reverse_cuda(
  const bn254::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output);


-extern "C" cudaError_t bn254_create_poseidon2_constants_cuda(
-  int width,
-  int alpha,
-  int internal_rounds,
-  int external_rounds,
-  const bn254::scalar_t* round_constants,
-  const bn254::scalar_t* internal_matrix_diag,
-  poseidon2::MdsType mds_type,
-  poseidon2::DiffusionStrategy diffusion,
-  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
+extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);

-extern "C" cudaError_t bn254_init_poseidon2_constants_cuda(
-  int width,
-  poseidon2::MdsType mds_type,
-  poseidon2::DiffusionStrategy diffusion,
-  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
+extern "C" cudaError_t bn254_scalar_convert_montgomery(
+  bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bn254_poseidon2_hash_cuda(
-  const bn254::scalar_t* input,
-  bn254::scalar_t* output,
-  int number_of_states,
-  int width,
-  const poseidon2::Poseidon2Constants<bn254::scalar_t>& constants,
-  poseidon2::Poseidon2Config& config);
+extern "C" cudaError_t bn254_initialize_domain(
+  bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);

-extern "C" cudaError_t bn254_release_poseidon2_constants_cuda(
-  poseidon2::Poseidon2Constants<bn254::scalar_t>* constants,
-  device_context::DeviceContext& ctx);
+extern "C" cudaError_t bn254_ntt_cuda(
+  const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::scalar_t* output);

-extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const bn254::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<bn254::scalar_t>* poseidon_constants);
-
-extern "C" cudaError_t bn254_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bn254::scalar_t>* constants);
-
-extern "C" cudaError_t bn254_poseidon_hash_cuda(
-  bn254::scalar_t* input,
-  bn254::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<bn254::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
-
-extern "C" cudaError_t bn254_build_poseidon_merkle_tree(
-  const bn254::scalar_t* leaves,
-  bn254::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<bn254::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/bw6_761.h
+++ b/icicle/include/api/bw6_761.h
@@ -9,26 +9,13 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "curves/params/bw6_761.cuh"
 #include "ntt/ntt.cuh"
 #include "msm/msm.cuh"
 #include "vec_ops/vec_ops.cuh"
 #include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"
-
-extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2);
-
-extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out);
-
-extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size);
-
-extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size);
-
-extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery(
-  bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
-extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery(
-  bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

 extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
  bw6_761::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
 extern "C" cudaError_t bw6_761_msm_cuda(
  const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, msm::MSMConfig& config, bw6_761::projective_t* out);

+extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2);
+
+extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out);
+
+extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size);
+
+extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size);
+
+extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery(
+  bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery(
+  bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
 extern "C" cudaError_t bw6_761_ecntt_cuda(
  const bw6_761::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::projective_t* output);

@@ -65,18 +66,52 @@ extern "C" cudaError_t bw6_761_affine_convert_montgomery(
 extern "C" cudaError_t bw6_761_projective_convert_montgomery(
  bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bw6_761_initialize_domain(
-  bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t bw6_761_build_merkle_tree(
+  const bw6_761::scalar_t* leaves,
+  bw6_761::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
+  const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bw6_761_ntt_cuda(
-  const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::scalar_t* output);
+  extern "C" cudaError_t bw6_761_mmcs_commit_cuda(
+    const matrix::Matrix<bw6_761::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    bw6_761::scalar_t* digests,
+    const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
+    const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx);
+extern "C" cudaError_t bw6_761_poseidon_create_cuda(
+  poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const bw6_761::scalar_t* round_constants,
+  const bw6_761::scalar_t* mds_matrix,
+  const bw6_761::scalar_t* non_sparse_matrix,
+  const bw6_761::scalar_t* sparse_matrices,
+  const bw6_761::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);

-extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
+extern "C" cudaError_t bw6_761_poseidon_load_cuda(
+  poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);

-extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
-  bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+extern "C" cudaError_t bw6_761_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<bw6_761::scalar_t>* poseidon,
+  const bw6_761::scalar_t* inputs,
+  bw6_761::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);
+
+extern "C" cudaError_t
+  bw6_761_poseidon_delete_cuda(poseidon::Poseidon<bw6_761::scalar_t>* poseidon);

 extern "C" cudaError_t bw6_761_mul_cuda(
  bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda(
  const bw6_761::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output);


-extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const bw6_761::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<bw6_761::scalar_t>* poseidon_constants);
+extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);

-extern "C" cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bw6_761::scalar_t>* constants);
+extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
+  bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" cudaError_t bw6_761_poseidon_hash_cuda(
-  bw6_761::scalar_t* input,
-  bw6_761::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
+extern "C" cudaError_t bw6_761_initialize_domain(
+  bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);

-extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree(
-  const bw6_761::scalar_t* leaves,
-  bw6_761::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t bw6_761_ntt_cuda(
+  const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::scalar_t* output);
+
+extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/grumpkin.h
+++ b/icicle/include/api/grumpkin.h
@@ -9,11 +9,12 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "curves/params/grumpkin.cuh"
 #include "msm/msm.cuh"
 #include "vec_ops/vec_ops.cuh"
 #include "poseidon/poseidon.cuh"
-#include "poseidon/tree/merkle.cuh"

 extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
  grumpkin::affine_t* bases,
@@ -38,10 +39,52 @@ extern "C" cudaError_t grumpkin_affine_convert_montgomery(
 extern "C" cudaError_t grumpkin_projective_convert_montgomery(
  grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

-extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
+extern "C" cudaError_t grumpkin_build_merkle_tree(
+  const grumpkin::scalar_t* leaves,
+  grumpkin::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
+  const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
-  grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+  extern "C" cudaError_t grumpkin_mmcs_commit_cuda(
+    const matrix::Matrix<grumpkin::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    grumpkin::scalar_t* digests,
+    const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
+    const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);
+
+extern "C" cudaError_t grumpkin_poseidon_create_cuda(
+  poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const grumpkin::scalar_t* round_constants,
+  const grumpkin::scalar_t* mds_matrix,
+  const grumpkin::scalar_t* non_sparse_matrix,
+  const grumpkin::scalar_t* sparse_matrices,
+  const grumpkin::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t grumpkin_poseidon_load_cuda(
+  poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t grumpkin_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<grumpkin::scalar_t>* poseidon,
+  const grumpkin::scalar_t* inputs,
+  grumpkin::scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);
+
+extern "C" cudaError_t
+  grumpkin_poseidon_delete_cuda(poseidon::Poseidon<grumpkin::scalar_t>* poseidon);

 extern "C" cudaError_t grumpkin_mul_cuda(
  grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -68,31 +111,9 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda(
  const grumpkin::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output);


-extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const grumpkin::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<grumpkin::scalar_t>* poseidon_constants);
+extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);

-extern "C" cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<grumpkin::scalar_t>* constants);
-
-extern "C" cudaError_t grumpkin_poseidon_hash_cuda(
-  grumpkin::scalar_t* input,
-  grumpkin::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
-
-extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree(
-  const grumpkin::scalar_t* leaves,
-  grumpkin::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
+  grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);

 #endif
--- a/icicle/include/api/m31.h
+++ b/icicle/include/api/m31.h
@@ -9,43 +9,27 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "fields/stark_fields/m31.cuh"
 #include "vec_ops/vec_ops.cuh"

-extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
+extern "C" cudaError_t m31_build_merkle_tree(
+  const m31::scalar_t* leaves,
+  m31::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
+  const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t m31_scalar_convert_montgomery(
-  m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
-
-extern "C" cudaError_t m31_extension_mul_cuda(
-  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
-
-extern "C" cudaError_t m31_extension_add_cuda(
-  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
-
-extern "C" cudaError_t m31_extension_accumulate_cuda(
-  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
-
-extern "C" cudaError_t m31_extension_sub_cuda(
-  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
-
-extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
-  const m31::extension_t* input,
-  uint32_t row_size,
-  uint32_t column_size,
-  m31::extension_t* output,
-  device_context::DeviceContext& ctx,
-  bool on_device,
-  bool is_async);
-
-extern "C" cudaError_t m31_extension_bit_reverse_cuda(
-  const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
-
-
-extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
-
-extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
-  m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+  extern "C" cudaError_t m31_mmcs_commit_cuda(
+    const matrix::Matrix<m31::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    m31::scalar_t* digests,
+    const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* hasher,
+    const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t m31_mul_cuda(
  m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
@@ -72,4 +56,39 @@ extern "C" cudaError_t m31_bit_reverse_cuda(
  const m31::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::scalar_t* output);


+extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
+
+extern "C" cudaError_t m31_scalar_convert_montgomery(
+  m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
+
+extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
+  m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t m31_extension_mul_cuda(
+  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
+
+extern "C" cudaError_t m31_extension_add_cuda(
+  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
+
+extern "C" cudaError_t m31_extension_accumulate_cuda(
+  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
+
+extern "C" cudaError_t m31_extension_sub_cuda(
+  m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
+
+extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
+  const m31::extension_t* input,
+  uint32_t row_size,
+  uint32_t column_size,
+  m31::extension_t* output,
+  device_context::DeviceContext& ctx,
+  bool on_device,
+  bool is_async);
+
+extern "C" cudaError_t m31_extension_bit_reverse_cuda(
+  const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
+
+
 #endif
--- a/icicle/include/api/stark252.h
+++ b/icicle/include/api/stark252.h
@@ -9,22 +9,28 @@

 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
 #include "fields/stark_fields/stark252.cuh"
 #include "ntt/ntt.cuh"
 #include "vec_ops/vec_ops.cuh"

-extern "C" cudaError_t stark252_initialize_domain(
-  stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+extern "C" cudaError_t stark252_build_merkle_tree(
+  const stark252::scalar_t* leaves,
+  stark252::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len, 
+  const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
+  const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);

-extern "C" cudaError_t stark252_ntt_cuda(
-  const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
-
-extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
-
-extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
-
-extern "C" cudaError_t stark252_scalar_convert_montgomery(
-  stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+  extern "C" cudaError_t stark252_mmcs_commit_cuda(
+    const matrix::Matrix<stark252::scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    stark252::scalar_t* digests,
+    const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* hasher,
+    const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
+    const merkle_tree::TreeBuilderConfig& tree_config);

 extern "C" cudaError_t stark252_mul_cuda(
  stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
@@ -51,4 +57,17 @@ extern "C" cudaError_t stark252_bit_reverse_cuda(
  const stark252::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output);


+extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
+
+extern "C" cudaError_t stark252_scalar_convert_montgomery(
+  stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
+
+extern "C" cudaError_t stark252_initialize_domain(
+  stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
+
+extern "C" cudaError_t stark252_ntt_cuda(
+  const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
+
+extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
+
 #endif
--- a/icicle/include/api/templates/fields/poseidon.h
+++ b/icicle/include/api/templates/fields/poseidon.h
@@ -1,26 +1,29 @@
-extern "C" cudaError_t ${FIELD}_create_optimized_poseidon_constants_cuda(
-  int arity,
-  int full_rounds_half,
-  int partial_rounds,
-  const ${FIELD}::scalar_t* constants,
-  device_context::DeviceContext& ctx,
-  poseidon::PoseidonConstants<${FIELD}::scalar_t>* poseidon_constants);
+extern "C" cudaError_t ${FIELD}_poseidon_create_cuda(
+  poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const ${FIELD}::scalar_t* round_constants,
+  const ${FIELD}::scalar_t* mds_matrix,
+  const ${FIELD}::scalar_t* non_sparse_matrix,
+  const ${FIELD}::scalar_t* sparse_matrices,
+  const ${FIELD}::scalar_t domain_tag,
+  device_context::DeviceContext& ctx);

-extern "C" cudaError_t ${FIELD}_init_optimized_poseidon_constants_cuda(
-  int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<${FIELD}::scalar_t>* constants);
+extern "C" cudaError_t ${FIELD}_poseidon_load_cuda(
+  poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
+  unsigned int arity,
+  device_context::DeviceContext& ctx);

-extern "C" cudaError_t ${FIELD}_poseidon_hash_cuda(
-  ${FIELD}::scalar_t* input,
+extern "C" cudaError_t ${FIELD}_poseidon_hash_many_cuda(
+  const poseidon::Poseidon<${FIELD}::scalar_t>* poseidon,
+  const ${FIELD}::scalar_t* inputs,
  ${FIELD}::scalar_t* output,
-  int number_of_states,
-  int arity,
-  const poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
-  poseidon::PoseidonConfig& config);
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);

-extern "C" cudaError_t ${FIELD}_build_poseidon_merkle_tree(
-  const ${FIELD}::scalar_t* leaves,
-  ${FIELD}::scalar_t* digests,
-  uint32_t height,
-  int arity,
-  poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
-  merkle::TreeBuilderConfig& config);
+extern "C" cudaError_t
+  ${FIELD}_poseidon_delete_cuda(poseidon::Poseidon<${FIELD}::scalar_t>* poseidon);
--- a/icicle/include/api/templates/fields/poseidon2.h
+++ b/icicle/include/api/templates/fields/poseidon2.h
@@ -1,30 +1,34 @@
-extern "C" cudaError_t ${FIELD}_create_poseidon2_constants_cuda(
-  int width,
-  int alpha,
-  int internal_rounds,
-  int external_rounds,
+extern "C" cudaError_t ${FIELD}_poseidon2_create_cuda(
+  poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
+  unsigned int alpha,
+  unsigned int internal_rounds,
+  unsigned int external_rounds,
  const ${FIELD}::scalar_t* round_constants,
  const ${FIELD}::scalar_t* internal_matrix_diag,
  poseidon2::MdsType mds_type,
  poseidon2::DiffusionStrategy diffusion,
-  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t ${FIELD}_init_poseidon2_constants_cuda(
-  int width,
+extern "C" cudaError_t ${FIELD}_poseidon2_load_cuda(
+  poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
+  unsigned int width,
+  unsigned int rate,
  poseidon2::MdsType mds_type,
  poseidon2::DiffusionStrategy diffusion,
-  device_context::DeviceContext& ctx,
-  poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
+  device_context::DeviceContext& ctx
+);

-extern "C" cudaError_t ${FIELD}_poseidon2_hash_cuda(
-  const ${FIELD}::scalar_t* input,
+extern "C" cudaError_t ${FIELD}_poseidon2_hash_many_cuda(
+  const poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon,
+  const ${FIELD}::scalar_t* inputs,
  ${FIELD}::scalar_t* output,
-  int number_of_states,
-  int width,
-  const poseidon2::Poseidon2Constants<${FIELD}::scalar_t>& constants,
-  poseidon2::Poseidon2Config& config);
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  hash::SpongeConfig& cfg);

-extern "C" cudaError_t ${FIELD}_release_poseidon2_constants_cuda(
-  poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* constants,
-  device_context::DeviceContext& ctx);
+extern "C" cudaError_t
+  ${FIELD}_poseidon2_delete_cuda(poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon, device_context::DeviceContext& ctx);
--- a/icicle/include/api/templates/fields/tree.h
+++ b/icicle/include/api/templates/fields/tree.h
@@ -0,0 +1,16 @@
+// C binding of the Merkle tree builder for ${FIELD} scalars.
+// `compression` hashes the inner layers and `bottom_layer` hashes the leaves layer.
+// NOTE(review): presumably forwards to merkle_tree::build_merkle_tree - confirm against the definition.
+extern "C" cudaError_t ${FIELD}_build_merkle_tree(
+  const ${FIELD}::scalar_t* leaves,
+  ${FIELD}::scalar_t* digests,
+  unsigned int height,
+  unsigned int input_block_len,
+  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
+  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
+  const merkle_tree::TreeBuilderConfig& tree_config);
+
+// C binding of the matrix commitment (MMCS) for ${FIELD} scalars.
+// `leaves` points to `number_of_inputs` matrix descriptors.
+// NOTE(review): presumably forwards to merkle_tree::mmcs_commit - confirm against the definition.
+extern "C" cudaError_t ${FIELD}_mmcs_commit_cuda(
+  const matrix::Matrix<${FIELD}::scalar_t>* leaves,
+  unsigned int number_of_inputs,
+  ${FIELD}::scalar_t* digests,
+  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
+  const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
+  const merkle_tree::TreeBuilderConfig& tree_config);
--- a/icicle/include/fields/field.cuh
+++ b/icicle/include/fields/field.cuh
@@ -796,6 +796,14 @@ public:
    return r;
  }

+  /// Copy-assigns `other` into this element one limb at a time and returns `*this`.
+  HOST_DEVICE_INLINE Field& operator=(Field const& other)
+  {
+    for (unsigned limb = 0; limb < TLC; ++limb)
+      this->limbs_storage.limbs[limb] = other.limbs_storage.limbs[limb];
+    return *this;
+  }
+
  friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
  {
    Wide xy = mul_wide(xs, ys); // full mult
--- a/icicle/include/fields/stark_fields/m31.cuh
+++ b/icicle/include/fields/stark_fields/m31.cuh
@@ -14,7 +14,7 @@ namespace m31 {
    HOST_DEVICE_INLINE MersenneField(storage<CONFIG::limbs_count> x) : Field<CONFIG>{x} {}
    HOST_DEVICE_INLINE MersenneField(const Field<CONFIG>& other) : Field<CONFIG>(other) {}

-    static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero.limbs[0]); }
+    static constexpr HOST_DEVICE_INLINE MersenneField zero()
+    {
+      // Built from the whole CONFIG::zero value rather than from its first limb only.
+      return MersenneField(CONFIG::zero);
+    }

    static constexpr HOST_DEVICE_INLINE MersenneField one() { return MersenneField(CONFIG::one.limbs[0]); }

--- a/icicle/include/gpu-utils/device_context.cuh
+++ b/icicle/include/gpu-utils/device_context.cuh
@@ -3,6 +3,7 @@
 #define DEVICE_CONTEXT_H

 #include <cuda_runtime.h>
+#include "gpu-utils/error_handler.cuh"

 namespace device_context {

@@ -30,6 +31,28 @@ namespace device_context {
    };
  }

-} // namespace device_context
+  // Tells whether `p` is a host pointer. A device pointer is only accepted when it belongs to `device_id`;
+  // any other device pointer raises an InvalidArgument ICICLE error.
+  static bool is_host_ptr(const void* p, int device_id = 0)
+  {
+    cudaPointerAttributes attributes;
+    CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
+
+    // unregistered is host memory
+    if (attributes.type == cudaMemoryTypeHost || attributes.type == cudaMemoryTypeUnregistered) { return true; }
+
+    // From here on the pointer is not a host pointer, so it has to live on the requested device.
+    if (attributes.device != device_id) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid ptr"); }
+    return false;
+  }
+
+  // Returns the id of the device that owns `p`, or -1 when `p` is a host pointer.
+  static int get_cuda_device(const void* p)
+  {
+    cudaPointerAttributes attributes;
+    CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
+    switch (attributes.type) {
+    case cudaMemoryTypeHost:
+    case cudaMemoryTypeUnregistered: // unregistered is host memory
+      return -1;
+    default:
+      return attributes.device;
+    }
+  }
+
+} // namespace device_context
 #endif
--- a/icicle/include/hash/hash.cuh
+++ b/icicle/include/hash/hash.cuh
@@ -0,0 +1,176 @@
+#pragma once
+#ifndef HASH_H
+#define HASH_H
+
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "matrix/matrix.cuh"
+#include <cassert>
+
+using matrix::Matrix;
+
+/**
+ * @namespace hash
+ * Includes classes and methods for describing hash functions.
+ */
+namespace hash {
+
+  /**
+   * @struct SpongeConfig
+   * Encodes sponge hash operations parameters.
+   */
+  struct SpongeConfig {
+    /// Details related to the device such as its id and stream id.
+    device_context::DeviceContext ctx;
+
+    /// True if inputs are on device and false if they're on host. Default value: false.
+    bool are_inputs_on_device;
+
+    /// True if outputs are on device and false if they're on host. Default value: false.
+    bool are_outputs_on_device;
+
+    /// Whether to run the hash operations asynchronously. If set to `true`, the functions will be
+    /// non-blocking and you'd need to synchronize it explicitly by running
+    /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false,
+    /// functions will block the current CPU thread.
+    bool is_async;
+  };
+
+  /**
+   * Builds the default [SpongeConfig](@ref SpongeConfig) used with the [SpongeHasher](@ref SpongeHasher) class:
+   * host inputs, host outputs, blocking execution.
+   * @param ctx device context to store in the config; the default device context when omitted.
+   * @return Default value of [SpongeConfig](@ref SpongeConfig).
+   */
+  static SpongeConfig
+  default_sponge_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  {
+    return SpongeConfig{
+      ctx,
+      /*are_inputs_on_device=*/false,
+      /*are_outputs_on_device=*/false,
+      /*is_async=*/false,
+    };
+  }
+
+  /**
+   * @class SpongeHasher
+   *
+   * Can be inherited by a cryptographic permutation function to create a
+   * [sponge](https://en.wikipedia.org/wiki/Sponge_function) construction out of it.
+   *
+   * Derived classes supply the kernels by overriding `run_hash_many_kernel` (and, when supported, `hash_2d` and
+   * `compress_and_inject`); the default implementations of these hooks raise an InvalidArgument ICICLE error.
+   * Staging of host inputs/outputs through temporary device buffers is handled here, in `hash_many`.
+   *
+   * @tparam PreImage type of inputs elements
+   * @tparam Image type of state elements. Also used to describe the type of hash output
+   */
+  template <typename PreImage, typename Image>
+  class SpongeHasher
+  {
+  public:
+    /// @brief the width of permutation state
+    const unsigned int width;
+
+    /// @brief how many elements a state can fit per 1 permutation. Used with domain separation.
+    const unsigned int preimage_max_length;
+
+    /// @brief portion of the state to absorb input into, or squeeze output from
+    const unsigned int rate;
+
+    /// @brief start squeezing from this offset. Used with domain separation.
+    const unsigned int offset;
+
+    SpongeHasher(unsigned int width, unsigned int preimage_max_length, unsigned int rate, unsigned int offset)
+        : width(width), preimage_max_length(preimage_max_length), rate(rate), offset(offset)
+    {
+      assert(
+        rate * sizeof(PreImage) <= preimage_max_length * sizeof(Image) &&
+        "Input rate can not be bigger than preimage max length");
+    }
+
+    /// Hashers are handed around through base-class pointers and references, so destruction must be virtual.
+    virtual ~SpongeHasher() = default;
+
+    /// @brief Hook for hashing 2D inputs (matrices). The default implementation only raises an error.
+    virtual cudaError_t hash_2d(
+      const Matrix<PreImage>* inputs,
+      Image* states,
+      unsigned int number_of_inputs,
+      unsigned int output_len,
+      uint64_t number_of_rows,
+      const device_context::DeviceContext& ctx) const
+    {
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Absorb 2d is not implemented for this hash");
+      return cudaError_t::cudaSuccess;
+    }
+
+    /// @brief Hook for compressing `prev_layer` into `next_layer` while injecting extra matrices.
+    /// The default implementation only raises an error.
+    virtual cudaError_t compress_and_inject(
+      const Matrix<PreImage>* matrices_to_inject,
+      unsigned int number_of_inputs,
+      uint64_t number_of_rows,
+      const Image* prev_layer,
+      Image* next_layer,
+      unsigned int digest_elements,
+      const device_context::DeviceContext& ctx) const
+    {
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Compress and inject is not implemented for this hash");
+      return cudaError_t::cudaSuccess;
+    }
+
+    /// @brief Permute aligned input and do squeeze
+    /// Equivalent to `hash_many` with an input length of `width` elements per state.
+    /// @param input pointer to input; host/device placement is regulated by `cfg`, as in `hash_many`
+    /// @param out pointer to output; host/device placement is regulated by `cfg`, as in `hash_many`
+    cudaError_t compress_many(
+      const Image* input,
+      Image* out,
+      unsigned int number_of_states,
+      unsigned int output_len,
+      const SpongeConfig& cfg) const
+    {
+      return hash_many(reinterpret_cast<const PreImage*>(input), out, number_of_states, width, output_len, cfg);
+    }
+
+    /// @brief Hook that launches the actual hashing kernels on device pointers.
+    /// The default implementation only raises an error.
+    virtual cudaError_t run_hash_many_kernel(
+      const PreImage* input,
+      Image* output,
+      unsigned int number_of_states,
+      unsigned int input_len,
+      unsigned int output_len,
+      const device_context::DeviceContext& ctx) const
+    {
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Hash many kernel is not implemented for this hash");
+      return cudaError_t::cudaSuccess;
+    }
+
+    /**
+     * Hashes `number_of_states` inputs of `input_len` elements each into `output_len` elements each.
+     * Host-side inputs/outputs (see `cfg`) are staged through temporary device buffers that are allocated and
+     * released on `cfg.ctx.stream`; these buffers are released even when an intermediate step fails.
+     *
+     * @param input `number_of_states * input_len` elements, on host or device according to `cfg`
+     * @param output room for `number_of_states * output_len` elements, on host or device according to `cfg`
+     * @param cfg stream, data placement and sync/async behaviour
+     * @return the first CUDA error encountered, otherwise the result of `CHK_LAST()`
+     */
+    cudaError_t hash_many(
+      const PreImage* input,
+      Image* output,
+      unsigned int number_of_states,
+      unsigned int input_len,
+      unsigned int output_len,
+      const SpongeConfig& cfg) const
+    {
+      // Widen before multiplying: the product of two 32-bit counts can exceed 32 bits.
+      const size_t input_bytes = static_cast<size_t>(number_of_states) * input_len * sizeof(PreImage);
+      const size_t output_bytes = static_cast<size_t>(number_of_states) * output_len * sizeof(Image);
+
+      const PreImage* d_input = input;
+      Image* d_output = output;
+      PreImage* d_alloc_input = nullptr; // non-null only when this call owns a staging buffer for the inputs
+      Image* d_alloc_output = nullptr;   // non-null only when this call owns a staging buffer for the outputs
+
+      // Only the first failure is remembered, so that the cleanup below always runs.
+      cudaError_t status = cudaSuccess;
+
+      if (!cfg.are_inputs_on_device) {
+        status = cudaMallocAsync(&d_alloc_input, input_bytes, cfg.ctx.stream);
+        if (status != cudaSuccess) {
+          d_alloc_input = nullptr;
+        } else {
+          status = cudaMemcpyAsync(d_alloc_input, input, input_bytes, cudaMemcpyHostToDevice, cfg.ctx.stream);
+        }
+        d_input = d_alloc_input;
+      }
+
+      if (status == cudaSuccess && !cfg.are_outputs_on_device) {
+        status = cudaMallocAsync(&d_alloc_output, output_bytes, cfg.ctx.stream);
+        if (status != cudaSuccess) { d_alloc_output = nullptr; }
+        d_output = d_alloc_output;
+      }
+
+      if (status == cudaSuccess) {
+        status = run_hash_many_kernel(d_input, d_output, number_of_states, input_len, output_len, cfg.ctx);
+      }
+
+      if (status == cudaSuccess && d_alloc_output != nullptr) {
+        status = cudaMemcpyAsync(output, d_alloc_output, output_bytes, cudaMemcpyDeviceToHost, cfg.ctx.stream);
+      }
+
+      // Stream-ordered release of whatever was allocated above, on success and on failure alike.
+      if (d_alloc_input != nullptr) {
+        const cudaError_t free_status = cudaFreeAsync(d_alloc_input, cfg.ctx.stream);
+        if (status == cudaSuccess) { status = free_status; }
+      }
+      if (d_alloc_output != nullptr) {
+        const cudaError_t free_status = cudaFreeAsync(d_alloc_output, cfg.ctx.stream);
+        if (status == cudaSuccess) { status = free_status; }
+      }
+
+      CHK_IF_RETURN(status);
+
+      if (!cfg.is_async) CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream));
+
+      return CHK_LAST();
+    }
+  };
+} // namespace hash
+
+#endif
--- a/icicle/include/hash/keccak/keccak.cuh
+++ b/icicle/include/hash/keccak/keccak.cuh
@@ -6,6 +6,10 @@
 #include "gpu-utils/device_context.cuh"
 #include "gpu-utils/error_handler.cuh"

+#include "hash/hash.cuh"
+
+using namespace hash;
+
 namespace keccak {
  /**
   * @struct KeccakConfig
@@ -32,25 +36,6 @@ namespace keccak {
    };
    return config;
  }
-
-  /**
-   * Compute the keccak hash over a sequence of preimages.
-   * Takes {number_of_blocks * input_block_size} u64s of input and computes {number_of_blocks} outputs, each of size {D
-   * / 64} u64
-   * @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
-   * @tparam D - number of bits of output. Only multiples of 64 are supported.
-   * @param input a pointer to the input data. May be allocated on device or on host, regulated
-   * by the config. Must be of size [input_block_size](@ref input_block_size) * [number_of_blocks](@ref
-   * number_of_blocks)}.
-   * @param input_block_size - size of each input block in bytes. Should be divisible by 8.
-   * @param number_of_blocks number of input and output blocks. One GPU thread processes one block
-   * @param output a pointer to the output data. May be allocated on device or on host, regulated
-   * by the config. Must be of size [output_block_size](@ref output_block_size) * [number_of_blocks](@ref
-   * number_of_blocks)}
-   */
-  template <int C, int D>
-  cudaError_t
-  keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config);
 } // namespace keccak

 #endif
--- a/icicle/include/matrix/matrix.cuh
+++ b/icicle/include/matrix/matrix.cuh
@@ -0,0 +1,14 @@
+#pragma once
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include <cstddef> // size_t
+
+namespace matrix {
+  /**
+   * @struct Matrix
+   * Plain descriptor of a 2D array: a raw pointer to the elements together with its dimensions.
+   * The struct has no constructor or destructor, so it does not manage the memory behind `values`.
+   * NOTE(review): element layout and host/device placement of `values` are not fixed here - confirm with callers.
+   * @tparam T type of the matrix elements
+   */
+  template <typename T>
+  struct Matrix {
+    T* values;     ///< pointer to the matrix elements
+    size_t width;  ///< width of the matrix
+    size_t height; ///< height of the matrix
+  };
+} // namespace matrix
+
+#endif
--- a/icicle/include/merkle-tree/merkle.cuh
+++ b/icicle/include/merkle-tree/merkle.cuh
@@ -0,0 +1,128 @@
+#pragma once
+#ifndef MERKLE_H
+#define MERKLE_H
+
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "utils/utils.h"
+#include "hash/hash.cuh"
+#include "matrix/matrix.cuh"
+
+#include <vector>
+#include <numeric>
+#include <iostream>
+#include <math.h>
+
+using namespace hash;
+using matrix::Matrix;
+
+/**
+ * @namespace merkle_tree
+ * Implementation of the [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree) builder,
+ * parallelized for the use on GPU
+ */
+namespace merkle_tree {
+  static constexpr size_t GIGA = 1024 * 1024 * 1024;
+
+  /// Bytes per stream
+  static constexpr uint64_t STREAM_CHUNK_SIZE = GIGA;
+
+  /// Flattens the tree digests and sums them up to get the number of digest
+  /// elements needed to contain all the rows of the tree:
+  /// `digest_elements * (arity^0 + arity^1 + ... + arity^height)`.
+  /// @param height the height of the tree; a tree of height `h` has `h + 1` rows
+  /// @param arity factor by which every next row is larger than the previous one
+  /// @param digest_elements number of elements in a single digest
+  /// @return total number of elements over all rows (an element count, not a byte count)
+  static size_t get_digests_len(uint32_t height, uint32_t arity, uint32_t digest_elements)
+  {
+    // Row 0 always exists; each of the `height` following rows is `arity` times larger.
+    // Counting with an unsigned `level < height` avoids the signed/unsigned comparison of `int i <= height`.
+    size_t row_length = digest_elements;
+    size_t digests_len = row_length;
+    for (uint32_t level = 0; level < height; ++level) {
+      row_length *= arity;
+      digests_len += row_length;
+    }
+
+    return digests_len;
+  }
+
+  /// Exchanges the two pointers that `r` and `s` point to; the pointed-to data is left untouched.
+  template <typename T>
+  void swap(T** r, T** s)
+  {
+    T* const held = *s;
+    *s = *r;
+    *r = held;
+  }
+
+  /// Returns how many times `number_of_elements` can be halved before it reaches zero,
+  /// i.e. floor(log2(number_of_elements)); both 0 and 1 give 0.
+  static unsigned int get_height(uint64_t number_of_elements)
+  {
+    unsigned int height = 0;
+    for (uint64_t remaining = number_of_elements >> 1; remaining != 0; remaining >>= 1) {
+      ++height;
+    }
+    return height;
+  }
+
+  /**
+   * @struct TreeBuilderConfig
+   * Struct that encodes various Tree builder parameters.
+   */
+  struct TreeBuilderConfig {
+    /// Details related to the device such as its id and stream id.
+    device_context::DeviceContext ctx;
+
+    /// Arity of the tree. `default_merkle_config` sets it to 2.
+    unsigned int arity;
+
+    /// How many rows of the Merkle tree rows should be written to output. '0' means all of them.
+    unsigned int keep_rows;
+
+    /// The size of output for each bottom layer hash and compression.
+    /// Will also be equal to the size of the root of the tree. Default value 1.
+    unsigned int digest_elements;
+
+    /// True if inputs are on device and false if they're on host. Default value: false.
+    bool are_inputs_on_device;
+
+    /// True if outputs are on device and false if they're on host. Default value: false.
+    bool are_outputs_on_device;
+
+    /// Whether to run the tree builder asynchronously. If set to `true`, the build_merkle_tree
+    /// function will be non-blocking and you'd need to synchronize it explicitly by running
+    /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
+    /// function will block the current CPU thread.
+    bool is_async;
+  };
+
+  /// Builds the default TreeBuilderConfig: binary tree, all rows kept, one element per digest,
+  /// host inputs, host outputs, blocking execution.
+  /// @param ctx device context to store in the config; the default device context when omitted.
+  static TreeBuilderConfig
+  default_merkle_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  {
+    return TreeBuilderConfig{
+      ctx,
+      /*arity=*/2,
+      /*keep_rows=*/0,
+      /*digest_elements=*/1,
+      /*are_inputs_on_device=*/false,
+      /*are_outputs_on_device=*/false,
+      /*is_async=*/false,
+    };
+  }
+
+  /**
+   * Builds the Merkle tree
+   * NOTE(review): `height` and `input_block_len` are documented below but are not parameters of this declaration.
+   * @param inputs a pointer to the leaves layer. May be allocated on device or on host, regulated by the config
+   * Expected to have arity ^ (height) * input_block_len elements
+   * @param digests a pointer to the digests storage. May only be allocated on the host
+   * Expected to have `sum(digest_elements * (arity ^ (i))) for i in [0..keep_rows]`
+   * @param height the height of the merkle tree
+   * @param input_block_len the size of input vectors at the bottom layer of the tree
+   * # Algorithm
+   * The function will split large tree into many subtrees of size that will fit `STREAM_CHUNK_SIZE`.
+   * Each subtree is built in its own stream (there is a maximum number of streams)
+   * After all subtrees are constructed - the function will combine the resulting sub-digests into the final top-tree
+   */
+  template <typename Leaf, typename Digest>
+  cudaError_t build_merkle_tree(
+    const Leaf* inputs,
+    Digest* digests,
+    const SpongeHasher<Leaf, Digest>& compression,
+    const SpongeHasher<Leaf, Digest>& bottom_layer,
+    const TreeBuilderConfig& config);
+
+  template <typename Leaf, typename Digest>
+  cudaError_t mmcs_commit(
+    const Matrix<Leaf>* inputs,
+    const unsigned int number_of_inputs,
+    Digest* digests,
+    const SpongeHasher<Leaf, Digest>& hasher,
+    const SpongeHasher<Leaf, Digest>& compression,
+    const TreeBuilderConfig& tree_config);
+} // namespace merkle_tree
+
+#endif
--- a/icicle/include/poseidon/constants.cuh
+++ b/icicle/include/poseidon/constants.cuh
@@ -0,0 +1,114 @@
+#pragma once
+#ifndef POSEIDON_CONSTANTS_H
+#define POSEIDON_CONSTANTS_H
+
+#include <cstdint>
+
+namespace poseidon {
+#define FIRST_FULL_ROUNDS  true
+#define SECOND_FULL_ROUNDS false
+
+  /**
+   * For most of the Poseidon configurations this is the case
+   * TODO: Add support for different full rounds numbers
+   */
+  const int FULL_ROUNDS_DEFAULT = 4;
+
+  /**
+   * @struct PoseidonConstants
+   * These constants are enough to define a Poseidon instance
+   * @param round_constants A pointer to round constants allocated on the device
+   * @param mds_matrix A pointer to an mds matrix allocated on the device
+   * @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
+   * @param sparse_matrices A pointer to sparse matrices allocated on the device
+   */
+  template <typename S>
+  struct PoseidonConstants {
+    // Scalar parameters of the instance.
+    unsigned int arity;
+    unsigned int alpha;
+    unsigned int partial_rounds;
+    unsigned int full_rounds_half;
+
+    // Constant tables; copying the struct copies these pointers, not the data behind them.
+    S* round_constants = nullptr;
+    S* mds_matrix = nullptr;
+    S* non_sparse_matrix = nullptr;
+    S* sparse_matrices = nullptr;
+
+    S domain_tag = S::zero();
+
+    PoseidonConstants() = default;
+    PoseidonConstants(const PoseidonConstants& other) = default;
+
+    /// Member-wise copy assignment (pointers are copied shallowly).
+    PoseidonConstants<S>& operator=(PoseidonConstants<S> const& other)
+    {
+      arity = other.arity;
+      alpha = other.alpha;
+      partial_rounds = other.partial_rounds;
+      full_rounds_half = other.full_rounds_half;
+
+      round_constants = other.round_constants;
+      mds_matrix = other.mds_matrix;
+      non_sparse_matrix = other.non_sparse_matrix;
+      sparse_matrices = other.sparse_matrices;
+
+      domain_tag = other.domain_tag;
+      return *this;
+    }
+  };
+
+  /**
+   * @class PoseidonKernelsConfiguration
+   * Describes the logic of deriving CUDA kernels parameters
+   * such as the number of threads and the number of blocks
+   */
+  class PoseidonKernelsConfiguration
+  {
+  public:
+    // The logic behind this is that 1 thread only works on 1 element
+    // We have {width} elements in each state, and {number_of_states} states total.
+    // The result is the largest multiple of {width} that does not exceed 256.
+    static int number_of_threads(unsigned int width) { return 256 / width * width; }
+
+    // The partial rounds operates on the whole state, so we define
+    // the parallelism params for processing a single hash preimage per thread
+    static const int singlehash_block_size = 128;
+
+    // How many whole states fit into one block of number_of_threads(width) threads.
+    static int hashes_per_block(unsigned int width) { return number_of_threads(width) / width; }
+
+    // Number of blocks of number_of_threads(width) threads needed to give every element of every state
+    // its own thread (rounded up).
+    static int number_of_full_blocks(unsigned int width, size_t number_of_states)
+    {
+      // Keep the thread count in size_t: number_of_states * width may not fit into an int,
+      // and narrowing it before the division would produce a wrong block count.
+      const size_t threads_per_block = number_of_threads(width);
+      const size_t total_number_of_threads = number_of_states * width;
+      const size_t blocks =
+        total_number_of_threads / threads_per_block + (total_number_of_threads % threads_per_block != 0 ? 1 : 0);
+      return static_cast<int>(blocks);
+    }
+
+    // Number of blocks of singlehash_block_size threads needed for one thread per state (rounded up).
+    static int number_of_singlehash_blocks(size_t number_of_states)
+    {
+      return number_of_states / singlehash_block_size + static_cast<bool>(number_of_states % singlehash_block_size);
+    }
+  };
+
+  using PKC = PoseidonKernelsConfiguration;
+
+  template <typename S>
+  cudaError_t create_optimized_poseidon_constants(
+    unsigned int arity,
+    unsigned int alpha,
+    unsigned int partial_rounds,
+    unsigned int full_rounds_half,
+    const S* round_constants,
+    const S* mds_matrix,
+    const S* non_sparse_matrix,
+    const S* sparse_matrices,
+    const S domain_tag,
+    PoseidonConstants<S>* poseidon_constants,
+    device_context::DeviceContext& ctx);
+
+  /**
+   * Loads pre-calculated optimized constants, moves them to the device
+   */
+  template <typename S>
+  cudaError_t
+  init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
+
+  template <typename S>
+  cudaError_t release_optimized_poseidon_constants(PoseidonConstants<S>* constants, device_context::DeviceContext& ctx);
+} // namespace poseidon
+
+#endif
--- a/icicle/include/poseidon/constants/generate_parameters.py
+++ b/icicle/include/poseidon/constants/generate_parameters.py
@@ -8,17 +8,18 @@ import numpy as np
 from poseidon import round_constants as rc, round_numbers as rn

 # Modify these
-arity = 11
-p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47 # grumpkin
+arity = 2
+p = 2 ** 31 - 1 # m31
+# p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47 # grumpkin
 # p = 0x73EDA753299D7D483339D80809A1D80553BDA402FFFE5BFEFFFFFFFF00000001 # bls12-381
 # p = 0x12ab655e9a2ca55660b44d1e5c37b00159aa76fed00000010a11800000000001 # bls12-377
 # p = 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001 # bn254
 # p = 0x1ae3a4617c510eac63b05c06ca1493b1a22d9f300f5138f1ef3622fba094800170b5d44300000008508c00000000001 # bw6-761
-prime_bit_len = 255
-field_bytes = 32
+prime_bit_len = 31
+field_bytes = 4

 # leave set to -1 if not sure
-full_round = -1
+full_round = 8
 half_full_round = full_round // 2
 # leave set to -1 if not sure
 partial_round = -1
@@ -31,12 +32,12 @@ security_level = 128
 # F = GF(p)
 # F.primitive_element()
 #
-# primitive_element = None
+primitive_element = None
 # primitive_element = 7 # bls12-381
 # primitive_element = 22 # bls12-377
 # primitive_element = 5 # bn254
 # primitive_element = 15 # bw6-761
-primitive_element = 3 # grumpkin
+# primitive_element = 3 # grumpkin

 # currently we only support alpha 5, if you need alpha other than 5 - feal free to reach out
 alpha = 5
--- a/icicle/include/poseidon/constants/m31_poseidon.h
+++ b/icicle/include/poseidon/constants/m31_poseidon.h
@@ -0,0 +1,508 @@
+#pragma once
+#ifndef M31_POSEIDON_H
+#define M31_POSEIDON_H
+
+namespace poseidon_constants_m31 {
+  /**
+   * This namespace contains optimized constants for running Poseidon.
+   * These constants were generated using an algorithm defined at
+   * https://spec.filecoin.io/algorithms/crypto/poseidon/
+   * The number in each name corresponds to the arity of the hash function.
+   * Each array contains:
+   * RoundConstants | MDSMatrix | Non-sparse matrix | Sparse matrices
+   */
+
+  // Number of partial rounds per supported arity (the suffix is the arity).
+  // Declared const so every translation unit that includes this header gets
+  // its own internal-linkage copy instead of a duplicate external definition.
+  const int partial_rounds_2 = 7;
+  const int partial_rounds_4 = 11;
+  const int partial_rounds_8 = 12;
+  const int partial_rounds_11 = 12;
+
+    unsigned char poseidon_constants_2[] = {
+  0x33, 0x8b, 0x6d, 0x47, 0xbb, 0x97, 0x11, 0x67, 0x92, 0x9d, 0x55, 0x2d,
+  0xee, 0x1e, 0x2e, 0x45, 0xfe, 0x35, 0x0e, 0x25, 0x7e, 0xc3, 0x4f, 0x70,
+  0x4d, 0x0a, 0x8c, 0x18, 0xd9, 0x43, 0xa4, 0x61, 0xfb, 0x14, 0xd9, 0x14,
+  0x99, 0x13, 0xb9, 0x30, 0xec, 0x3b, 0x8c, 0x16, 0xcc, 0xb2, 0x0b, 0x2e,
+  0x9e, 0x18, 0xbf, 0x26, 0xb6, 0xb7, 0x2a, 0x44, 0x61, 0x29, 0xdb, 0x21,
+  0x18, 0x84, 0x03, 0x4e, 0xef, 0x95, 0xf9, 0x45, 0xe3, 0xd8, 0xf2, 0x46,
+  0x82, 0xb4, 0xc9, 0x5e, 0x5f, 0xf3, 0xb2, 0x4f, 0x61, 0x80, 0x50, 0x0f,
+  0x0d, 0x7f, 0xe3, 0x1b, 0x23, 0xbd, 0x05, 0x2f, 0x0f, 0xb1, 0x60, 0x67,
+  0xd8, 0x85, 0xdf, 0x57, 0x0c, 0x8c, 0xdf, 0x50, 0x9e, 0x65, 0x3c, 0x58,
+  0x07, 0xbd, 0x29, 0x7e, 0xc5, 0xe5, 0xa7, 0x5a, 0x5a, 0x4b, 0x0c, 0x29,
+  0x89, 0x9d, 0x14, 0x11, 0x8c, 0x20, 0xcb, 0x76, 0x4d, 0x56, 0x2d, 0x4a,
+  0x10, 0xda, 0xaf, 0x0a, 0x65, 0x9d, 0x98, 0x3e, 0xa1, 0xac, 0x57, 0x46,
+  0xcb, 0xe8, 0xfc, 0x5b, 0xd4, 0x43, 0x4b, 0x63, 0x1b, 0x13, 0x4b, 0x1f,
+  0xed, 0xac, 0xbf, 0x30, 0x27, 0x15, 0xac, 0x53, 0x4b, 0x27, 0x61, 0x3e,
+  0x37, 0xc3, 0x65, 0x74, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x20,
+  0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, 0x20, 0x33, 0x33, 0x33, 0x33,
+  0xaa, 0xaa, 0xaa, 0x6a, 0x33, 0x33, 0x33, 0x33, 0xaa, 0xaa, 0xaa, 0x6a,
+  0x6d, 0xdb, 0xb6, 0x6d, 0x55, 0x55, 0x55, 0x55, 0xc0, 0x72, 0x8d, 0x36,
+  0x2c, 0xe5, 0xc0, 0x51, 0x00, 0x00, 0x00, 0x20, 0x0b, 0xd5, 0x67, 0x6c,
+  0x6c, 0x67, 0x2c, 0x13, 0x33, 0x33, 0x33, 0x33, 0x6c, 0x67, 0x2c, 0x13,
+  0xe6, 0xb8, 0x2c, 0x62, 0x55, 0x55, 0x55, 0x55, 0x15, 0x1f, 0xaf, 0x6a,
+  0xd9, 0xa8, 0x14, 0x44, 0xae, 0xb0, 0x38, 0x4b, 0x17, 0x76, 0xd9, 0x39,
+  0x55, 0x55, 0x55, 0x55, 0x28, 0xef, 0x9d, 0x4f, 0xc7, 0x3b, 0xa6, 0x24,
+  0x84, 0x5b, 0x79, 0x6f, 0xde, 0x4f, 0x8f, 0x3d, 0x55, 0x55, 0x55, 0x55,
+  0x54, 0xc2, 0xb2, 0x00, 0x5a, 0xed, 0x68, 0x0c, 0xeb, 0xd4, 0xc4, 0x61,
+  0x02, 0x8c, 0x85, 0x27, 0x55, 0x55, 0x55, 0x55, 0xe4, 0xc5, 0xbd, 0x0a,
+  0xf6, 0xec, 0x75, 0x26, 0xe0, 0xdb, 0xd8, 0x52, 0xdf, 0x28, 0xff, 0x33,
+  0x55, 0x55, 0x55, 0x55, 0xac, 0x68, 0x06, 0x00, 0xc9, 0xff, 0x91, 0x19,
+  0xb1, 0x12, 0x2b, 0x19, 0xa2, 0xdd, 0x47, 0x39, 0x55, 0x55, 0x55, 0x55,
+  0xd5, 0x03, 0x00, 0x00, 0x45, 0xc8, 0xcc, 0x4c, 0x55, 0x55, 0x55, 0x35,
+  0x8d, 0xd6, 0x68, 0x3d, 0x55, 0x55, 0x55, 0x55, 0x03, 0x00, 0x00, 0x00,
+  0x64, 0x66, 0x66, 0x26, 0x00, 0x00, 0x00, 0x20, 0x33, 0x33, 0x33, 0x33
+};
+
+    unsigned char poseidon_constants_4[] = {
+        0xdb, 0x64, 0xa5, 0x32, 0xd6, 0x3d, 0x12, 0x6e, 0x65, 0x66, 0x46, 0x59,
+  0x2a, 0x64, 0x51, 0x3b, 0xaf, 0xbe, 0x72, 0x0b, 0x66, 0x5f, 0x5c, 0x6c,
+  0x66, 0x11, 0x8c, 0x61, 0x99, 0x24, 0x99, 0x14, 0x1d, 0x5f, 0x67, 0x0a,
+  0x4d, 0xab, 0xc4, 0x1e, 0x43, 0xb2, 0x09, 0x58, 0xc0, 0x27, 0x4c, 0x5b,
+  0xf0, 0x0c, 0xf5, 0x12, 0xc9, 0x2f, 0x88, 0x4f, 0x59, 0x52, 0x5b, 0x6a,
+  0x73, 0x90, 0x55, 0x5b, 0xaf, 0x47, 0x55, 0x0d, 0xa7, 0xc2, 0x0c, 0x6e,
+  0xe6, 0xd6, 0x4e, 0x30, 0x9e, 0x75, 0x47, 0x12, 0xca, 0x93, 0xd1, 0x5b,
+  0x64, 0x27, 0xfc, 0x60, 0x6c, 0x16, 0x52, 0x20, 0xf5, 0xe0, 0x01, 0x15,
+  0x27, 0xf9, 0x96, 0x7f, 0xa0, 0x38, 0xad, 0x3c, 0x95, 0xd3, 0xe4, 0x32,
+  0x57, 0x95, 0x5a, 0x6b, 0x12, 0xcc, 0xdc, 0x18, 0x2b, 0xdd, 0xa4, 0x66,
+  0xbf, 0xe7, 0x96, 0x15, 0x85, 0x87, 0x6a, 0x1f, 0x15, 0x19, 0x9c, 0x65,
+  0xef, 0x24, 0xaa, 0x2c, 0x3f, 0x6b, 0xbc, 0x6b, 0x54, 0x24, 0x2c, 0x17,
+  0xf1, 0x7a, 0x8d, 0x57, 0x90, 0xa4, 0xd4, 0x4a, 0x12, 0x06, 0x77, 0x6a,
+  0xe8, 0x6b, 0xd9, 0x51, 0x80, 0x72, 0xa1, 0x31, 0xce, 0xa8, 0x59, 0x10,
+  0x0c, 0x90, 0xd4, 0x10, 0x8e, 0x60, 0x54, 0x1c, 0xe7, 0xfd, 0x42, 0x3a,
+  0x73, 0xc1, 0xcc, 0x4f, 0x58, 0xbb, 0x99, 0x7c, 0xd2, 0x51, 0xda, 0x43,
+  0xea, 0x6e, 0xe8, 0x16, 0xb2, 0x51, 0x53, 0x61, 0x7e, 0x68, 0x44, 0x3c,
+  0x33, 0x33, 0x33, 0x33, 0xaa, 0xaa, 0xaa, 0x6a, 0x6d, 0xdb, 0xb6, 0x6d,
+  0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71, 0xaa, 0xaa, 0xaa, 0x6a,
+  0x6d, 0xdb, 0xb6, 0x6d, 0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71,
+  0x99, 0x99, 0x99, 0x59, 0x6d, 0xdb, 0xb6, 0x6d, 0x00, 0x00, 0x00, 0x10,
+  0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
+  0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59,
+  0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35, 0x71, 0x1c, 0xc7, 0x71,
+  0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35,
+  0xd8, 0x89, 0x9d, 0x58, 0x33, 0x33, 0x33, 0x33, 0xae, 0x9d, 0xba, 0x61,
+  0x09, 0xf2, 0xee, 0x53, 0x5e, 0x5c, 0xe8, 0x61, 0x8e, 0x1a, 0x60, 0x6c,
+  0xaa, 0xaa, 0xaa, 0x6a, 0xff, 0x1a, 0xb7, 0x09, 0x1d, 0x84, 0x75, 0x5e,
+  0x88, 0x5e, 0x36, 0x25, 0x6b, 0xd4, 0xdd, 0x65, 0x6d, 0xdb, 0xb6, 0x6d,
+  0x1d, 0x84, 0x75, 0x5e, 0x10, 0x9d, 0x2d, 0x63, 0xa7, 0x62, 0xfc, 0x1f,
+  0xe2, 0x43, 0x63, 0x14, 0x00, 0x00, 0x00, 0x10, 0x88, 0x5e, 0x36, 0x25,
+  0xa7, 0x62, 0xfc, 0x1f, 0x47, 0xa0, 0x19, 0x6f, 0x48, 0x1f, 0x4e, 0x22,
+  0x71, 0x1c, 0xc7, 0x71, 0x6b, 0xd4, 0xdd, 0x65, 0xe2, 0x43, 0x63, 0x14,
+  0x48, 0x1f, 0x4e, 0x22, 0xb7, 0x4e, 0x73, 0x01, 0x33, 0x33, 0x33, 0x33,
+  0x84, 0xdd, 0xf7, 0x08, 0x6f, 0xc5, 0x14, 0x63, 0xb6, 0x22, 0x01, 0x3d,
+  0xcd, 0xab, 0x7d, 0x62, 0xac, 0x7e, 0x61, 0x57, 0x40, 0x6b, 0xc5, 0x45,
+  0x77, 0xbc, 0x02, 0x18, 0x8c, 0x66, 0xda, 0x74, 0x33, 0x33, 0x33, 0x33,
+  0x01, 0x9d, 0x33, 0x55, 0xed, 0x7d, 0x75, 0x63, 0x41, 0x92, 0x33, 0x76,
+  0x6b, 0xd5, 0x10, 0x23, 0x1a, 0xc4, 0x49, 0x5b, 0x0c, 0x86, 0x5a, 0x60,
+  0x23, 0xe5, 0xd8, 0x1c, 0x43, 0xe9, 0xe2, 0x0d, 0x33, 0x33, 0x33, 0x33,
+  0x1b, 0x68, 0xec, 0x17, 0x0e, 0x3f, 0x34, 0x1a, 0xb0, 0x28, 0xe9, 0x6c,
+  0xc0, 0xf7, 0x3e, 0x79, 0xdc, 0x08, 0x9e, 0x32, 0x45, 0xde, 0xea, 0x73,
+  0x7a, 0xc4, 0xb4, 0x0d, 0x65, 0xb6, 0x61, 0x04, 0x33, 0x33, 0x33, 0x33,
+  0x41, 0x01, 0x02, 0x6b, 0xd8, 0x62, 0x6b, 0x47, 0x47, 0xd9, 0x7e, 0x72,
+  0x4f, 0x80, 0x31, 0x54, 0x8b, 0x5e, 0x3e, 0x26, 0x64, 0x16, 0xe2, 0x51,
+  0xf4, 0xa6, 0xed, 0x35, 0xc3, 0xe9, 0xc5, 0x41, 0x33, 0x33, 0x33, 0x33,
+  0xd5, 0x3f, 0xed, 0x11, 0xf5, 0x0f, 0x56, 0x41, 0xf6, 0x0d, 0xf3, 0x78,
+  0xb0, 0x78, 0xa1, 0x7d, 0x5d, 0x33, 0xc4, 0x5e, 0xa6, 0xd9, 0x47, 0x4c,
+  0x07, 0xc3, 0x30, 0x5a, 0x91, 0x10, 0x31, 0x20, 0x33, 0x33, 0x33, 0x33,
+  0xa5, 0xec, 0xe5, 0x25, 0xe6, 0xa7, 0x4e, 0x01, 0xee, 0x3a, 0xe7, 0x62,
+  0x02, 0xfd, 0xf9, 0x08, 0xdd, 0x91, 0x3f, 0x2d, 0xca, 0xbc, 0xb5, 0x2c,
+  0x54, 0x9e, 0xd4, 0x78, 0x6b, 0x18, 0x94, 0x21, 0x33, 0x33, 0x33, 0x33,
+  0xe6, 0xb3, 0xd2, 0x2e, 0x49, 0xdb, 0xa8, 0x52, 0x5f, 0x6a, 0x75, 0x59,
+  0xd5, 0x45, 0x5c, 0x73, 0x40, 0xe4, 0xd8, 0x2a, 0x8c, 0xe6, 0xda, 0x50,
+  0x5f, 0x4f, 0x18, 0x5d, 0xf4, 0xa4, 0xf4, 0x46, 0x33, 0x33, 0x33, 0x33,
+  0x3e, 0x90, 0x5b, 0x3a, 0x55, 0x96, 0x22, 0x7c, 0xd9, 0x64, 0x36, 0x4e,
+  0x0b, 0xec, 0x66, 0x65, 0xac, 0x55, 0xa9, 0x19, 0x50, 0x87, 0x49, 0x1a,
+  0x1f, 0x78, 0x89, 0x36, 0x25, 0x2a, 0x06, 0x55, 0x33, 0x33, 0x33, 0x33,
+  0x6b, 0xf1, 0x61, 0x67, 0x67, 0x00, 0xc5, 0x24, 0x9e, 0xd1, 0x94, 0x6f,
+  0xbf, 0x8b, 0xaf, 0x2d, 0x69, 0x9c, 0xb7, 0x62, 0xf8, 0x0a, 0x43, 0x13,
+  0x3c, 0xc0, 0x48, 0x3e, 0x9f, 0x3f, 0xa8, 0x2c, 0x33, 0x33, 0x33, 0x33,
+  0x9d, 0x5b, 0xb2, 0x2b, 0x62, 0x05, 0x39, 0x20, 0x52, 0x1f, 0xe8, 0x05,
+  0x1b, 0x24, 0xc0, 0x13, 0x11, 0x11, 0x11, 0x11, 0x9c, 0x6a, 0x35, 0x45,
+  0xf6, 0x7f, 0x5c, 0x4c, 0x9f, 0xc4, 0x8f, 0x1f, 0x33, 0x33, 0x33, 0x33,
+  0xb1, 0xaa, 0xaa, 0x2a, 0xcb, 0xb6, 0x6d, 0x5b, 0x34, 0x49, 0x92, 0x24,
+  0x90, 0x65, 0x59, 0x56, 0xaa, 0xaa, 0xaa, 0x6a, 0x6d, 0xdb, 0xb6, 0x6d,
+  0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71
+    };
+
+    unsigned char poseidon_constants_8[] = {
+0x90, 0xaf, 0x71, 0x3e, 0xa3, 0xbe, 0x5a, 0x30, 0xd4, 0x1b, 0x6f, 0x5d,
+  0xeb, 0x36, 0x6b, 0x53, 0x14, 0xc0, 0x30, 0x13, 0xd5, 0xf8, 0x0b, 0x1c,
+  0xa8, 0x66, 0xf1, 0x3c, 0xbd, 0x64, 0xa3, 0x6c, 0x06, 0x5e, 0x95, 0x7c,
+  0xee, 0xc4, 0x0a, 0x0f, 0x37, 0x03, 0xba, 0x6d, 0x20, 0x85, 0xf1, 0x2c,
+  0xee, 0x59, 0x21, 0x11, 0x42, 0xae, 0xb7, 0x3c, 0x73, 0xb4, 0xd6, 0x71,
+  0x6a, 0x29, 0x40, 0x03, 0x86, 0xd8, 0x32, 0x68, 0x61, 0x62, 0x62, 0x32,
+  0x44, 0x5d, 0xcc, 0x38, 0x76, 0x0f, 0xbc, 0x1f, 0xc9, 0x6e, 0x67, 0x1d,
+  0x95, 0x35, 0x10, 0x79, 0x45, 0xaa, 0x0f, 0x7c, 0x73, 0xfa, 0x5d, 0x3f,
+  0x53, 0xf2, 0xdc, 0x21, 0x37, 0xfa, 0x15, 0x04, 0xfd, 0x31, 0x3d, 0x5d,
+  0x5d, 0xe6, 0x1d, 0x4a, 0xb3, 0x2b, 0xa2, 0x07, 0x2d, 0x48, 0x07, 0x2b,
+  0x92, 0x1c, 0x31, 0x52, 0x6c, 0xd3, 0x32, 0x2f, 0x0f, 0xdd, 0x82, 0x7d,
+  0x41, 0x0e, 0x81, 0x7e, 0x60, 0xfb, 0x49, 0x7b, 0xe5, 0x39, 0x3d, 0x75,
+  0x6d, 0xcf, 0x02, 0x77, 0x0d, 0xf6, 0xf8, 0x0c, 0x43, 0xae, 0x62, 0x5e,
+  0x26, 0x36, 0x9e, 0x3a, 0x10, 0xe3, 0x59, 0x4b, 0x3a, 0x59, 0x49, 0x73,
+  0x31, 0x20, 0xb9, 0x40, 0x39, 0xed, 0xaf, 0x37, 0x6d, 0x5c, 0x4c, 0x6a,
+  0xce, 0xca, 0xc4, 0x33, 0x53, 0x96, 0x92, 0x1d, 0xb2, 0xa1, 0xac, 0x65,
+  0xbb, 0x43, 0xc4, 0x16, 0xf9, 0x38, 0x10, 0x67, 0x3d, 0xbb, 0x28, 0x7a,
+  0x2b, 0x1e, 0x65, 0x36, 0x07, 0x14, 0x36, 0x3c, 0xcb, 0xdf, 0x03, 0x6b,
+  0x03, 0x7b, 0xe6, 0x67, 0x79, 0x2a, 0x08, 0x47, 0xb7, 0x8f, 0x9c, 0x7e,
+  0x54, 0xde, 0x08, 0x0a, 0xf8, 0x99, 0x24, 0x6f, 0x64, 0x78, 0x80, 0x5f,
+  0x43, 0x76, 0x77, 0x40, 0x12, 0x62, 0x71, 0x10, 0x35, 0xf5, 0xdd, 0x0a,
+  0x06, 0xff, 0x9b, 0x7b, 0xd8, 0x1a, 0xf3, 0x50, 0x1d, 0xc3, 0x8c, 0x60,
+  0xe0, 0x61, 0xf5, 0x3d, 0xf9, 0xbf, 0xe4, 0x38, 0x78, 0xbf, 0x59, 0x0e,
+  0xed, 0xc9, 0x4d, 0x0b, 0xb1, 0x7a, 0x10, 0x2b, 0x84, 0x27, 0x07, 0x70,
+  0x5d, 0xc0, 0xa4, 0x7e, 0x9c, 0xf0, 0xf6, 0x69, 0x89, 0x6c, 0xc5, 0x39,
+  0x4a, 0x7d, 0x5e, 0x26, 0x2f, 0x08, 0x9d, 0x05, 0xdc, 0x71, 0xec, 0x08,
+  0x2b, 0xca, 0x68, 0x14, 0x42, 0xf6, 0xe6, 0x0a, 0x2f, 0xa5, 0x34, 0x6d,
+  0x95, 0xaa, 0x80, 0x55, 0x23, 0x0f, 0x5f, 0x20, 0xbe, 0x4d, 0x0b, 0x20,
+  0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
+  0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35,
+  0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
+  0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
+  0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58,
+  0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
+  0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
+  0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
+  0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
+  0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
+  0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
+  0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
+  0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
+  0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
+  0x71, 0x1c, 0xc7, 0x71, 0x50, 0x05, 0xd7, 0x30, 0x09, 0x94, 0x4f, 0x13,
+  0x11, 0x86, 0x4b, 0x61, 0x74, 0x8b, 0x94, 0x0e, 0x7e, 0x5d, 0x93, 0x27,
+  0xeb, 0xb6, 0x4b, 0x61, 0x90, 0x3f, 0x9b, 0x7d, 0x10, 0xe9, 0x16, 0x06,
+  0x99, 0x99, 0x99, 0x59, 0x4f, 0xf6, 0x15, 0x6b, 0x84, 0x8c, 0xe0, 0x5f,
+  0x88, 0x9e, 0xb2, 0x08, 0x32, 0x36, 0xe3, 0x25, 0x64, 0x0a, 0xf5, 0x6f,
+  0x80, 0xff, 0x8e, 0x6f, 0xcd, 0xb5, 0x72, 0x12, 0x90, 0xa2, 0x7a, 0x09,
+  0x45, 0x17, 0x5d, 0x74, 0x84, 0x8c, 0xe0, 0x5f, 0xf5, 0x67, 0x02, 0x2d,
+  0x71, 0x83, 0xf0, 0x55, 0x81, 0xa2, 0x81, 0x4b, 0xec, 0xff, 0xb0, 0x6b,
+  0x17, 0x41, 0xd6, 0x36, 0xf3, 0x16, 0x58, 0x23, 0x49, 0x90, 0xa2, 0x17,
+  0x55, 0x55, 0x55, 0x35, 0x88, 0x9e, 0xb2, 0x08, 0x71, 0x83, 0xf0, 0x55,
+  0x27, 0x2a, 0xb0, 0x29, 0x0b, 0xe4, 0x53, 0x70, 0x7f, 0xeb, 0x60, 0x74,
+  0xb9, 0x92, 0xa9, 0x4b, 0x51, 0x41, 0x0e, 0x56, 0x1b, 0xe4, 0x67, 0x43,
+  0xd8, 0x89, 0x9d, 0x58, 0x32, 0x36, 0xe3, 0x25, 0x81, 0xa2, 0x81, 0x4b,
+  0x0b, 0xe4, 0x53, 0x70, 0x73, 0x99, 0xf0, 0x02, 0x1a, 0xf7, 0xe1, 0x40,
+  0x18, 0xc4, 0x58, 0x3a, 0xcc, 0xf5, 0x0b, 0x18, 0xf0, 0x39, 0xab, 0x7a,
+  0xb6, 0x6d, 0xdb, 0x76, 0x64, 0x0a, 0xf5, 0x6f, 0xec, 0xff, 0xb0, 0x6b,
+  0x7f, 0xeb, 0x60, 0x74, 0x1a, 0xf7, 0xe1, 0x40, 0xf7, 0xfc, 0xbe, 0x7f,
+  0xbf, 0x63, 0xc5, 0x05, 0x15, 0x3c, 0x9f, 0x2b, 0x9b, 0x77, 0xb0, 0x44,
+  0x11, 0x11, 0x11, 0x11, 0x80, 0xff, 0x8e, 0x6f, 0x17, 0x41, 0xd6, 0x36,
+  0xb9, 0x92, 0xa9, 0x4b, 0x18, 0xc4, 0x58, 0x3a, 0xbf, 0x63, 0xc5, 0x05,
+  0x2f, 0x5c, 0x3c, 0x09, 0x25, 0xaf, 0xdf, 0x11, 0x21, 0x7d, 0x95, 0x58,
+  0x00, 0x00, 0x00, 0x08, 0xcd, 0xb5, 0x72, 0x12, 0xf3, 0x16, 0x58, 0x23,
+  0x51, 0x41, 0x0e, 0x56, 0xcc, 0xf5, 0x0b, 0x18, 0x15, 0x3c, 0x9f, 0x2b,
+  0x25, 0xaf, 0xdf, 0x11, 0x38, 0x50, 0xe9, 0x16, 0x12, 0xb8, 0xc8, 0x17,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x90, 0xa2, 0x7a, 0x09, 0x49, 0x90, 0xa2, 0x17,
+  0x1b, 0xe4, 0x67, 0x43, 0xf0, 0x39, 0xab, 0x7a, 0x9b, 0x77, 0xb0, 0x44,
+  0x21, 0x7d, 0x95, 0x58, 0x12, 0xb8, 0xc8, 0x17, 0x5a, 0xfc, 0xf7, 0x5c,
+  0x71, 0x1c, 0xc7, 0x71, 0xdb, 0x50, 0x89, 0x38, 0x5f, 0x88, 0xe3, 0x32,
+  0x8b, 0xb4, 0x3b, 0x6c, 0x95, 0x0a, 0xf1, 0x41, 0xe6, 0x0a, 0x52, 0x7d,
+  0xd1, 0x0d, 0xb1, 0x57, 0x9b, 0xd2, 0xf4, 0x1d, 0x80, 0x17, 0xb2, 0x42,
+  0x9c, 0x40, 0x6e, 0x2f, 0x63, 0xa7, 0x42, 0x77, 0xf9, 0x37, 0xd1, 0x43,
+  0x98, 0xd1, 0xec, 0x50, 0x91, 0x26, 0xfa, 0x4e, 0x0c, 0x9e, 0xcc, 0x31,
+  0x52, 0xf4, 0x20, 0x5d, 0x2a, 0x20, 0xeb, 0x1b, 0x71, 0x1c, 0xc7, 0x71,
+  0x54, 0x29, 0xf4, 0x4a, 0xde, 0x91, 0xf6, 0x54, 0x8b, 0xed, 0x18, 0x26,
+  0x71, 0x24, 0x22, 0x34, 0xb7, 0xaf, 0x61, 0x27, 0x7a, 0x0a, 0x21, 0x7f,
+  0x9f, 0xfe, 0xa1, 0x53, 0x26, 0x97, 0x6b, 0x5b, 0xf4, 0xea, 0xef, 0x4a,
+  0x4b, 0x03, 0xa0, 0x7c, 0xe6, 0x64, 0x69, 0x47, 0x76, 0xf7, 0x2d, 0x0b,
+  0x6f, 0xd5, 0x2c, 0x45, 0x52, 0xc1, 0x5c, 0x46, 0x25, 0x38, 0xab, 0x79,
+  0x64, 0xed, 0xe7, 0x57, 0x71, 0x1c, 0xc7, 0x71, 0x94, 0xc2, 0xb7, 0x7f,
+  0xaf, 0x0d, 0x61, 0x4c, 0xa3, 0x86, 0x8e, 0x45, 0xdc, 0x73, 0xe3, 0x77,
+  0x71, 0xed, 0x21, 0x7d, 0x4b, 0x8e, 0xc7, 0x52, 0x39, 0x5d, 0x49, 0x1d,
+  0x75, 0x35, 0xed, 0x09, 0xc6, 0x02, 0x3b, 0x22, 0xb8, 0x91, 0x07, 0x13,
+  0x7f, 0xbf, 0x15, 0x7f, 0xb5, 0xbe, 0x0a, 0x5c, 0xbc, 0x75, 0x54, 0x61,
+  0x6c, 0x2f, 0x28, 0x5f, 0xff, 0xf0, 0x7b, 0x67, 0x11, 0x8e, 0x70, 0x29,
+  0x71, 0x1c, 0xc7, 0x71, 0xe6, 0xfc, 0x29, 0x07, 0xbd, 0x0c, 0x4d, 0x5f,
+  0x57, 0xb7, 0x87, 0x41, 0xec, 0x48, 0xda, 0x18, 0x78, 0x41, 0xb8, 0x6d,
+  0xde, 0x7e, 0x47, 0x5a, 0x13, 0x03, 0xc5, 0x52, 0x2e, 0xee, 0xf3, 0x3f,
+  0x06, 0xd0, 0xcd, 0x48, 0x77, 0x2a, 0xcd, 0x7e, 0x35, 0xee, 0x74, 0x63,
+  0x3e, 0x26, 0x65, 0x64, 0x37, 0xa1, 0xfb, 0x7a, 0x03, 0x44, 0xa8, 0x70,
+  0x2f, 0x03, 0x27, 0x1e, 0xb3, 0x02, 0x3e, 0x4a, 0x71, 0x1c, 0xc7, 0x71,
+  0xfd, 0xe1, 0xfe, 0x3c, 0x88, 0x1c, 0x36, 0x53, 0x36, 0x31, 0x5a, 0x32,
+  0x88, 0x7b, 0xa6, 0x17, 0x40, 0x31, 0xe4, 0x0a, 0xb3, 0x70, 0x8f, 0x4f,
+  0xc3, 0xa2, 0xd7, 0x06, 0x34, 0x9d, 0x4a, 0x71, 0x5b, 0xfa, 0x79, 0x25,
+  0xe8, 0x6f, 0x05, 0x65, 0xc1, 0x4a, 0xee, 0x5c, 0x9a, 0xb2, 0x83, 0x05,
+  0xb0, 0x89, 0x77, 0x2e, 0xc1, 0x56, 0x34, 0x08, 0x50, 0xf5, 0xde, 0x12,
+  0xae, 0x68, 0xc2, 0x1b, 0x71, 0x1c, 0xc7, 0x71, 0xb3, 0x84, 0x6e, 0x4f,
+  0xae, 0x74, 0x57, 0x4f, 0x56, 0xf3, 0xfc, 0x48, 0xfa, 0x73, 0xd7, 0x0e,
+  0x8a, 0xc5, 0x35, 0x4d, 0xf6, 0x26, 0x15, 0x2a, 0xcf, 0xb5, 0x2d, 0x64,
+  0xd1, 0x2a, 0x84, 0x43, 0xab, 0xc0, 0xec, 0x60, 0xa9, 0xbc, 0x09, 0x11,
+  0xfd, 0x06, 0xea, 0x1e, 0xba, 0x29, 0x77, 0x6c, 0xb1, 0x37, 0xa5, 0x42,
+  0x1c, 0x9b, 0x58, 0x37, 0xa8, 0xb7, 0xae, 0x3e, 0x6a, 0xf8, 0x63, 0x25,
+  0x71, 0x1c, 0xc7, 0x71, 0x22, 0xa0, 0x75, 0x4e, 0x17, 0x33, 0x99, 0x7c,
+  0x97, 0x97, 0x30, 0x04, 0xbc, 0x22, 0x6d, 0x7c, 0xb3, 0xd7, 0xd9, 0x56,
+  0x4e, 0xef, 0x40, 0x5e, 0x02, 0x05, 0x51, 0x1e, 0x0c, 0x32, 0xb7, 0x06,
+  0x41, 0x16, 0x80, 0x33, 0xc2, 0xdd, 0x8f, 0x18, 0x65, 0xa3, 0xe1, 0x4a,
+  0xdb, 0xb4, 0x5d, 0x78, 0xf3, 0x99, 0x48, 0x3e, 0x04, 0x5b, 0xb9, 0x09,
+  0xd2, 0x3d, 0x14, 0x05, 0x69, 0x50, 0xe9, 0x57, 0x71, 0x1c, 0xc7, 0x71,
+  0x0d, 0x72, 0x37, 0x6c, 0xe3, 0xd1, 0x57, 0x2f, 0x9e, 0xb7, 0xe1, 0x30,
+  0x22, 0xce, 0xe5, 0x66, 0x45, 0x7b, 0x06, 0x0e, 0x06, 0x66, 0xdd, 0x11,
+  0xef, 0xdf, 0x61, 0x52, 0x7d, 0xb9, 0xcf, 0x1e, 0x97, 0xbe, 0x55, 0x00,
+  0x94, 0xcb, 0x50, 0x7c, 0xa0, 0x83, 0x1c, 0x57, 0xf3, 0x72, 0x8c, 0x40,
+  0x07, 0x32, 0x39, 0x54, 0xe8, 0x5a, 0x10, 0x7b, 0x09, 0xc2, 0x02, 0x58,
+  0xb0, 0xeb, 0x23, 0x51, 0x71, 0x1c, 0xc7, 0x71, 0xf0, 0xfd, 0x78, 0x2c,
+  0xe7, 0xa8, 0x53, 0x7c, 0xdd, 0xf6, 0xa3, 0x2b, 0xa9, 0x51, 0xf4, 0x33,
+  0x1d, 0x4d, 0x13, 0x0e, 0x53, 0x6b, 0xde, 0x6b, 0x48, 0x46, 0xa0, 0x01,
+  0xbf, 0x74, 0xf2, 0x14, 0xe5, 0x99, 0x3d, 0x72, 0x37, 0x8e, 0xa9, 0x44,
+  0x61, 0xed, 0xdd, 0x3b, 0x7c, 0x11, 0x28, 0x12, 0xd5, 0xd6, 0x27, 0x78,
+  0x4e, 0xf8, 0xe4, 0x3d, 0xdc, 0x5c, 0x92, 0x0c, 0xea, 0x5b, 0xe2, 0x44,
+  0x71, 0x1c, 0xc7, 0x71, 0x64, 0x55, 0xb2, 0x0d, 0x54, 0x7f, 0x64, 0x72,
+  0x8e, 0xe1, 0x7b, 0x52, 0xf5, 0xe4, 0x20, 0x13, 0xd1, 0xd4, 0x5d, 0x4c,
+  0x33, 0x3d, 0xb6, 0x55, 0x26, 0xed, 0xb0, 0x75, 0xa0, 0xf2, 0x72, 0x51,
+  0x6b, 0xc5, 0x37, 0x23, 0x0d, 0x1d, 0xf5, 0x6f, 0xa6, 0x83, 0x5f, 0x3e,
+  0x1e, 0xb5, 0x18, 0x23, 0xc8, 0x40, 0xae, 0x63, 0x68, 0x79, 0x8e, 0x56,
+  0xb0, 0x33, 0x43, 0x08, 0x5b, 0xac, 0x52, 0x39, 0x71, 0x1c, 0xc7, 0x71,
+  0x9d, 0xf2, 0x00, 0x73, 0xf8, 0x96, 0xbb, 0x43, 0x5b, 0x59, 0xce, 0x07,
+  0xbb, 0x11, 0xc8, 0x43, 0xde, 0xea, 0xb7, 0x34, 0x51, 0xbf, 0xa7, 0x2d,
+  0x33, 0x35, 0xc2, 0x40, 0x1c, 0x81, 0x60, 0x63, 0x60, 0x0b, 0xb6, 0x60,
+  0xbf, 0xb9, 0x38, 0x0c, 0x02, 0x54, 0x53, 0x20, 0xd9, 0xf9, 0xeb, 0x2f,
+  0x7e, 0x5b, 0xdf, 0x58, 0x4b, 0x99, 0x8e, 0x04, 0x27, 0xb4, 0x18, 0x78,
+  0xd6, 0x37, 0x16, 0x60, 0x71, 0x1c, 0xc7, 0x71, 0x74, 0x66, 0x66, 0x66,
+  0xb2, 0xf1, 0x94, 0x20, 0xad, 0x2f, 0xba, 0x68, 0x6a, 0x33, 0xfe, 0x6e,
+  0xa5, 0x51, 0xec, 0x44, 0xab, 0x05, 0x7e, 0x60, 0x48, 0x6b, 0xa5, 0x56,
+  0x38, 0x3d, 0xc7, 0x24, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
+  0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f
+    };
+
+    unsigned char poseidon_constants_11[] = {
+        0xb0, 0xf1, 0x1f, 0x2e, 0xf8, 0x8b, 0xb5, 0x07, 0x8d, 0xc4, 0xe1, 0x46,
+  0x99, 0x23, 0x9f, 0x06, 0xcc, 0x64, 0x13, 0x45, 0x9e, 0xb1, 0xdf, 0x5f,
+  0xfa, 0x8e, 0x0f, 0x6f, 0x33, 0xd8, 0xfe, 0x19, 0x0a, 0x25, 0x8b, 0x20,
+  0xe1, 0x2c, 0xcc, 0x36, 0x17, 0x3f, 0x03, 0x05, 0xe1, 0x13, 0xce, 0x35,
+  0xd4, 0xc9, 0xe7, 0x65, 0x1f, 0x7f, 0x2c, 0x7a, 0x93, 0x9f, 0x34, 0x19,
+  0x4d, 0x22, 0xf2, 0x7f, 0x8e, 0xa8, 0xb0, 0x51, 0x22, 0x8c, 0x91, 0x30,
+  0xa5, 0x9c, 0xff, 0x31, 0x0e, 0x04, 0xc9, 0x19, 0x69, 0x60, 0xee, 0x0f,
+  0xc5, 0xa5, 0xeb, 0x6b, 0xb0, 0xa4, 0xaa, 0x5d, 0x1c, 0x4e, 0xeb, 0x73,
+  0xec, 0x94, 0xb7, 0x15, 0xce, 0x64, 0x1c, 0x60, 0x3e, 0xa3, 0x6b, 0x4a,
+  0x87, 0x7a, 0x25, 0x2f, 0xfc, 0xc3, 0x17, 0x20, 0x06, 0xb6, 0x22, 0x7d,
+  0xca, 0xea, 0x8b, 0x3b, 0xf9, 0xca, 0xa4, 0x32, 0xd2, 0xb7, 0x2e, 0x01,
+  0x4f, 0x31, 0xc9, 0x2f, 0x10, 0xbf, 0x41, 0x4c, 0xe6, 0xfe, 0xba, 0x49,
+  0xe5, 0x89, 0xbb, 0x77, 0x7e, 0xe8, 0x83, 0x1c, 0x72, 0xe7, 0x26, 0x58,
+  0x24, 0x90, 0x9d, 0x1e, 0xb3, 0x20, 0xc8, 0x64, 0x84, 0xa3, 0x21, 0x5d,
+  0x06, 0x64, 0x30, 0x4b, 0x19, 0x35, 0x96, 0x1e, 0xd1, 0x86, 0x57, 0x4a,
+  0xb3, 0x8e, 0xd6, 0x7d, 0xaf, 0xd1, 0xde, 0x3f, 0xa2, 0x2c, 0x32, 0x0a,
+  0xbb, 0xea, 0x4a, 0x46, 0x64, 0x1b, 0x72, 0x14, 0x75, 0x85, 0x1b, 0x4d,
+  0x11, 0x02, 0x5f, 0x6f, 0x06, 0xdd, 0xd3, 0x6f, 0xbc, 0xcc, 0x77, 0x2e,
+  0xb7, 0x43, 0xf4, 0x19, 0x9d, 0x2c, 0x4b, 0x2b, 0x0c, 0x41, 0xb9, 0x02,
+  0xdc, 0x14, 0x5a, 0x67, 0xd4, 0x56, 0xca, 0x45, 0x65, 0xd2, 0x7d, 0x17,
+  0xcd, 0x91, 0xdd, 0x45, 0xd8, 0xa8, 0xd8, 0x4b, 0xc9, 0x2b, 0xf2, 0x35,
+  0xc1, 0x81, 0x6c, 0x33, 0xbc, 0xf4, 0x4d, 0x04, 0xfd, 0xb0, 0x91, 0x2b,
+  0xcf, 0xad, 0x39, 0x45, 0x35, 0xb2, 0xac, 0x2e, 0x2f, 0x13, 0xe3, 0x0b,
+  0x40, 0x59, 0x33, 0x07, 0xe3, 0xa5, 0xa1, 0x4d, 0x0e, 0x79, 0x05, 0x4c,
+  0x36, 0x9b, 0xf1, 0x7f, 0x90, 0x50, 0x46, 0x25, 0x87, 0x10, 0x24, 0x3f,
+  0x52, 0x5d, 0xff, 0x18, 0xad, 0xed, 0x78, 0x52, 0x00, 0x9c, 0xfe, 0x66,
+  0x22, 0x24, 0xe0, 0x62, 0x13, 0xe2, 0x6f, 0x67, 0xd9, 0xe3, 0x6c, 0x64,
+  0x6b, 0xa6, 0xea, 0x53, 0x61, 0x56, 0x8a, 0x33, 0x81, 0x35, 0xe5, 0x0f,
+  0x35, 0xc9, 0xf3, 0x59, 0xc2, 0xa8, 0x92, 0x73, 0x69, 0x66, 0x05, 0x70,
+  0xa1, 0x5f, 0xec, 0x4e, 0x3d, 0x6b, 0xc0, 0x78, 0xa4, 0xcb, 0xfc, 0x7e,
+  0x44, 0x8c, 0xc4, 0x1b, 0x25, 0x70, 0x8f, 0x27, 0x87, 0x76, 0x2d, 0x4f,
+  0x70, 0xb0, 0xea, 0x7a, 0x92, 0x43, 0x8c, 0x00, 0xed, 0xfd, 0x3b, 0x23,
+  0x69, 0x71, 0x8e, 0x49, 0x83, 0xc3, 0x4e, 0x37, 0xab, 0x18, 0xd9, 0x30,
+  0x4d, 0x48, 0x5e, 0x7e, 0xbc, 0x5a, 0x1a, 0x24, 0x34, 0xed, 0x19, 0x57,
+  0xf4, 0xf4, 0x0d, 0x02, 0x0c, 0x57, 0xde, 0x6d, 0x40, 0x39, 0x1f, 0x71,
+  0x9c, 0xa1, 0xb0, 0x28, 0x2d, 0x05, 0xb9, 0x6b, 0x85, 0x7a, 0x4c, 0x47,
+  0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
+  0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
+  0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
+  0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
+  0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
+  0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
+  0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
+  0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
+  0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
+  0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
+  0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
+  0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
+  0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
+  0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
+  0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
+  0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11,
+  0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
+  0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
+  0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
+  0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
+  0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
+  0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
+  0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
+  0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
+  0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11,
+  0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b, 0x00, 0x00, 0x00, 0x04,
+  0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
+  0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
+  0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48,
+  0xbd, 0xf7, 0xde, 0x7b, 0x00, 0x00, 0x00, 0x04, 0xc1, 0x07, 0x1f, 0x7c,
+  0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
+  0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
+  0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b,
+  0x00, 0x00, 0x00, 0x04, 0xc1, 0x07, 0x1f, 0x7c, 0x87, 0x87, 0x87, 0x47,
+  0x55, 0x55, 0x55, 0x35, 0x7c, 0xec, 0xe8, 0x54, 0x5f, 0xc4, 0x1c, 0x7e,
+  0x02, 0x38, 0x4e, 0x55, 0x86, 0x80, 0x6d, 0x71, 0xc3, 0xa8, 0x98, 0x4a,
+  0x2b, 0xaa, 0x86, 0x63, 0x60, 0xd7, 0x4f, 0x2e, 0xb4, 0xac, 0xce, 0x78,
+  0xbd, 0x1c, 0x4f, 0x55, 0x6b, 0x2c, 0x33, 0x64, 0x8c, 0x56, 0x30, 0x43,
+  0xd8, 0x89, 0x9d, 0x58, 0xdd, 0x29, 0xc3, 0x15, 0x02, 0x15, 0x5b, 0x4f,
+  0xdc, 0xb9, 0x0c, 0x03, 0x9a, 0x8d, 0x4d, 0x53, 0x6e, 0xf2, 0x33, 0x15,
+  0xed, 0x3f, 0x16, 0x06, 0x43, 0xab, 0x59, 0x54, 0x1a, 0x62, 0xcd, 0x3a,
+  0xda, 0x77, 0xa8, 0x51, 0x42, 0x58, 0x05, 0x55, 0x39, 0xeb, 0xd1, 0x45,
+  0xb6, 0x6d, 0xdb, 0x76, 0x02, 0x15, 0x5b, 0x4f, 0xb9, 0x5a, 0x8c, 0x36,
+  0x9a, 0x63, 0x3e, 0x3c, 0xe6, 0x28, 0x72, 0x36, 0x51, 0x89, 0xdb, 0x3b,
+  0xfa, 0xe0, 0x07, 0x07, 0x30, 0xb3, 0x56, 0x39, 0x91, 0x42, 0x86, 0x38,
+  0xda, 0xd2, 0x8f, 0x67, 0x75, 0xca, 0x3e, 0x69, 0xe9, 0xd8, 0x07, 0x6f,
+  0x11, 0x11, 0x11, 0x11, 0xdc, 0xb9, 0x0c, 0x03, 0x9a, 0x63, 0x3e, 0x3c,
+  0x54, 0xdc, 0x52, 0x1f, 0xf3, 0xc8, 0xb6, 0x6b, 0x96, 0x31, 0xf8, 0x1b,
+  0x20, 0xee, 0x0b, 0x07, 0x4c, 0x37, 0x80, 0x4b, 0x31, 0x99, 0xd0, 0x09,
+  0xb8, 0xa5, 0x62, 0x5f, 0xa2, 0x72, 0xfb, 0x33, 0x11, 0xd8, 0x0e, 0x65,
+  0x00, 0x00, 0x00, 0x08, 0x9a, 0x8d, 0x4d, 0x53, 0xe6, 0x28, 0x72, 0x36,
+  0xf3, 0xc8, 0xb6, 0x6b, 0xef, 0x80, 0xab, 0x77, 0x4d, 0x49, 0x25, 0x2b,
+  0x7e, 0x10, 0x08, 0x1b, 0x70, 0x22, 0x72, 0x66, 0x8b, 0xe6, 0x06, 0x3a,
+  0x58, 0xb9, 0x7e, 0x02, 0x97, 0xf4, 0xc2, 0x4f, 0x6b, 0x9a, 0x68, 0x53,
+  0x0f, 0x0f, 0x0f, 0x0f, 0x6e, 0xf2, 0x33, 0x15, 0x51, 0x89, 0xdb, 0x3b,
+  0x96, 0x31, 0xf8, 0x1b, 0x4d, 0x49, 0x25, 0x2b, 0xe2, 0xe0, 0x5c, 0x64,
+  0xb6, 0x1d, 0x73, 0x13, 0x38, 0x1b, 0xfd, 0x49, 0xe1, 0x2c, 0xce, 0x5d,
+  0x2a, 0x6b, 0xb4, 0x17, 0x7e, 0xa9, 0x6e, 0x72, 0x2f, 0x77, 0x47, 0x79,
+  0x38, 0x8e, 0xe3, 0x78, 0xed, 0x3f, 0x16, 0x06, 0xfa, 0xe0, 0x07, 0x07,
+  0x20, 0xee, 0x0b, 0x07, 0x7e, 0x10, 0x08, 0x1b, 0xb6, 0x1d, 0x73, 0x13,
+  0xca, 0x4a, 0x44, 0x68, 0x1c, 0x93, 0xbc, 0x37, 0xfa, 0x14, 0x8b, 0x55,
+  0xae, 0xe0, 0xac, 0x31, 0xcb, 0x04, 0x09, 0x46, 0x27, 0x8f, 0x96, 0x07,
+  0x28, 0xaf, 0xa1, 0x3c, 0x43, 0xab, 0x59, 0x54, 0x30, 0xb3, 0x56, 0x39,
+  0x4c, 0x37, 0x80, 0x4b, 0x70, 0x22, 0x72, 0x66, 0x38, 0x1b, 0xfd, 0x49,
+  0x1c, 0x93, 0xbc, 0x37, 0xfb, 0xdd, 0xff, 0x41, 0x73, 0x22, 0xa8, 0x31,
+  0xd4, 0xc3, 0x26, 0x2b, 0xe7, 0x8c, 0xce, 0x35, 0x03, 0x29, 0x9c, 0x43,
+  0xcc, 0xcc, 0xcc, 0x6c, 0x1a, 0x62, 0xcd, 0x3a, 0x91, 0x42, 0x86, 0x38,
+  0x31, 0x99, 0xd0, 0x09, 0x8b, 0xe6, 0x06, 0x3a, 0xe1, 0x2c, 0xce, 0x5d,
+  0xfa, 0x14, 0x8b, 0x55, 0x73, 0x22, 0xa8, 0x31, 0xaf, 0x9f, 0x0d, 0x2d,
+  0xd8, 0xf1, 0xd2, 0x43, 0x41, 0x60, 0x7a, 0x48, 0xca, 0xa1, 0x4c, 0x7c,
+  0x79, 0x9e, 0xe7, 0x79, 0xda, 0x77, 0xa8, 0x51, 0xda, 0xd2, 0x8f, 0x67,
+  0xb8, 0xa5, 0x62, 0x5f, 0x58, 0xb9, 0x7e, 0x02, 0x2a, 0x6b, 0xb4, 0x17,
+  0xae, 0xe0, 0xac, 0x31, 0xd4, 0xc3, 0x26, 0x2b, 0xd8, 0xf1, 0xd2, 0x43,
+  0x38, 0xc4, 0xc5, 0x55, 0x39, 0x3d, 0x1f, 0x4c, 0x81, 0xa8, 0x99, 0x14,
+  0xa2, 0x8b, 0x2e, 0x7a, 0x42, 0x58, 0x05, 0x55, 0x75, 0xca, 0x3e, 0x69,
+  0xa2, 0x72, 0xfb, 0x33, 0x97, 0xf4, 0xc2, 0x4f, 0x7e, 0xa9, 0x6e, 0x72,
+  0xcb, 0x04, 0x09, 0x46, 0xe7, 0x8c, 0xce, 0x35, 0x41, 0x60, 0x7a, 0x48,
+  0x39, 0x3d, 0x1f, 0x4c, 0xc3, 0x27, 0xbb, 0x1a, 0x86, 0xb4, 0x97, 0x00,
+  0xc8, 0x42, 0x16, 0x32, 0x39, 0xeb, 0xd1, 0x45, 0xe9, 0xd8, 0x07, 0x6f,
+  0x11, 0xd8, 0x0e, 0x65, 0x6b, 0x9a, 0x68, 0x53, 0x2f, 0x77, 0x47, 0x79,
+  0x27, 0x8f, 0x96, 0x07, 0x03, 0x29, 0x9c, 0x43, 0xca, 0xa1, 0x4c, 0x7c,
+  0x81, 0xa8, 0x99, 0x14, 0x86, 0xb4, 0x97, 0x00, 0x0c, 0xd8, 0x29, 0x37,
+  0x55, 0x55, 0x55, 0x35, 0xcc, 0xab, 0xe7, 0x58, 0x82, 0xaa, 0xb7, 0x06,
+  0x3c, 0x2a, 0x3d, 0x61, 0x45, 0xbd, 0xcc, 0x4b, 0xa9, 0x83, 0x44, 0x56,
+  0x16, 0xe6, 0x58, 0x6e, 0x70, 0x4b, 0x3a, 0x44, 0xe2, 0x3b, 0x37, 0x60,
+  0xf0, 0x3b, 0x41, 0x1e, 0x44, 0x40, 0x84, 0x5a, 0x63, 0x5d, 0x4d, 0x78,
+  0x22, 0x80, 0xb3, 0x0f, 0xe0, 0x85, 0xec, 0x77, 0xe5, 0x3d, 0xda, 0x27,
+  0x55, 0xf9, 0xfd, 0x44, 0x38, 0xa7, 0x0f, 0x0a, 0x2f, 0xec, 0xda, 0x34,
+  0x24, 0xef, 0x00, 0x40, 0x54, 0x9a, 0x0b, 0x27, 0xf9, 0x85, 0xf4, 0x16,
+  0x14, 0x1f, 0x17, 0x30, 0x1d, 0xb0, 0xdf, 0x31, 0x55, 0x55, 0x55, 0x35,
+  0x98, 0x36, 0x7e, 0x31, 0xd0, 0xda, 0x0a, 0x16, 0xae, 0xb0, 0x6a, 0x00,
+  0x0e, 0x7a, 0x7e, 0x6d, 0x93, 0x81, 0x4d, 0x21, 0x45, 0x5a, 0x4d, 0x20,
+  0x42, 0x5d, 0xfd, 0x49, 0x28, 0xc5, 0xe2, 0x75, 0x45, 0x85, 0x03, 0x2c,
+  0xfc, 0x78, 0x72, 0x15, 0x98, 0x9c, 0x88, 0x0b, 0xed, 0x8f, 0x6f, 0x2b,
+  0x55, 0x75, 0x17, 0x5f, 0xe5, 0xed, 0x21, 0x52, 0x5a, 0x34, 0x10, 0x7d,
+  0x42, 0x25, 0x57, 0x6a, 0xa4, 0xb2, 0xe6, 0x2e, 0x05, 0xa8, 0xc4, 0x17,
+  0xff, 0x9c, 0x7f, 0x6f, 0x23, 0x64, 0x17, 0x44, 0x85, 0xa9, 0x6b, 0x46,
+  0x66, 0x58, 0x1b, 0x3b, 0x55, 0x55, 0x55, 0x35, 0x55, 0xf6, 0xca, 0x06,
+  0x68, 0x75, 0xa9, 0x55, 0x54, 0x44, 0x4f, 0x61, 0x65, 0x3b, 0x96, 0x37,
+  0xa9, 0x89, 0xb6, 0x47, 0x70, 0x8a, 0x8d, 0x74, 0x09, 0x53, 0x9e, 0x5e,
+  0x92, 0x56, 0x2b, 0x34, 0x3e, 0x9d, 0x12, 0x0a, 0x54, 0x98, 0xf8, 0x29,
+  0xde, 0xa0, 0xdd, 0x11, 0x46, 0x3e, 0x0f, 0x70, 0xff, 0xee, 0x0d, 0x7c,
+  0x48, 0xe0, 0xe1, 0x6d, 0xb6, 0x5a, 0x2f, 0x7c, 0xb1, 0xb2, 0xf7, 0x2f,
+  0xda, 0x64, 0x33, 0x7e, 0x87, 0x48, 0x48, 0x7e, 0x95, 0x6c, 0xd5, 0x5c,
+  0x26, 0x8f, 0xc9, 0x3e, 0xf9, 0x5e, 0x99, 0x38, 0xf5, 0x32, 0xc2, 0x66,
+  0x55, 0x55, 0x55, 0x35, 0x7f, 0xb1, 0x0f, 0x47, 0xac, 0x5d, 0xec, 0x76,
+  0xba, 0x59, 0xc4, 0x7f, 0xfb, 0xdc, 0x32, 0x46, 0xe8, 0x83, 0xe0, 0x0a,
+  0xf4, 0xb8, 0x56, 0x36, 0x07, 0x4f, 0x7f, 0x29, 0x31, 0xb8, 0xf4, 0x2c,
+  0x7e, 0x42, 0xbd, 0x3e, 0xf1, 0x9d, 0x40, 0x73, 0x51, 0xf1, 0xce, 0x31,
+  0x35, 0x7b, 0x0e, 0x48, 0x9e, 0xb9, 0x6e, 0x3b, 0x37, 0x00, 0x57, 0x0c,
+  0x15, 0x25, 0x74, 0x64, 0xdd, 0x39, 0x64, 0x5c, 0x0a, 0x5d, 0x08, 0x2b,
+  0xf5, 0xe6, 0x0c, 0x3f, 0xe6, 0xce, 0x30, 0x2d, 0x27, 0xc4, 0x07, 0x19,
+  0x82, 0xfb, 0x44, 0x08, 0x7b, 0x94, 0x23, 0x69, 0x55, 0x55, 0x55, 0x35,
+  0xc7, 0xbe, 0xaf, 0x49, 0xa6, 0x9a, 0x26, 0x30, 0x7c, 0xb2, 0x66, 0x35,
+  0xe4, 0x83, 0x46, 0x62, 0xe3, 0x1c, 0x23, 0x07, 0x36, 0x2e, 0xd3, 0x00,
+  0xe2, 0x65, 0xc8, 0x51, 0x0c, 0x09, 0x5c, 0x74, 0x13, 0x94, 0xf9, 0x67,
+  0x4e, 0x07, 0x26, 0x03, 0xba, 0xb4, 0x3a, 0x7f, 0x38, 0xb4, 0x7c, 0x6a,
+  0x44, 0x7a, 0x1c, 0x7b, 0xeb, 0xf9, 0x8b, 0x0b, 0x16, 0xf8, 0x23, 0x36,
+  0x7b, 0x89, 0x79, 0x44, 0x80, 0xfe, 0x33, 0x2a, 0x7d, 0x59, 0xe2, 0x1b,
+  0x7b, 0xe1, 0xb0, 0x15, 0x21, 0xcb, 0x47, 0x77, 0x23, 0x1a, 0xc0, 0x14,
+  0x5b, 0x86, 0x06, 0x2d, 0x55, 0x55, 0x55, 0x35, 0x04, 0xb5, 0x47, 0x27,
+  0x1d, 0xb7, 0x22, 0x44, 0xcc, 0x9e, 0xce, 0x7d, 0xf2, 0x75, 0x78, 0x78,
+  0x7b, 0x98, 0x99, 0x12, 0xbd, 0x34, 0xe4, 0x43, 0xf0, 0x0a, 0x96, 0x43,
+  0xf1, 0x50, 0x1d, 0x0b, 0x86, 0x78, 0xc9, 0x59, 0xc7, 0x78, 0xec, 0x16,
+  0x71, 0xaa, 0x0c, 0x56, 0xbf, 0x92, 0xe2, 0x3a, 0xb5, 0x6e, 0x2d, 0x18,
+  0xe2, 0xc7, 0x31, 0x67, 0x10, 0xab, 0x9f, 0x27, 0x27, 0x1e, 0xf3, 0x69,
+  0xaf, 0x57, 0x42, 0x4c, 0x4f, 0xb4, 0x30, 0x35, 0x00, 0x54, 0xb0, 0x4a,
+  0xa2, 0x00, 0x2a, 0x4a, 0x3d, 0x49, 0x58, 0x73, 0xf9, 0x16, 0xb0, 0x01,
+  0x55, 0x55, 0x55, 0x35, 0xe4, 0xd5, 0x3f, 0x2e, 0xee, 0x84, 0x47, 0x51,
+  0x3f, 0x84, 0xb9, 0x6b, 0x49, 0xb9, 0xae, 0x57, 0x32, 0x5a, 0x04, 0x02,
+  0xe1, 0x6a, 0xf1, 0x4b, 0x30, 0x53, 0xf1, 0x05, 0x29, 0x74, 0x75, 0x76,
+  0x4a, 0x15, 0x5b, 0x5d, 0xe1, 0xaa, 0x15, 0x1b, 0x62, 0xf5, 0xe8, 0x76,
+  0x03, 0xc1, 0xaa, 0x06, 0x13, 0x59, 0xc8, 0x40, 0x84, 0x49, 0xc8, 0x1f,
+  0x85, 0x98, 0x55, 0x6b, 0xed, 0x38, 0x45, 0x17, 0xb8, 0xc7, 0xf7, 0x69,
+  0xc3, 0x87, 0xd0, 0x17, 0x0a, 0x93, 0xb7, 0x35, 0xc2, 0x45, 0x75, 0x34,
+  0x7a, 0x78, 0xff, 0x51, 0x26, 0xd2, 0x59, 0x13, 0x55, 0x55, 0x55, 0x35,
+  0x48, 0x38, 0xf7, 0x6e, 0x4f, 0x7d, 0xc7, 0x70, 0x32, 0x5d, 0x5b, 0x7a,
+  0x85, 0x35, 0x9c, 0x07, 0x40, 0x08, 0x30, 0x5c, 0x64, 0x69, 0x27, 0x7a,
+  0x07, 0x34, 0x90, 0x6c, 0x6e, 0xa6, 0x8e, 0x70, 0xd4, 0xf2, 0xf7, 0x59,
+  0x0f, 0x13, 0x17, 0x5d, 0xa8, 0xa9, 0x01, 0x29, 0xad, 0xfd, 0x9a, 0x77,
+  0x3c, 0x77, 0xc7, 0x67, 0xd0, 0x43, 0xb1, 0x3f, 0x97, 0x76, 0xe4, 0x72,
+  0xd4, 0x82, 0x9a, 0x25, 0xec, 0xef, 0xc3, 0x03, 0xdc, 0xf9, 0x94, 0x3f,
+  0xa4, 0x76, 0x88, 0x5a, 0xb8, 0x0f, 0x03, 0x76, 0x58, 0x87, 0x42, 0x11,
+  0x28, 0xb7, 0xb0, 0x1d, 0x55, 0x55, 0x55, 0x35, 0x2f, 0xe6, 0x44, 0x75,
+  0xf3, 0x0b, 0xe8, 0x68, 0x59, 0x72, 0x1f, 0x16, 0x8c, 0xd0, 0xe3, 0x3c,
+  0xcc, 0xfc, 0x77, 0x05, 0xd6, 0x4b, 0x48, 0x78, 0x51, 0x88, 0x4c, 0x5f,
+  0x30, 0x43, 0x9c, 0x2f, 0x49, 0x72, 0xba, 0x01, 0xba, 0xae, 0xfe, 0x0b,
+  0x94, 0x3f, 0xe7, 0x71, 0x9d, 0xfa, 0x37, 0x06, 0xfc, 0xa2, 0x99, 0x6f,
+  0xe2, 0x0d, 0xcf, 0x4b, 0x63, 0x76, 0xec, 0x49, 0xa8, 0xb5, 0x84, 0x0b,
+  0x84, 0xa3, 0x75, 0x4f, 0x5e, 0x56, 0xdd, 0x37, 0x1a, 0x7d, 0x6e, 0x34,
+  0x95, 0x39, 0x80, 0x1e, 0x58, 0x2e, 0x22, 0x50, 0xd3, 0x46, 0x93, 0x1e,
+  0x55, 0x55, 0x55, 0x35, 0xf5, 0x96, 0x5a, 0x5f, 0x9b, 0xc8, 0x58, 0x50,
+  0x3e, 0x03, 0xab, 0x16, 0xd5, 0xc6, 0x4c, 0x7f, 0x3f, 0x82, 0xf6, 0x34,
+  0x1c, 0x29, 0x22, 0x16, 0x40, 0xdb, 0xe7, 0x71, 0x8b, 0x8a, 0x4b, 0x55,
+  0x45, 0xbf, 0xd1, 0x68, 0x4c, 0xbb, 0xe3, 0x43, 0x1b, 0x96, 0x28, 0x3d,
+  0x36, 0x4f, 0xdb, 0x58, 0xa8, 0x39, 0xac, 0x38, 0xd3, 0xeb, 0x90, 0x18,
+  0x2f, 0xb7, 0x06, 0x1a, 0x5a, 0x82, 0x53, 0x13, 0x77, 0xaf, 0xe0, 0x4d,
+  0x9e, 0xe9, 0x39, 0x79, 0xb7, 0xf6, 0xa2, 0x3c, 0x41, 0x9d, 0x14, 0x59,
+  0x01, 0x33, 0x36, 0x20, 0x15, 0xe0, 0xe4, 0x15, 0x55, 0x55, 0x55, 0x35,
+  0x58, 0x48, 0x07, 0x36, 0x3f, 0x43, 0x1e, 0x05, 0x33, 0x9e, 0x14, 0x45,
+  0x69, 0xc8, 0x16, 0x63, 0x5f, 0xab, 0x77, 0x26, 0xf4, 0x08, 0xb0, 0x2e,
+  0xf8, 0x31, 0x79, 0x29, 0x37, 0xc9, 0x37, 0x28, 0x55, 0x62, 0xcc, 0x43,
+  0xeb, 0x6b, 0xe4, 0x03, 0xfe, 0x82, 0x50, 0x20, 0x2d, 0xdf, 0xf2, 0x7d,
+  0xba, 0x07, 0xe2, 0x0e, 0x88, 0x1e, 0x82, 0x2b, 0x87, 0x54, 0x26, 0x39,
+  0xdd, 0xee, 0x3e, 0x0b, 0xdc, 0xbf, 0x93, 0x1a, 0x8a, 0xce, 0xa6, 0x39,
+  0x5b, 0xaf, 0x8f, 0x00, 0x7a, 0xad, 0x27, 0x71, 0x1e, 0x76, 0xd8, 0x58,
+  0x96, 0x36, 0xa3, 0x14, 0x55, 0x55, 0x55, 0x35, 0x76, 0x27, 0x76, 0x62,
+  0xa4, 0x9f, 0x05, 0x5a, 0x41, 0x28, 0x49, 0x12, 0x24, 0x18, 0x49, 0x12,
+  0x4f, 0xc2, 0xa5, 0x25, 0x0e, 0x0e, 0x3c, 0x3c, 0x01, 0xa7, 0x65, 0x00,
+  0x92, 0x9e, 0x17, 0x36, 0xa1, 0x7a, 0x92, 0x27, 0xcf, 0x74, 0xba, 0x4d,
+  0xcb, 0x6f, 0x66, 0x68, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
+  0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
+  0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
+  0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32
+    };
+} // namespace poseidon_constants
+#endif
--- a/icicle/include/poseidon/kernels.cuh
+++ b/icicle/include/poseidon/kernels.cuh
@@ -1,9 +1,13 @@
-#include "poseidon/poseidon.cuh"
+#pragma once
+#ifndef POSEIDON_KERNELS_H
+#define POSEIDON_KERNELS_H
+
 #include "gpu-utils/modifiers.cuh"
+#include "poseidon/constants.cuh"

 namespace poseidon {
  template <typename S, int T>
-  __global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
+  __global__ void prepare_poseidon_states(const S* input, S* states, unsigned int number_of_states, const S domain_tag)
  {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int state_number = idx / T;
@@ -16,27 +20,27 @@ namespace poseidon {
    if (element_number == 0) {
      prepared_element = domain_tag;
    } else {
-      if (aligned) {
-        prepared_element = states[idx];
-      } else {
-        prepared_element = states[idx - 1];
-      }
+      prepared_element = input[idx - state_number - 1];
    }

-    // We need __syncthreads here if the state is not aligned
-    // because then we need to shift the vector [A, B, 0] -> [D, A, B]
-    if (!aligned) { __syncthreads(); }
-
    // Store element in state
    states[idx] = prepared_element;
  }

  template <typename S>
-  DEVICE_INLINE S sbox_alpha_five(S element)
+  DEVICE_INLINE S sbox_el(S element, const int alpha)
  {
-    S result = S::sqr(element);
-    result = S::sqr(result);
-    return result * element;
+    // S-box x -> x^alpha for alpha in {3, 5, 7, 11}; every branch reuses the square x^2.
+    S result2 = S::sqr(element);
+    if (alpha == 3) { return result2 * element; }                            // x^2 * x
+    if (alpha == 5) { return S::sqr(result2) * element; }                    // x^4 * x
+    if (alpha == 7) { return S::sqr(result2) * result2 * element; }          // x^4 * x^2 * x
+    if (alpha == 11) { return S::sqr(S::sqr(result2)) * result2 * element; } // x^8 * x^2 * x
+    // Any other exponent is unsupported. The previous switch had no default, so control
+    // flowed off the end of a non-void function (undefined behaviour). Hand the element
+    // back unchanged so the result is at least well defined.
+    // NOTE(review): callers should reject unsupported alpha values before launching kernels.
+    return element;
  }

  template <typename S, int T>
@@ -71,7 +75,7 @@ namespace poseidon {
      element = element + constants.round_constants[rc_offset + element_number];
      rc_offset += T;
    }
-    element = sbox_alpha_five(element);
+    element = sbox_el(element, constants.alpha);
    if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }

    // Multiply all the states by mds matrix
@@ -111,7 +115,7 @@ namespace poseidon {
  __device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
  {
    S element = state[0];
-    element = sbox_alpha_five(element);
+    element = sbox_el(element, constants.alpha);
    element = element + constants.round_constants[rc_offset];

    S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];
@@ -155,22 +159,58 @@ namespace poseidon {
    }
  }

-  // These function is just doing copy from the states to the output
  template <typename S, int T>
-  __global__ void get_hash_results(S* states, size_t number_of_states, S* out)
+  __global__ void // each thread handles the state whose index equals its global thread index
+  squeeze_states_kernel(const S* states, unsigned int number_of_states, unsigned int rate, unsigned int offset, S* out)
  {
    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (idx >= number_of_states) { return; }

-    out[idx] = states[idx * T + 1];
+    for (int i = 0; i < rate; i++) { // copy `rate` consecutive elements of this thread's state
+      out[idx * rate + i] = states[idx * T + offset + i]; // states are T elements apart; reading starts at `offset`
+    }
  }

  template <typename S, int T>
-  __global__ void copy_recursive(S* state, size_t number_of_states, S* out)
+  cudaError_t poseidon_permutation_kernel( // host-side launcher (returns cudaError_t); it is not a kernel itself
+    const S* input,                // read by prepare_poseidon_states when filling the states
+    S* out,                        // receives output_len elements per state
+    unsigned int number_of_states, // number of width-T states to process
+    unsigned int input_len,        // not referenced anywhere in this function body
+    unsigned int output_len,       // passed to squeeze_states_kernel as its `rate`
+    const PoseidonConstants<S>& constants,
+    cudaStream_t& stream)
  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) { return; }
+    S* states; // scratch buffer of number_of_states * T elements, allocated and freed on `stream`
+    CHK_IF_RETURN(cudaMallocAsync(&states, number_of_states * T * sizeof(S), stream));

-    state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
+    prepare_poseidon_states<S, T>
+      <<<PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T), 0, stream>>>(
+        input, states, number_of_states, constants.domain_tag);
+    // rc_offset is the round-constant offset handed to each launch; it is advanced after every phase.
+    size_t rc_offset = 0;
+    full_rounds<S, T><<<
+      PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T),
+      sizeof(S) * PKC::hashes_per_block(T) * T, stream>>>(
+      states, number_of_states, rc_offset, FIRST_FULL_ROUNDS, constants);
+    rc_offset += T * (constants.full_rounds_half + 1);
+    // Partial rounds use the single-hash launch configuration and advance rc_offset by constants.partial_rounds.
+    partial_rounds<S, T><<<PKC::number_of_singlehash_blocks(number_of_states), PKC::singlehash_block_size, 0, stream>>>(
+      states, number_of_states, rc_offset, constants);
+    rc_offset += constants.partial_rounds;
+    // Second full-round phase, launched with the same grid and shared-memory size as the first.
+    full_rounds<S, T><<<
+      PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T),
+      sizeof(S) * PKC::hashes_per_block(T) * T, stream>>>(
+      states, number_of_states, rc_offset, SECOND_FULL_ROUNDS, constants);
+    // Copy output_len elements per state into `out`, starting at state index 1 (rate = output_len, offset = 1).
+    squeeze_states_kernel<S, T>
+      <<<PKC::number_of_singlehash_blocks(number_of_states), PKC::singlehash_block_size, 0, stream>>>(
+        states, number_of_states, output_len, 1, out);
+    // Release the scratch buffer on the same stream the kernels were queued on.
+    CHK_IF_RETURN(cudaFreeAsync(states, stream));
+    return CHK_LAST();
  }
-} // namespace poseidon
+} // namespace poseidon
+
+#endif
--- a/icicle/include/poseidon/poseidon.cuh
+++ b/icicle/include/poseidon/poseidon.cuh
@@ -8,132 +8,87 @@
 #include "gpu-utils/error_handler.cuh"
 #include "utils/utils.h"

+#include "poseidon/kernels.cuh"
+#include "poseidon/constants.cuh"
+#include "hash/hash.cuh"
+using namespace hash;
+
 /**
 * @namespace poseidon
 * Implementation of the [Poseidon hash function](https://eprint.iacr.org/2019/458.pdf)
 * Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
 */
 namespace poseidon {
-#define FIRST_FULL_ROUNDS  true
-#define SECOND_FULL_ROUNDS false
-
-  /**
-   * For most of the Poseidon configurations this is the case
-   * TODO: Add support for different full rounds numbers
-   */
-  const int FULL_ROUNDS_DEFAULT = 4;
-
-  /**
-   * @struct PoseidonConstants
-   * This constants are enough to define a Poseidon instantce
-   * @param round_constants A pointer to round constants allocated on the device
-   * @param mds_matrix A pointer to an mds matrix allocated on the device
-   * @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
-   * @param sparse_matrices A pointer to sparse matrices allocated on the device
-   */
  template <typename S>
-  struct PoseidonConstants {
-    int arity;
-    int partial_rounds;
-    int full_rounds_half;
-    S* round_constants = nullptr;
-    S* mds_matrix = nullptr;
-    S* non_sparse_matrix = nullptr;
-    S* sparse_matrices = nullptr;
-    S domain_tag;
-  };
-
-  /**
-   * @class PoseidonKernelsConfiguration
-   * Describes the logic of deriving CUDA kernels parameters
-   * such as the number of threads and the number of blocks
-   */
-  template <int T>
-  class PoseidonKernelsConfiguration
+  class Poseidon : public SpongeHasher<S, S>
  {
  public:
-    // The logic behind this is that 1 thread only works on 1 element
-    // We have {T} elements in each state, and {number_of_states} states total
-    static const int number_of_threads = 256 / T * T;
+    const std::size_t device_id;
+    PoseidonConstants<S> constants;

-    // The partial rounds operates on the whole state, so we define
-    // the parallelism params for processing a single hash preimage per thread
-    static const int singlehash_block_size = 128;
-
-    static const int hashes_per_block = number_of_threads / T;
-
-    static int number_of_full_blocks(size_t number_of_states)
+    cudaError_t run_hash_many_kernel( // dispatches the runtime width to a compile-time poseidon_permutation_kernel<S, T>
+      const S* input,
+      S* output,
+      unsigned int number_of_states,
+      unsigned int input_len,
+      unsigned int output_len,
+      const device_context::DeviceContext& ctx) const override
    {
-      int total_number_of_threads = number_of_states * T;
-      return total_number_of_threads / number_of_threads +
-             static_cast<bool>(total_number_of_threads % number_of_threads);
+      cudaError_t permutation_error; // assigned by whichever P_PERM_T case matches this->width
+#define P_PERM_T(width)                                                                                                \
+  case width:                                                                                                          \
+    permutation_error = poseidon_permutation_kernel<S, width>(                                                         \
+      input, output, number_of_states, input_len, output_len, this->constants, ctx.stream);                            \
+    break;
+      // Only the widths listed below have an instantiation; any other width raises InvalidArgument.
+      switch (this->width) {
+        P_PERM_T(3)
+        P_PERM_T(5)
+        P_PERM_T(9)
+        P_PERM_T(12)
+      default:
+        THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [3, 5, 9, 12]");
+      }
+#undef P_PERM_T
+      CHK_IF_RETURN(permutation_error);
+      return CHK_LAST();
    }

-    static int number_of_singlehash_blocks(size_t number_of_states)
+    Poseidon( // builds a hasher from caller-supplied parameters and constant tables
+      unsigned int arity,
+      unsigned int alpha,
+      unsigned int partial_rounds,
+      unsigned int full_rounds_half,
+      const S* round_constants,
+      const S* mds_matrix,
+      const S* non_sparse_matrix,
+      const S* sparse_matrices,
+      const S domain_tag,
+      device_context::DeviceContext& ctx) // ctx.device_id is stored; the destructor reuses it when releasing
+        : SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
    {
-      return number_of_states / singlehash_block_size + static_cast<bool>(number_of_states % singlehash_block_size);
+      PoseidonConstants<S> constants; // local that shadows the member of the same name; filled in by the call below
+      CHK_STICKY(create_optimized_poseidon_constants(
+        arity, alpha, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
+        domain_tag, &constants, ctx));
+      this->constants = constants; // copy the populated struct into the member
+    }
+
+    Poseidon(int arity, device_context::DeviceContext& ctx) // obtains constants via init_optimized_poseidon_constants
+        : SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
+    {
+      PoseidonConstants<S> constants{}; // value-initialised local; shadows the member of the same name
+      CHK_STICKY(init_optimized_poseidon_constants(arity, ctx, &constants));
+      this->constants = constants; // copy the populated struct into the member
+    }
+
+    ~Poseidon() // NOTE(review): no copy/move control is declared in the visible class; confirm copies cannot release twice
+    {
+      auto ctx = device_context::get_default_device_context(); // start from the default context...
+      ctx.device_id = this->device_id;                         // ...but target the device recorded at construction
+      CHK_STICKY(release_optimized_poseidon_constants<S>(&this->constants, ctx));
    }
  };
-
-  template <int T>
-  using PKC = PoseidonKernelsConfiguration<T>;
-
-  /**
-   * @struct PoseidonConfig
-   * Struct that encodes various Poseidon parameters.
-   */
-  struct PoseidonConfig {
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
-    bool are_inputs_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
-    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
-    bool input_is_a_state;      /**< If true, input is considered to be a states vector, holding the preimages
-                                 * in aligned or not aligned format. Memory under the input pointer will be used for states
-                                 * If false, fresh states memory will be allocated and input will be copied into it */
-    bool aligned;               /**< If true - input should be already aligned for poseidon permutation.
-                                 * Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
-                                 * not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D) */
-    bool loop_state;            /**< If true, hash results will also be copied in the input pointer in aligned format */
-    bool is_async; /**< Whether to run the Poseidon asynchronously. If set to `true`, the poseidon_hash function will be
-                    *   non-blocking and you'd need to synchronize it explicitly by running
-                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the poseidon_hash
-                    *   function will block the current CPU thread. */
-  };
-
-  static PoseidonConfig default_poseidon_config(
-    int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
-  {
-    PoseidonConfig config = {
-      ctx,   // ctx
-      false, // are_inputes_on_device
-      false, // are_outputs_on_device
-      false, // input_is_a_state
-      false, // aligned
-      false, // loop_state
-      false, // is_async
-    };
-    return config;
-  }
-
-  /**
-   * Loads pre-calculated optimized constants, moves them to the device
-   */
-  template <typename S>
-  cudaError_t
-  init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
-
-  /**
-   * Compute the poseidon hash over a sequence of preimages.
-   * Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
-   * @param T size of the poseidon state, should be equal to {arity + 1}
-   * @param input a pointer to the input data. May be allocated on device or on host, regulated
-   * by the config. May point to a string of preimages or a string of states filled with preimages.
-   * @param output a pointer to the output data. May be allocated on device or on host, regulated
-   * by the config. Must be at least of size [number_of_states](@ref number_of_states)
-   * @param number_of_states number of input blocks of size T-1 (arity)
-   */
-  template <typename S, int T>
-  cudaError_t poseidon_hash(
-    S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config);
 } // namespace poseidon

 #endif
--- a/icicle/include/poseidon/tree/merkle.cuh
+++ b/icicle/include/poseidon/tree/merkle.cuh
@@ -1,74 +0,0 @@
-#pragma once
-#ifndef MERKLE_H
-#define MERKLE_H
-
-#include "gpu-utils/device_context.cuh"
-#include "gpu-utils/error_handler.cuh"
-#include "utils/utils.h"
-#include "poseidon/poseidon.cuh"
-
-#include <iostream>
-#include <math.h>
-
-using namespace poseidon;
-
-/**
- * @namespace merkle
- * Implementation of the [Poseidon](@ref poseidon) [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree) builder,
- * parallelized for the use on GPU
- */
-namespace merkle {
-  static constexpr size_t GIGA = 1024 * 1024 * 1024;
-
-  /// Bytes per stream
-  static constexpr size_t STREAM_CHUNK_SIZE = 1024 * 1024 * 1024;
-
-  /**
-   * @struct TreeBuilderConfig
-   * Struct that encodes various Tree builder parameters.
-   */
-  struct TreeBuilderConfig {
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
-    int keep_rows; /**< How many rows of the Merkle tree rows should be written to output. '0' means all of them */
-    bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
-    bool is_async; /**< Whether to run the tree builder asynchronously. If set to `true`, the build_merkle_tree
-                    *   function will be non-blocking and you'd need to synchronize it explicitly by running
-                    *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
-                    *   function will block the current CPU thread. */
-  };
-
-  static TreeBuilderConfig
-  default_merkle_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
-  {
-    TreeBuilderConfig config = {
-      ctx,   // ctx
-      0,     // keep_rows
-      false, // are_inputes_on_device
-      false, // is_async
-    };
-    return config;
-  }
-
-  /**
-   * Builds the Poseidon Merkle tree
-   *
-   * @param leaves a pointer to the leaves layer. May be allocated on device or on host, regulated by the config
-   * Expected to have arity ^ (height - 1) elements
-   * @param digests a pointer to the digests storage. May only be allocated on the host
-   * Expected to have `sum(arity ^ (i)) for i in [0..height-1]`
-   * @param height the height of the merkle tree
-   * # Algorithm
-   * The function will split large tree into many subtrees of size that will fit `STREAM_CHUNK_SIZE`.
-   * Each subtree is build in it's own stream (there is a maximum number of streams)
-   * After all subtrees are constructed - the function will combine the resulting sub-digests into the final top-tree
-   */
-  template <typename S, int T>
-  cudaError_t build_merkle_tree(
-    const S* leaves,
-    S* digests,
-    uint32_t height,
-    const PoseidonConstants<S>& poseidon,
-    const TreeBuilderConfig& config);
-} // namespace merkle
-
-#endif
--- a/icicle/include/poseidon2/constants.cuh
+++ b/icicle/include/poseidon2/constants.cuh
@@ -0,0 +1,65 @@
+#pragma once
+#ifndef POSEIDON2_CONSTANTS_H
+#define POSEIDON2_CONSTANTS_H
+
+#include "gpu-utils/device_context.cuh"
+
+namespace poseidon2 {
+  /**
+   * For most of the Poseidon2 configurations this is the case
+   */
+  const int EXTERNAL_ROUNDS_DEFAULT = 8;
+
+  enum DiffusionStrategy {
+    DEFAULT_DIFFUSION,
+    MONTGOMERY,
+  };
+
+  enum MdsType { DEFAULT_MDS, PLONKY };
+
+  /**
+   * @struct Poseidon2Constants
+   * These constants are enough to define a Poseidon2 instance
+   * @param round_constants A pointer to round constants allocated on the device
+   * @param internal_matrix_diag A pointer to the internal matrix diagonal (presumably device memory too; confirm)
+   * @param mds_type Selects the light MDS variant (DEFAULT_MDS or PLONKY)
+   * @param diffusion Selects the diffusion strategy (DEFAULT_DIFFUSION or MONTGOMERY)
+   */
+  template <typename S>
+  struct Poseidon2Constants {
+    int width; // NOTE(review): presumably the state size T the kernels are instantiated with; confirm
+    int alpha; // NOTE(review): presumably the exponent handed to the S-box (sbox_el); confirm
+    int internal_rounds;
+    int external_rounds;
+    S* round_constants = nullptr;
+    S* internal_matrix_diag = nullptr;
+    MdsType mds_type;
+    DiffusionStrategy diffusion;
+  };
+
+  template <typename S>
+  cudaError_t create_poseidon2_constants(
+    int width,
+    int alpha,
+    int internal_rounds,
+    int external_rounds,
+    const S* round_constants,
+    const S* internal_matrix_diag,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* poseidon_constants);
+
+  template <typename S>
+  cudaError_t init_poseidon2_constants(
+    int width,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* poseidon2_constants);
+
+  template <typename S>
+  cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx);
+} // namespace poseidon2
+
+#endif
--- a/icicle/include/poseidon2/constants/m31_poseidon2.h
+++ b/icicle/include/poseidon2/constants/m31_poseidon2.h
--- a/icicle/include/poseidon2/constants/poseidon2_rust_params.sage
+++ b/icicle/include/poseidon2/constants/poseidon2_rust_params.sage
@@ -3,13 +3,14 @@ from sage.rings.polynomial.polynomial_gf2x import GF2X_BuildIrred_list
 from math import *
 import itertools

-CURVE_NAME = "bn254"
+CURVE_NAME = "m31"

 ###########################################################################
 # p = 18446744069414584321 # GoldiLocks
 # p = 2013265921 # BabyBear
+p = 2**31 - 1 # M31
 # p = 52435875175126190479447740508185965837690552500527637822603658699938581184513 # BLS12-381
-p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
+# p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
 # p = 28948022309329048855892746252171976963363056481941560715954676764349967630337 # Pasta (Pallas)
 # p = 28948022309329048855892746252171976963363056481941647379679742748393362948097 # Pasta (Vesta)

@@ -617,6 +618,8 @@ print(f"namespace poseidon2_constants_{CURVE_NAME} {{")
 for t in TS:
    NUM_CELLS = t
    R_F_FIXED, R_P_FIXED, _, _ = poseidon_calc_final_numbers_fixed(p, t, alpha, 128, True)
+    if t == 16:
+        R_P_FIXED = 14

    INIT_SEQUENCE = []

--- a/icicle/include/poseidon2/kernels.cuh
+++ b/icicle/include/poseidon2/kernels.cuh
@@ -1,7 +1,28 @@
-#include "poseidon/poseidon.cuh"
+#pragma once
+#ifndef POSEIDON2_KERNELS_H
+#define POSEIDON2_KERNELS_H
+
+#include "utils/utils.h"
+#include "hash/hash.cuh"
+#include "matrix/matrix.cuh"
+#include "poseidon2/constants.cuh"
 #include "gpu-utils/modifiers.cuh"

+using matrix::Matrix;
+
 namespace poseidon2 {
+  static DEVICE_INLINE unsigned int d_next_pow_of_two(unsigned int v)
+  {
+    // Rounds v up to a power of two; exact powers map to themselves and 0 wraps around to 0.
+    // The highest set bit of v - 1 is copied into every lower bit, then one is added.
+    // The shift sequence 1, 2, 4, 8, 16 covers a 32-bit unsigned int.
+    unsigned int bits = v - 1;
+    for (unsigned int shift = 1; shift <= 16; shift *= 2) {
+      bits |= bits >> shift;
+    }
+    return bits + 1;
+  }
+
  template <typename S>
  DEVICE_INLINE S sbox_el(S element, const int alpha)
  {
@@ -19,7 +40,7 @@ namespace poseidon2 {
  }

  template <typename S, int T>
-  DEVICE_INLINE S sbox(S state[T], const int alpha)
+  DEVICE_INLINE void sbox(S state[T], const int alpha)
  {
    for (int i = 0; i < T; i++) {
      state[i] = sbox_el(state[i], alpha);
@@ -27,7 +48,7 @@ namespace poseidon2 {
  }

  template <typename S, int T>
-  DEVICE_INLINE S add_rc(S state[T], size_t rc_offset, const S* rc)
+  DEVICE_INLINE void add_rc(S state[T], size_t rc_offset, const S* rc)
  {
    for (int i = 0; i < T; i++) {
      state[i] = state[i] + rc[rc_offset + i];
@@ -35,7 +56,7 @@ namespace poseidon2 {
  }

  template <typename S>
-  __device__ S mds_light_4x4(S s[4])
+  __device__ void mds_light_4x4(S s[4])
  {
    S t0 = s[0] + s[1];
    S t1 = s[2] + s[3];
@@ -56,7 +77,7 @@ namespace poseidon2 {
  // [ 3 1 1 2 ].
  // https://github.com/Plonky3/Plonky3/blob/main/poseidon2/src/matrix.rs#L36
  template <typename S>
-  __device__ S mds_light_plonky_4x4(S s[4])
+  __device__ void mds_light_plonky_4x4(S s[4])
  {
    S t01 = s[0] + s[1];
    S t23 = s[2] + s[3];
@@ -70,7 +91,7 @@ namespace poseidon2 {
  }

  template <typename S, int T>
-  __device__ S mds_light(S state[T], MdsType mds)
+  __device__ void mds_light(S state[T], MdsType mds)
  {
    S sum;
    switch (T) {
@@ -123,7 +144,7 @@ namespace poseidon2 {
  }

  template <typename S, int T>
-  __device__ S internal_round(S state[T], size_t rc_offset, const Poseidon2Constants<S>& constants)
+  __device__ void internal_round(S state[T], size_t rc_offset, const Poseidon2Constants<S>& constants)
  {
    S element = state[0];
    element = element + constants.round_constants[rc_offset];
@@ -176,17 +197,8 @@ namespace poseidon2 {
  }

  template <typename S, int T>
-  __global__ void poseidon2_permutation_kernel(
-    const S* states, S* states_out, size_t number_of_states, const Poseidon2Constants<S> constants)
+  __device__ void permute_state(S state[T], const Poseidon2Constants<S>& constants)
  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (idx >= number_of_states) { return; }
-
-    S state[T];
-    UNROLL
-    for (int i = 0; i < T; i++) {
-      state[i] = states[idx * T + i];
-    }
    unsigned int rn;

    mds_light<S, T>(state, constants.mds_type);
@@ -213,6 +225,22 @@ namespace poseidon2 {
      mds_light<S, T>(state, constants.mds_type);
      rc_offset += T;
    }
+  }
+
+  template <typename S, int T>
+  __global__ void permutation_kernel(
+    const S* states, S* states_out, unsigned int number_of_states, const Poseidon2Constants<S> constants)
+  {
+    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (idx >= number_of_states) { return; }
+
+    S state[T];
+    UNROLL
+    for (int i = 0; i < T; i++) {
+      state[i] = states[idx * T + i];
+    }
+
+    permute_state<S, T>(state, constants);

    UNROLL
    for (int i = 0; i < T; i++) {
@@ -220,13 +248,120 @@ namespace poseidon2 {
    }
  }

-  // These function is just doing copy from the states to the output
  template <typename S, int T>
-  __global__ void get_hash_results(const S* states, size_t number_of_states, int index, S* out)
+  __global__ void hash_many_kernel( // one thread per state: load input_len elements, permute, emit output_len elements
+    const S* input,
+    S* output,
+    uint64_t number_of_states,
+    unsigned int input_len,
+    unsigned int output_len,
+    const Poseidon2Constants<S> constants)
  {
-    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    uint64_t idx = (static_cast<uint64_t>(blockIdx.x) * blockDim.x) + threadIdx.x; // widen before multiplying
    if (idx >= number_of_states) { return; }

-    out[idx] = states[idx * T + index];
+    S state[T] = {0};
+    UNROLL
+    for (int i = 0; i < input_len; i++) {
+      state[i] = input[idx * input_len + i];
+    }
+    // Slots from input_len up to T - 1 keep their initial value; nothing here checks input_len <= T.
+    permute_state<S, T>(state, constants);
+    // The first output_len state elements are written out; nothing here checks output_len <= T.
+    UNROLL
+    for (int i = 0; i < output_len; i++) {
+      output[idx * output_len + i] = state[i];
+    }
  }
-} // namespace poseidon2
+
+  template <typename S, int T>
+  __device__ void absorb_2d_state(
+    const Matrix<S>* inputs,
+    S state[T],
+    unsigned int number_of_inputs,
+    unsigned int rate,
+    uint64_t row_idx,
+    const Poseidon2Constants<S>& constants)
+  {
+    // Row `row_idx` of every matrix is absorbed back to back; the state is permuted after each `rate` elements.
+    unsigned int filled = 0; // elements written into the state since the last permutation
+    for (unsigned int m = 0; m < number_of_inputs; m++) {
+      const Matrix<S>& mat = inputs[m];
+      for (int col = 0; col < mat.width; col++) {
+        state[filled++] = mat.values[row_idx * mat.width + col];
+        if (filled != rate) { continue; }
+        permute_state<S, T>(state, constants);
+        filled = 0;
+      }
+    }
+
+    // A trailing partial block is permuted as is; slots past `filled` keep their current contents.
+    if (filled != 0) { permute_state<S, T>(state, constants); }
+  }
+
+  template <typename S, int T>
+  __global__ void hash_2d_kernel(
+    const Matrix<S>* inputs,
+    S* output,
+    unsigned int number_of_inputs,
+    unsigned int rate,
+    unsigned int output_len,
+    const Poseidon2Constants<S> constants)
+  {
+    uint64_t idx = (static_cast<uint64_t>(blockIdx.x) * blockDim.x) + threadIdx.x; // widen before multiplying
+    if (idx >= inputs[0].height) { return; }
+    // One thread per row; the row count is taken from the first matrix only.
+    S state[T] = {0};
+    // Absorb row `idx` of every input matrix into the state (permuting as it fills).
+    absorb_2d_state<S, T>(inputs, state, number_of_inputs, rate, idx, constants);
+
+    UNROLL
+    for (int i = 0; i < output_len; i++) {
+      output[idx * output_len + i] = state[i];
+    }
+  }
+
+  template <typename S, int T>
+  __global__ void compress_and_inject_kernel(
+    const Matrix<S>* matrices_to_inject,
+    unsigned int number_of_inputs,
+    const S* prev_layer,
+    S* next_layer,
+    unsigned int rate,
+    unsigned int digest_elements,
+    const Poseidon2Constants<S> constants)
+  {
+    uint64_t idx = (static_cast<uint64_t>(blockIdx.x) * blockDim.x) + threadIdx.x; // 64-bit so the offsets below cannot wrap
+    uint64_t number_of_rows = d_next_pow_of_two(matrices_to_inject[0].height);
+    if (idx >= number_of_rows) { return; }
+    // Rows run up to the next power of two of the first matrix's height; only rows below that height inject.
+    size_t next_layer_len = matrices_to_inject[0].height;
+    S state_to_compress[T] = {S::zero()};
+    // Compress: load 2 * digest_elements consecutive values of prev_layer for this row and permute them.
+    for (int i = 0; i < digest_elements * 2; i++) {
+      state_to_compress[i] = prev_layer[idx * 2 * digest_elements + i];
+    }
+    permute_state<S, T>(state_to_compress, constants);
+    // Inject: combine the compressed digest with the digest of row `idx` of the injected matrices, if any.
+    S injected_state[T] = {S::zero()};
+    if (idx < next_layer_len) {
+      absorb_2d_state<S, T>(matrices_to_inject, injected_state, number_of_inputs, rate, idx, constants);
+      // Move the absorbed digest into the second slot and put the compressed digest first.
+      for (int i = 0; i < digest_elements; i++) {
+        injected_state[digest_elements + i] = injected_state[i];
+        injected_state[i] = state_to_compress[i];
+      }
+    } else {
+      for (int i = 0; i < digest_elements; i++) {
+        injected_state[i] = state_to_compress[i];
+      }
+    }
+    permute_state<S, T>(injected_state, constants);
+    // The first digest_elements of the final state become this row's entry in next_layer.
+    for (int i = 0; i < digest_elements; i++) {
+      next_layer[idx * digest_elements + i] = injected_state[i];
+    }
+  }
+} // namespace poseidon2
+
+#endif
--- a/icicle/include/poseidon2/poseidon2.cuh
+++ b/icicle/include/poseidon2/poseidon2.cuh
@@ -8,124 +8,172 @@
 #include "gpu-utils/error_handler.cuh"
 #include "utils/utils.h"

+#include "hash/hash.cuh"
+#include "matrix/matrix.cuh"
+
+#include "poseidon2/constants.cuh"
+#include "poseidon2/kernels.cuh"
+
+using matrix::Matrix;
+
 /**
 * @namespace poseidon2
 * Implementation of the [Poseidon2 hash function](https://eprint.iacr.org/2023/323.pdf)
 * Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
 */
 namespace poseidon2 {
-  /**
-   * For most of the Poseidon2 configurations this is the case
-   */
-  const int EXTERNAL_ROUNDS_DEFAULT = 8;
-
-  enum DiffusionStrategy {
-    DEFAULT_DIFFUSION,
-    MONTGOMERY,
-  };
-
-  enum MdsType { DEFAULT_MDS, PLONKY };
-
-  enum PoseidonMode {
-    COMPRESSION,
-    PERMUTATION,
-  };
-
-  /**
-   * @struct Poseidon2Constants
-   * This constants are enough to define a Poseidon2 instantce
-   * @param round_constants A pointer to round constants allocated on the device
-   * @param mds_matrix A pointer to an mds matrix allocated on the device
-   * @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
-   * @param sparse_matrices A pointer to sparse matrices allocated on the device
-   */
  template <typename S>
-  struct Poseidon2Constants {
-    int width;
-    int alpha;
-    int internal_rounds;
-    int external_rounds;
-    S* round_constants = nullptr;
-    S* internal_matrix_diag = nullptr;
-    MdsType mds_type;
-    DiffusionStrategy diffusion;
-  };
-
-  /**
-   * @struct Poseidon2Config
-   * Struct that encodes various Poseidon2 parameters.
-   */
-  struct Poseidon2Config {
-    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
-    bool are_states_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
-    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
-    PoseidonMode mode;
-    int output_index;
-    bool
-      is_async; /**< Whether to run the Poseidon2 asynchronously. If set to `true`, the poseidon_hash function will be
-                 *   non-blocking and you'd need to synchronize it explicitly by running
-                 *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the poseidon_hash
-                 *   function will block the current CPU thread. */
-  };
-
-  static Poseidon2Config default_poseidon2_config(
-    int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  class Poseidon2 : public hash::SpongeHasher<S, S>
  {
-    Poseidon2Config config = {
-      ctx,   // ctx
-      false, // are_states_on_device
-      false, // are_outputs_on_device
-      PoseidonMode::COMPRESSION,
-      1,     // output_index
-      false, // is_async
-    };
-    return config;
-  }
+    static const int POSEIDON_BLOCK_SIZE = 32;

-  template <typename S>
-  cudaError_t create_poseidon2_constants(
-    int width,
-    int alpha,
-    int internal_rounds,
-    int external_rounds,
-    const S* round_constants,
-    const S* internal_matrix_diag,
-    MdsType mds_type,
-    DiffusionStrategy diffusion,
-    device_context::DeviceContext& ctx,
-    Poseidon2Constants<S>* poseidon_constants);
+    /**
+     * Number of blocks of POSEIDON_BLOCK_SIZE threads needed to cover `number_of_states`
+     * threads, i.e. the quotient rounded up.
+     */
+    static inline int poseidon_number_of_blocks(size_t number_of_states)
+    {
+      const size_t full_blocks = number_of_states / POSEIDON_BLOCK_SIZE;
+      // One extra, partially filled block when the division leaves a remainder.
+      const bool has_partial_block = (number_of_states % POSEIDON_BLOCK_SIZE) != 0;
+      return full_blocks + has_partial_block;
+    }

-  /**
-   * Loads pre-calculated optimized constants, moves them to the device
-   */
-  template <typename S>
-  cudaError_t init_poseidon2_constants(
-    int width,
-    MdsType mds_type,
-    DiffusionStrategy diffusion,
-    device_context::DeviceContext& ctx,
-    Poseidon2Constants<S>* constants);
+  public:
+    const std::size_t device_id;
+    Poseidon2Constants<S> constants;

-  template <typename S>
-  cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx);
+    /**
+     * Launches `hash_2d_kernel` for the compile-time width matching `this->width`.
+     *
+     * The launch uses `poseidon_number_of_blocks(number_of_rows)` blocks of POSEIDON_BLOCK_SIZE
+     * threads on `ctx.stream`; this function does not synchronize the stream itself.
+     *
+     * @param inputs           Matrices to hash; the kernel dereferences this pointer, so it must
+     *                         be device-accessible.
+     * @param output           Destination buffer; the kernel writes `output_len` values per row.
+     * @param number_of_inputs Forwarded to the kernel.
+     * @param output_len       Number of state elements written per row.
+     * @param number_of_rows   Row count used to size the grid.
+     * @param ctx              Device context supplying the stream.
+     * @return The status reported by `CHK_LAST()` after the launch. An unsupported width is
+     *         reported through `THROW_ICICLE_ERR`.
+     */
+    cudaError_t hash_2d(
+      const Matrix<S>* inputs,
+      S* output,
+      unsigned int number_of_inputs,
+      unsigned int output_len,
+      uint64_t number_of_rows,
+      const device_context::DeviceContext& ctx) const override
+    {
+#define P2_HASH_2D_T(width)                                                                                            \
+  case width:                                                                                                          \
+    hash_2d_kernel<S, width><<<poseidon_number_of_blocks(number_of_rows), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>(       \
+      inputs, output, number_of_inputs, this->rate, output_len, this->constants);                                      \
+    break;
+
+      switch (this->width) {
+        P2_HASH_2D_T(2)
+        P2_HASH_2D_T(3)
+        P2_HASH_2D_T(4)
+        P2_HASH_2D_T(8)
+        P2_HASH_2D_T(12)
+        P2_HASH_2D_T(16)
+        P2_HASH_2D_T(20)
+        P2_HASH_2D_T(24)
+      default:
+        THROW_ICICLE_ERR(
+          IcicleError_t::InvalidArgument, "PoseidonAbsorb2d: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+      }
+// The dispatch macro is local to this function; do not leak it to includers of this header.
+#undef P2_HASH_2D_T
+
+      CHK_IF_RETURN(cudaPeekAtLastError());
+      return CHK_LAST();
+    }
+
+    /**
+     * Launches `hash_many_kernel` for the compile-time width matching `this->width`.
+     *
+     * The launch uses `poseidon_number_of_blocks(number_of_states)` blocks of
+     * POSEIDON_BLOCK_SIZE threads on `ctx.stream`; this function does not synchronize the
+     * stream itself.
+     *
+     * @param input            Input buffer handed to the kernel.
+     * @param output           Output buffer handed to the kernel.
+     * @param number_of_states Number of states to process; also sizes the grid.
+     * @param input_len        Forwarded to the kernel.
+     * @param output_len       Forwarded to the kernel.
+     * @param ctx              Device context supplying the stream.
+     * @return The status reported by `CHK_LAST()` after the launch. An unsupported width is
+     *         reported through `THROW_ICICLE_ERR`.
+     */
+    cudaError_t run_hash_many_kernel(
+      const S* input,
+      S* output,
+      unsigned int number_of_states,
+      unsigned int input_len,
+      unsigned int output_len,
+      const device_context::DeviceContext& ctx) const override
+    {
+#define P2_HASH_MANY_T(width)                                                                                          \
+  case width:                                                                                                          \
+    hash_many_kernel<S, width><<<poseidon_number_of_blocks(number_of_states), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>(   \
+      input, output, number_of_states, input_len, output_len, this->constants);                                        \
+    break;
+
+      switch (this->width) {
+        P2_HASH_MANY_T(2)
+        P2_HASH_MANY_T(3)
+        P2_HASH_MANY_T(4)
+        P2_HASH_MANY_T(8)
+        P2_HASH_MANY_T(12)
+        P2_HASH_MANY_T(16)
+        P2_HASH_MANY_T(20)
+        P2_HASH_MANY_T(24)
+      default:
+        THROW_ICICLE_ERR(
+          IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+      }
+// The dispatch macro is local to this function; do not leak it to includers of this header.
+#undef P2_HASH_MANY_T
+      CHK_IF_RETURN(cudaPeekAtLastError());
+      return CHK_LAST();
+    }
+
+    /**
+     * Launches `compress_and_inject_kernel` for the compile-time width matching `this->width`.
+     *
+     * The launch uses `poseidon_number_of_blocks(number_of_rows)` blocks of POSEIDON_BLOCK_SIZE
+     * threads on `ctx.stream`; this function does not synchronize the stream itself.
+     *
+     * @param matrices_to_inject Matrices handed to the kernel; the kernel dereferences this
+     *                           pointer, so it must be device-accessible.
+     * @param number_of_inputs   Forwarded to the kernel.
+     * @param number_of_rows     Row count used to size the grid.
+     * @param prev_layer         Digests of the previous layer (kernel input).
+     * @param next_layer         Digests of the next layer (kernel output).
+     * @param digest_elements    Number of state elements per digest.
+     * @param ctx                Device context supplying the stream.
+     * @return The status reported by `CHK_LAST()` after the launch. An unsupported width is
+     *         reported through `THROW_ICICLE_ERR`.
+     */
+    cudaError_t compress_and_inject(
+      const Matrix<S>* matrices_to_inject,
+      unsigned int number_of_inputs,
+      uint64_t number_of_rows,
+      const S* prev_layer,
+      S* next_layer,
+      unsigned int digest_elements,
+      const device_context::DeviceContext& ctx) const override
+    {
+#define P2_COMPRESS_AND_INJECT_T(width)                                                                                \
+  case width:                                                                                                          \
+    compress_and_inject_kernel<S, width>                                                                               \
+      <<<poseidon_number_of_blocks(number_of_rows), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>(                             \
+        matrices_to_inject, number_of_inputs, prev_layer, next_layer, this->rate, digest_elements, this->constants);   \
+    break;
+
+      switch (this->width) {
+        P2_COMPRESS_AND_INJECT_T(2)
+        P2_COMPRESS_AND_INJECT_T(3)
+        P2_COMPRESS_AND_INJECT_T(4)
+        P2_COMPRESS_AND_INJECT_T(8)
+        P2_COMPRESS_AND_INJECT_T(12)
+        P2_COMPRESS_AND_INJECT_T(16)
+        P2_COMPRESS_AND_INJECT_T(20)
+        P2_COMPRESS_AND_INJECT_T(24)
+      default:
+        THROW_ICICLE_ERR(
+          IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+      }
+// The dispatch macro is local to this function; do not leak it to includers of this header.
+#undef P2_COMPRESS_AND_INJECT_T
+
+      CHK_IF_RETURN(cudaPeekAtLastError());
+      return CHK_LAST();
+    }
+
+    /**
+     * Builds a hasher from caller-supplied parameters: the arguments are handed to
+     * `create_poseidon2_constants` and the resulting constants are stored in `this->constants`.
+     * The base `SpongeHasher` is initialised with `(width, width, rate, 0)` and `device_id` is
+     * taken from `ctx`.
+     */
+    Poseidon2(
+      unsigned int width,
+      unsigned int rate,
+      unsigned int alpha,
+      unsigned int internal_rounds,
+      unsigned int external_rounds,
+      const S* round_constants,
+      const S* internal_matrix_diag,
+      MdsType mds_type,
+      DiffusionStrategy diffusion,
+      device_context::DeviceContext& ctx)
+        : hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
+    {
+      // Fill a temporary first, then publish it to the member.
+      Poseidon2Constants<S> created;
+      CHK_STICKY(create_poseidon2_constants(
+        width, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type, diffusion, ctx,
+        &created));
+      this->constants = created;
+    }
+
+    /**
+     * Builds a hasher whose constants come from `init_poseidon2_constants` for the given
+     * `width`, `mds_type` and `diffusion`; the result is stored in `this->constants`.
+     * The base `SpongeHasher` is initialised with `(width, width, rate, 0)` and `device_id` is
+     * taken from `ctx`.
+     */
+    Poseidon2(
+      unsigned int width,
+      unsigned int rate,
+      MdsType mds_type,
+      DiffusionStrategy diffusion,
+      device_context::DeviceContext& ctx)
+        : hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
+    {
+      // Fill a temporary first, then publish it to the member.
+      Poseidon2Constants<S> loaded;
+      CHK_STICKY(init_poseidon2_constants(width, mds_type, diffusion, ctx, &loaded));
+      this->constants = loaded;
+    }
+
+    /**
+     * Releases `this->constants` through `release_poseidon2_constants`, using the default
+     * device context with its `device_id` replaced by the one recorded at construction.
+     * NOTE(review): no copy constructor/assignment is declared in this class, so a copied
+     * object would release the same constants again — confirm whether the base class or the
+     * callers already prevent copying.
+     */
+    ~Poseidon2()
+    {
+      device_context::DeviceContext release_ctx = device_context::get_default_device_context();
+      release_ctx.device_id = this->device_id;
+      CHK_STICKY(release_poseidon2_constants<S>(&this->constants, release_ctx));
+    }
+  };

-  /**
-   * Compute the poseidon hash over a sequence of preimages.
-   * Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
-   * @param T size of the poseidon state, should be equal to {arity + 1}
-   * @param states a pointer to the input data. May be allocated on device or on host, regulated
-   * by the config. May point to a string of preimages or a string of states filled with preimages.
-   * @param output a pointer to the output data. May be allocated on device or on host, regulated
-   * by the config. Must be at least of size [number_of_states](@ref number_of_states)
-   * @param number_of_states number of input blocks of size T-1 (arity)
-   */
-  template <typename S, int T>
-  cudaError_t poseidon2_hash(
-    const S* states,
-    S* output,
-    size_t number_of_states,
-    const Poseidon2Constants<S>& constants,
-    const Poseidon2Config& config);
 } // namespace poseidon2

 #endif
--- a/icicle/include/utils/utils.h
+++ b/icicle/include/utils/utils.h
@@ -5,4 +5,15 @@
 #define CONCAT_DIRECT(a, b) a##_##b
 #define CONCAT_EXPAND(a, b) CONCAT_DIRECT(a, b) // expand a,b before concatenation

+/**
+ * Rounds `v` up to the nearest power of two by smearing the highest set bit of `v - 1` into
+ * every lower position and adding one.
+ *
+ * A value that is already a power of two is returned unchanged. The arithmetic is unsigned and
+ * wraps: `v == 0` yields 0, and (for a 32-bit `unsigned int`) any `v` above 2^31 also yields 0.
+ */
+static unsigned int next_pow_of_two(unsigned int v)
+{
+  unsigned int smeared = v - 1;
+  // Shifts of 1, 2, 4, 8 and 16 propagate the top set bit across all lower bits.
+  for (unsigned int shift = 1; shift <= 16; shift <<= 1) {
+    smeared |= smeared >> shift;
+  }
+  return smeared + 1;
+}
+
 #endif // ICICLE_UTILS_H
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -105,12 +105,12 @@ namespace vec_ops {
   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
   */
  template <typename E>
-  cudaError_t transpose_batch(
+  cudaError_t transpose_matrix(
    const E* mat_in,
    E* mat_out,
    uint32_t row_size,
    uint32_t column_size,
-    device_context::DeviceContext& ctx,
+    const device_context::DeviceContext& ctx,
    bool on_device,
    bool is_async);

--- a/icicle/src/fields/CMakeLists.txt
+++ b/icicle/src/fields/CMakeLists.txt
@@ -11,6 +11,9 @@ set(SRC ${CMAKE_SOURCE_DIR}/src)

 set(FIELD_SOURCE ${SRC}/fields/extern.cu)
 list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
+list(APPEND FIELD_SOURCE ${SRC}/merkle-tree/extern.cu)
+list(APPEND FIELD_SOURCE ${SRC}/merkle-tree/extern_mmcs.cu)
+
 if(EXT_FIELD)
  list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
  if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
@@ -27,8 +30,6 @@ set(POLYNOMIAL_SOURCE_FILES
 # TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
 if (DEFINED CURVE)
  list(APPEND FIELD_SOURCE ${SRC}/poseidon/extern.cu)
-  list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
-  list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
 endif()

 if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_POSEIDON2)
--- a/icicle/src/hash/CMakeLists.txt
+++ b/icicle/src/hash/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET icicle_hash)

-add_library(${TARGET} STATIC keccak/keccak.cu)
+add_library(${TARGET} STATIC keccak/extern.cu)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
--- a/icicle/src/hash/keccak/.gitignore
+++ b/icicle/src/hash/keccak/.gitignore
@@ -0,0 +1 @@
+test_keccak
--- a/icicle/src/hash/keccak/Makefile
+++ b/icicle/src/hash/keccak/Makefile
@@ -1,2 +1,6 @@
 test_keccak: test.cu keccak.cu 
-	nvcc -o test_keccak -I. -I../.. test.cu
+	nvcc -o test_keccak -I../../../include test.cu
+	./test_keccak
+
+clear:
+	rm test_keccak
--- a/icicle/src/hash/keccak/extern.cu
+++ b/icicle/src/hash/keccak/extern.cu
@@ -0,0 +1,20 @@
+#include "utils/utils.h"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
+#include "hash/keccak/keccak.cuh"
+#include "keccak.cu"
+
+namespace keccak {
+  /**
+   * C-linkage entry point that forwards all arguments unchanged to `keccak_hash<512, 256>`.
+   * NOTE(review): the template arguments are presumably capacity bits and digest bits
+   * (i.e. Keccak-256) — confirm against the `keccak_hash` definition.
+   * @return The `cudaError_t` returned by `keccak_hash`.
+   */
+  extern "C" cudaError_t
+  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
+  {
+    return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
+  }
+
+  /**
+   * C-linkage entry point that forwards all arguments unchanged to `keccak_hash<1024, 512>`.
+   * NOTE(review): the template arguments are presumably capacity bits and digest bits
+   * (i.e. Keccak-512) — confirm against the `keccak_hash` definition.
+   * @return The `cudaError_t` returned by `keccak_hash`.
+   */
+  extern "C" cudaError_t
+  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
+  {
+    return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
+  }
+} // namespace keccak
--- a/icicle/src/hash/keccak/keccak.cu
+++ b/icicle/src/hash/keccak/keccak.cu
@@ -1,227 +1,14 @@
+#include <cstdint>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
 #include "hash/keccak/keccak.cuh"
+#include "kernels.cu"
+
+using namespace hash;

 namespace keccak {
-#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
-
-#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)                                                              \
-  {                                                                                                                    \
-    t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4);                                                \
-  }
-
-#define THETA(                                                                                                         \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14);                                                      \
-    TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24);                                                      \
-    TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34);                                                      \
-    TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44);                                                      \
-    TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04);                                                      \
-    s00 ^= t0;                                                                                                         \
-    s01 ^= t0;                                                                                                         \
-    s02 ^= t0;                                                                                                         \
-    s03 ^= t0;                                                                                                         \
-    s04 ^= t0;                                                                                                         \
-                                                                                                                       \
-    s10 ^= t1;                                                                                                         \
-    s11 ^= t1;                                                                                                         \
-    s12 ^= t1;                                                                                                         \
-    s13 ^= t1;                                                                                                         \
-    s14 ^= t1;                                                                                                         \
-                                                                                                                       \
-    s20 ^= t2;                                                                                                         \
-    s21 ^= t2;                                                                                                         \
-    s22 ^= t2;                                                                                                         \
-    s23 ^= t2;                                                                                                         \
-    s24 ^= t2;                                                                                                         \
-                                                                                                                       \
-    s30 ^= t3;                                                                                                         \
-    s31 ^= t3;                                                                                                         \
-    s32 ^= t3;                                                                                                         \
-    s33 ^= t3;                                                                                                         \
-    s34 ^= t3;                                                                                                         \
-                                                                                                                       \
-    s40 ^= t4;                                                                                                         \
-    s41 ^= t4;                                                                                                         \
-    s42 ^= t4;                                                                                                         \
-    s43 ^= t4;                                                                                                         \
-    s44 ^= t4;                                                                                                         \
-  }
-
-#define RHOPI(                                                                                                         \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    t0 = ROTL64(s10, (uint64_t)1);                                                                                     \
-    s10 = ROTL64(s11, (uint64_t)44);                                                                                   \
-    s11 = ROTL64(s41, (uint64_t)20);                                                                                   \
-    s41 = ROTL64(s24, (uint64_t)61);                                                                                   \
-    s24 = ROTL64(s42, (uint64_t)39);                                                                                   \
-    s42 = ROTL64(s04, (uint64_t)18);                                                                                   \
-    s04 = ROTL64(s20, (uint64_t)62);                                                                                   \
-    s20 = ROTL64(s22, (uint64_t)43);                                                                                   \
-    s22 = ROTL64(s32, (uint64_t)25);                                                                                   \
-    s32 = ROTL64(s43, (uint64_t)8);                                                                                    \
-    s43 = ROTL64(s34, (uint64_t)56);                                                                                   \
-    s34 = ROTL64(s03, (uint64_t)41);                                                                                   \
-    s03 = ROTL64(s40, (uint64_t)27);                                                                                   \
-    s40 = ROTL64(s44, (uint64_t)14);                                                                                   \
-    s44 = ROTL64(s14, (uint64_t)2);                                                                                    \
-    s14 = ROTL64(s31, (uint64_t)55);                                                                                   \
-    s31 = ROTL64(s13, (uint64_t)45);                                                                                   \
-    s13 = ROTL64(s01, (uint64_t)36);                                                                                   \
-    s01 = ROTL64(s30, (uint64_t)28);                                                                                   \
-    s30 = ROTL64(s33, (uint64_t)21);                                                                                   \
-    s33 = ROTL64(s23, (uint64_t)15);                                                                                   \
-    s23 = ROTL64(s12, (uint64_t)10);                                                                                   \
-    s12 = ROTL64(s21, (uint64_t)6);                                                                                    \
-    s21 = ROTL64(s02, (uint64_t)3);                                                                                    \
-    s02 = t0;                                                                                                          \
-  }
-
-#define KHI(                                                                                                           \
-  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
-  s43, s44)                                                                                                            \
-  {                                                                                                                    \
-    t0 = s00 ^ (~s10 & s20);                                                                                           \
-    t1 = s10 ^ (~s20 & s30);                                                                                           \
-    t2 = s20 ^ (~s30 & s40);                                                                                           \
-    t3 = s30 ^ (~s40 & s00);                                                                                           \
-    t4 = s40 ^ (~s00 & s10);                                                                                           \
-    s00 = t0;                                                                                                          \
-    s10 = t1;                                                                                                          \
-    s20 = t2;                                                                                                          \
-    s30 = t3;                                                                                                          \
-    s40 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s01 ^ (~s11 & s21);                                                                                           \
-    t1 = s11 ^ (~s21 & s31);                                                                                           \
-    t2 = s21 ^ (~s31 & s41);                                                                                           \
-    t3 = s31 ^ (~s41 & s01);                                                                                           \
-    t4 = s41 ^ (~s01 & s11);                                                                                           \
-    s01 = t0;                                                                                                          \
-    s11 = t1;                                                                                                          \
-    s21 = t2;                                                                                                          \
-    s31 = t3;                                                                                                          \
-    s41 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s02 ^ (~s12 & s22);                                                                                           \
-    t1 = s12 ^ (~s22 & s32);                                                                                           \
-    t2 = s22 ^ (~s32 & s42);                                                                                           \
-    t3 = s32 ^ (~s42 & s02);                                                                                           \
-    t4 = s42 ^ (~s02 & s12);                                                                                           \
-    s02 = t0;                                                                                                          \
-    s12 = t1;                                                                                                          \
-    s22 = t2;                                                                                                          \
-    s32 = t3;                                                                                                          \
-    s42 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s03 ^ (~s13 & s23);                                                                                           \
-    t1 = s13 ^ (~s23 & s33);                                                                                           \
-    t2 = s23 ^ (~s33 & s43);                                                                                           \
-    t3 = s33 ^ (~s43 & s03);                                                                                           \
-    t4 = s43 ^ (~s03 & s13);                                                                                           \
-    s03 = t0;                                                                                                          \
-    s13 = t1;                                                                                                          \
-    s23 = t2;                                                                                                          \
-    s33 = t3;                                                                                                          \
-    s43 = t4;                                                                                                          \
-                                                                                                                       \
-    t0 = s04 ^ (~s14 & s24);                                                                                           \
-    t1 = s14 ^ (~s24 & s34);                                                                                           \
-    t2 = s24 ^ (~s34 & s44);                                                                                           \
-    t3 = s34 ^ (~s44 & s04);                                                                                           \
-    t4 = s44 ^ (~s04 & s14);                                                                                           \
-    s04 = t0;                                                                                                          \
-    s14 = t1;                                                                                                          \
-    s24 = t2;                                                                                                          \
-    s34 = t3;                                                                                                          \
-    s44 = t4;                                                                                                          \
-  }
-
-#define IOTA(element, rc)                                                                                              \
-  {                                                                                                                    \
-    element ^= rc;                                                                                                     \
-  }
-
-  __device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
-                                      0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
-                                      0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-                                      0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
-                                      0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
-                                      0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
-
-  __device__ void keccakf(uint64_t s[25])
-  {
-    uint64_t t0, t1, t2, t3, t4;
-
-    for (int i = 0; i < 24; i++) {
-      THETA(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      RHOPI(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      KHI(
-        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
-        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
-      IOTA(s[0], RC[i]);
-    }
-  }
-
-  template <int C, int D>
-  __global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
-  {
-    int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (bid >= number_of_blocks) { return; }
-
-    const int r_bits = 1600 - C;
-    const int r_bytes = r_bits / 8;
-    const int d_bytes = D / 8;
-
-    uint8_t* b_input = input + bid * input_block_size;
-    uint8_t* b_output = output + bid * d_bytes;
-    uint64_t state[25] = {}; // Initialize with zeroes
-
-    int input_len = input_block_size;
-
-    // absorb
-    while (input_len >= r_bytes) {
-      // #pragma unroll
-      for (int i = 0; i < r_bytes; i += 8) {
-        state[i / 8] ^= *(uint64_t*)(b_input + i);
-      }
-      keccakf(state);
-      b_input += r_bytes;
-      input_len -= r_bytes;
-    }
-
-    // last block (if any)
-    uint8_t last_block[r_bytes];
-    for (int i = 0; i < input_len; i++) {
-      last_block[i] = b_input[i];
-    }
-
-    // pad 10*1
-    last_block[input_len] = 1;
-    for (int i = 0; i < r_bytes - input_len - 1; i++) {
-      last_block[input_len + i + 1] = 0;
-    }
-    // last bit
-    last_block[r_bytes - 1] |= 0x80;
-
-    // #pragma unroll
-    for (int i = 0; i < r_bytes; i += 8) {
-      state[i / 8] ^= *(uint64_t*)(last_block + i);
-    }
-    keccakf(state);
-
-#pragma unroll
-    for (int i = 0; i < d_bytes; i += 8) {
-      *(uint64_t*)(b_output + i) = state[i / 8];
-    }
-  }
-
  template <int C, int D>
  cudaError_t
  keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
@@ -260,16 +47,4 @@ namespace keccak {
    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
    return CHK_LAST();
  }
-
-  extern "C" cudaError_t
-  keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
-  {
-    return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
-  }
-
-  extern "C" cudaError_t
-  keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
-  {
-    return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
-  }
 } // namespace keccak
--- a/icicle/src/hash/keccak/kernels.cu
+++ b/icicle/src/hash/keccak/kernels.cu
@@ -0,0 +1,233 @@
+#pragma once
+#ifndef KECCAK_KERNELS_H
+#define KECCAK_KERNELS_H
+
+#include <cstdint>
+#include "gpu-utils/modifiers.cuh"
+
+namespace keccak {
+  using u64 = uint64_t;
+
+// Rotates `x` left by `y` bits; intended for 64-bit unsigned operands.
+// `y` must be in 1..63: with y == 0 the right shift below would be by 64 bits, which is undefined.
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+
+// Computes one theta correction term into `t`:
+// (d0 ^ d1 ^ d2 ^ d3 ^ d4) rotated left by one bit, XORed with (c0 ^ c1 ^ c2 ^ c3 ^ c4).
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)                                                              \
+  {                                                                                                                    \
+    t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4);                                                \
+  }
+
+// Theta step. Arguments are the 25 state lanes named sXY; the five lanes sharing the same X form one group.
+// For every X a correction term tX is built with TH_ELT from groups X-1 and X+1 (indices mod 5)
+// and XORed into all five lanes of group X.
+// Relies on scratch variables t0..t4 being declared in the scope where the macro is expanded.
+#define THETA(                                                                                                         \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14);                                                      \
+    TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24);                                                      \
+    TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34);                                                      \
+    TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44);                                                      \
+    TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04);                                                      \
+    s00 ^= t0;                                                                                                         \
+    s01 ^= t0;                                                                                                         \
+    s02 ^= t0;                                                                                                         \
+    s03 ^= t0;                                                                                                         \
+    s04 ^= t0;                                                                                                         \
+                                                                                                                       \
+    s10 ^= t1;                                                                                                         \
+    s11 ^= t1;                                                                                                         \
+    s12 ^= t1;                                                                                                         \
+    s13 ^= t1;                                                                                                         \
+    s14 ^= t1;                                                                                                         \
+                                                                                                                       \
+    s20 ^= t2;                                                                                                         \
+    s21 ^= t2;                                                                                                         \
+    s22 ^= t2;                                                                                                         \
+    s23 ^= t2;                                                                                                         \
+    s24 ^= t2;                                                                                                         \
+                                                                                                                       \
+    s30 ^= t3;                                                                                                         \
+    s31 ^= t3;                                                                                                         \
+    s32 ^= t3;                                                                                                         \
+    s33 ^= t3;                                                                                                         \
+    s34 ^= t3;                                                                                                         \
+                                                                                                                       \
+    s40 ^= t4;                                                                                                         \
+    s41 ^= t4;                                                                                                         \
+    s42 ^= t4;                                                                                                         \
+    s43 ^= t4;                                                                                                         \
+    s44 ^= t4;                                                                                                         \
+  }
+
+// Combined rho and pi steps: every lane except s00 is replaced by another lane rotated left by a fixed amount.
+// The 24 assignments form a single chain, so their order matters: each statement overwrites a lane
+// whose old value has already been consumed. t0 holds the first value of the chain until it is stored last.
+// s00 is left untouched. Relies on scratch variable t0 being declared in the expanding scope.
+#define RHOPI(                                                                                                         \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    t0 = ROTL64(s10, (uint64_t)1);                                                                                     \
+    s10 = ROTL64(s11, (uint64_t)44);                                                                                   \
+    s11 = ROTL64(s41, (uint64_t)20);                                                                                   \
+    s41 = ROTL64(s24, (uint64_t)61);                                                                                   \
+    s24 = ROTL64(s42, (uint64_t)39);                                                                                   \
+    s42 = ROTL64(s04, (uint64_t)18);                                                                                   \
+    s04 = ROTL64(s20, (uint64_t)62);                                                                                   \
+    s20 = ROTL64(s22, (uint64_t)43);                                                                                   \
+    s22 = ROTL64(s32, (uint64_t)25);                                                                                   \
+    s32 = ROTL64(s43, (uint64_t)8);                                                                                    \
+    s43 = ROTL64(s34, (uint64_t)56);                                                                                   \
+    s34 = ROTL64(s03, (uint64_t)41);                                                                                   \
+    s03 = ROTL64(s40, (uint64_t)27);                                                                                   \
+    s40 = ROTL64(s44, (uint64_t)14);                                                                                   \
+    s44 = ROTL64(s14, (uint64_t)2);                                                                                    \
+    s14 = ROTL64(s31, (uint64_t)55);                                                                                   \
+    s31 = ROTL64(s13, (uint64_t)45);                                                                                   \
+    s13 = ROTL64(s01, (uint64_t)36);                                                                                   \
+    s01 = ROTL64(s30, (uint64_t)28);                                                                                   \
+    s30 = ROTL64(s33, (uint64_t)21);                                                                                   \
+    s33 = ROTL64(s23, (uint64_t)15);                                                                                   \
+    s23 = ROTL64(s12, (uint64_t)10);                                                                                   \
+    s12 = ROTL64(s21, (uint64_t)6);                                                                                    \
+    s21 = ROTL64(s02, (uint64_t)3);                                                                                    \
+    s02 = t0;                                                                                                          \
+  }
+
+// Chi step. For each fixed Y, the five lanes s0Y..s4Y are updated as
+//   sXY = sXY ^ (~s(X+1)Y & s(X+2)Y)   (X indices mod 5).
+// All five new values are computed into t0..t4 before any lane is written, so every result
+// is derived from the old lane values. Relies on t0..t4 being declared in the expanding scope.
+#define KHI(                                                                                                           \
+  s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42,   \
+  s43, s44)                                                                                                            \
+  {                                                                                                                    \
+    t0 = s00 ^ (~s10 & s20);                                                                                           \
+    t1 = s10 ^ (~s20 & s30);                                                                                           \
+    t2 = s20 ^ (~s30 & s40);                                                                                           \
+    t3 = s30 ^ (~s40 & s00);                                                                                           \
+    t4 = s40 ^ (~s00 & s10);                                                                                           \
+    s00 = t0;                                                                                                          \
+    s10 = t1;                                                                                                          \
+    s20 = t2;                                                                                                          \
+    s30 = t3;                                                                                                          \
+    s40 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s01 ^ (~s11 & s21);                                                                                           \
+    t1 = s11 ^ (~s21 & s31);                                                                                           \
+    t2 = s21 ^ (~s31 & s41);                                                                                           \
+    t3 = s31 ^ (~s41 & s01);                                                                                           \
+    t4 = s41 ^ (~s01 & s11);                                                                                           \
+    s01 = t0;                                                                                                          \
+    s11 = t1;                                                                                                          \
+    s21 = t2;                                                                                                          \
+    s31 = t3;                                                                                                          \
+    s41 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s02 ^ (~s12 & s22);                                                                                           \
+    t1 = s12 ^ (~s22 & s32);                                                                                           \
+    t2 = s22 ^ (~s32 & s42);                                                                                           \
+    t3 = s32 ^ (~s42 & s02);                                                                                           \
+    t4 = s42 ^ (~s02 & s12);                                                                                           \
+    s02 = t0;                                                                                                          \
+    s12 = t1;                                                                                                          \
+    s22 = t2;                                                                                                          \
+    s32 = t3;                                                                                                          \
+    s42 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s03 ^ (~s13 & s23);                                                                                           \
+    t1 = s13 ^ (~s23 & s33);                                                                                           \
+    t2 = s23 ^ (~s33 & s43);                                                                                           \
+    t3 = s33 ^ (~s43 & s03);                                                                                           \
+    t4 = s43 ^ (~s03 & s13);                                                                                           \
+    s03 = t0;                                                                                                          \
+    s13 = t1;                                                                                                          \
+    s23 = t2;                                                                                                          \
+    s33 = t3;                                                                                                          \
+    s43 = t4;                                                                                                          \
+                                                                                                                       \
+    t0 = s04 ^ (~s14 & s24);                                                                                           \
+    t1 = s14 ^ (~s24 & s34);                                                                                           \
+    t2 = s24 ^ (~s34 & s44);                                                                                           \
+    t3 = s34 ^ (~s44 & s04);                                                                                           \
+    t4 = s44 ^ (~s04 & s14);                                                                                           \
+    s04 = t0;                                                                                                          \
+    s14 = t1;                                                                                                          \
+    s24 = t2;                                                                                                          \
+    s34 = t3;                                                                                                          \
+    s44 = t4;                                                                                                          \
+  }
+
+// Iota step: XORs the round constant `rc` into `element`.
+#define IOTA(element, rc)                                                                                              \
+  {                                                                                                                    \
+    element ^= rc;                                                                                                     \
+  }
+
+  // Round constants of the permutation, one per round; keccakf XORs RC[i] into lane 0 at the end of round i.
+  __device__ const u64 RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
+                                 0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
+                                 0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+                                 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
+                                 0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
+                                 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
+
+  /// Applies the 24-round permutation to the 25-lane state `s`, in place.
+  /// Each round runs THETA, RHOPI, KHI and then IOTA with that round's constant.
+  /// The macro lane name sXY corresponds to s[X + 5 * Y], as fixed by the argument order below.
+  __device__ void keccakf(u64 s[25])
+  {
+    // Scratch variables referenced by name inside the THETA / RHOPI / KHI macro bodies
+    u64 t0, t1, t2, t3, t4;
+
+    for (int i = 0; i < 24; i++) {
+      THETA(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      RHOPI(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      KHI(
+        s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
+        s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
+      // Only lane 0 receives the round constant
+      IOTA(s[0], RC[i]);
+    }
+  }
+
+  /// Reads one 64-bit lane starting at `src`, least significant byte first.
+  /// When `is_aligned` is true, `src` must be 8-byte aligned and a single 64-bit load is used;
+  /// otherwise the lane is assembled byte by byte, which is valid for any address.
+  /// Both paths produce the same value on little-endian targets such as NVIDIA GPUs.
+  __device__ inline u64 keccak_load_lane(const uint8_t* src, bool is_aligned)
+  {
+    if (is_aligned) { return *reinterpret_cast<const u64*>(src); }
+
+    u64 lane = 0;
+    for (int b = 0; b < 8; b++) {
+      lane |= static_cast<u64>(src[b]) << (8 * b);
+    }
+    return lane;
+  }
+
+  /// Hashes `number_of_blocks` independent messages of equal length, one message per thread.
+  ///
+  /// Template parameters: `C` is the sponge capacity in bits (the rate is 1600 - C bits),
+  /// `D` is the digest length in bits.
+  ///
+  /// Uses only the x dimension of the launch configuration; threads whose global index is
+  /// `>= number_of_blocks` return immediately.
+  ///
+  /// @param input            Concatenated messages; message `k` starts at `input + k * input_block_size`.
+  ///                         No alignment is required.
+  /// @param input_block_size Length of every message in bytes.
+  /// @param number_of_blocks Number of messages to hash.
+  /// @param output           Concatenated digests, `D / 8` bytes per message. Written with 64-bit stores,
+  ///                         so `output` must be 8-byte aligned.
+  template <int C, int D>
+  __global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
+  {
+    static_assert((1600 - C) % 64 == 0, "keccak rate must be a whole number of 64-bit lanes");
+    static_assert(D % 64 == 0, "keccak digest must be a whole number of 64-bit lanes");
+
+    int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (bid >= number_of_blocks) { return; }
+
+    const int r_bits = 1600 - C;
+    const int r_bytes = r_bits / 8;
+    const int r_lanes = r_bytes / 8;
+    const int d_bytes = D / 8;
+
+    // Offsets are computed in 64 bits: `bid * input_block_size` overflows `int` once a batch exceeds 2 GiB
+    const uint8_t* b_input = input + static_cast<int64_t>(bid) * input_block_size;
+    uint8_t* b_output = output + static_cast<int64_t>(bid) * d_bytes;
+    u64 state[25] = {}; // Initialize with zeroes
+
+    // A message only starts on an 8-byte boundary when input_block_size is a multiple of 8.
+    // r_bytes is a multiple of 8, so the alignment of the first rate block holds for all following ones.
+    const bool is_aligned = (reinterpret_cast<uintptr_t>(b_input) % 8) == 0;
+
+    int input_len = input_block_size;
+
+    // absorb every full rate block
+    while (input_len >= r_bytes) {
+      for (int lane = 0; lane < r_lanes; lane++) {
+        state[lane] ^= keccak_load_lane(b_input + 8 * lane, is_aligned);
+      }
+      keccakf(state);
+      b_input += r_bytes;
+      input_len -= r_bytes;
+    }
+
+    // Last (possibly empty) block. It is stored as zero-initialised 64-bit lanes so that
+    // reading it back lane by lane is always aligned; the bytes are filled through `last_block`.
+    u64 last_lanes[r_lanes] = {};
+    uint8_t* last_block = reinterpret_cast<uint8_t*>(last_lanes);
+    for (int i = 0; i < input_len; i++) {
+      last_block[i] = b_input[i];
+    }
+
+    // pad 10*1: 0x01 right after the message, top bit of the final rate byte set.
+    // The bytes in between are already zero; `|=` covers the case where both land in the same byte.
+    last_block[input_len] = 1;
+    last_block[r_bytes - 1] |= 0x80;
+
+    for (int lane = 0; lane < r_lanes; lane++) {
+      state[lane] ^= last_lanes[lane];
+    }
+    keccakf(state);
+
+    // squeeze: the digest is the first d_bytes of the state
+#pragma unroll
+    for (int i = 0; i < d_bytes; i += 8) {
+      *(uint64_t*)(b_output + i) = state[i / 8];
+    }
+  }
+} // namespace keccak
+
+#endif
--- a/icicle/src/hash/keccak/test.cu
+++ b/icicle/src/hash/keccak/test.cu
@@ -1,5 +1,5 @@
 #include "gpu-utils/device_context.cuh"
-#include "keccak.cu"
+#include "extern.cu"

 // #define DEBUG

--- a/icicle/src/hash/keccak/test_keccak
+++ b/icicle/src/hash/keccak/test_keccak
--- a/icicle/src/poseidon/tree/.gitignore
+++ b/icicle/src/poseidon/tree/.gitignore
--- a/icicle/src/merkle-tree/extern.cu
+++ b/icicle/src/merkle-tree/extern.cu
@@ -0,0 +1,25 @@
+#include "utils/utils.h"
+
+#include "gpu-utils/error_handler.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "merkle.cu"
+
+#include "hash/hash.cuh"
+
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+namespace merkle_tree {
+  /// C-linkage entry point for the templated `build_merkle_tree`, instantiated with the configured
+  /// field's `scalar_t` as both the leaf and the digest type.
+  /// `compression` and `bottom_layer` are dereferenced unconditionally, so neither may be null.
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, build_merkle_tree)(
+    const scalar_t* leaves_digests,
+    scalar_t* digests,
+    unsigned int height,
+    unsigned int input_block_len,
+    const hash::SpongeHasher<scalar_t, scalar_t>* compression,
+    const hash::SpongeHasher<scalar_t, scalar_t>* bottom_layer,
+    const TreeBuilderConfig& tree_config)
+  {
+    // Bind the hashers by reference first; the templated builder takes them as const references
+    const hash::SpongeHasher<scalar_t, scalar_t>& compression_hasher = *compression;
+    const hash::SpongeHasher<scalar_t, scalar_t>& bottom_layer_hasher = *bottom_layer;
+
+    const cudaError_t status = build_merkle_tree<scalar_t, scalar_t>(
+      leaves_digests, digests, height, input_block_len, compression_hasher, bottom_layer_hasher, tree_config);
+    return status;
+  }
+} // namespace merkle_tree
--- a/icicle/src/merkle-tree/extern_mmcs.cu
+++ b/icicle/src/merkle-tree/extern_mmcs.cu
@@ -0,0 +1,26 @@
+#include "utils/utils.h"
+
+#include "gpu-utils/error_handler.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
+#include "mmcs.cu"
+
+#include "hash/hash.cuh"
+
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+using matrix::Matrix;
+
+namespace merkle_tree {
+  /// C-linkage entry point for the templated `mmcs_commit`, instantiated with the configured
+  /// field's `scalar_t` as both the leaf and the digest type.
+  /// `hasher` and `compression` are dereferenced unconditionally, so neither may be null.
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, mmcs_commit_cuda)(
+    const Matrix<scalar_t>* leaves,
+    unsigned int number_of_inputs,
+    scalar_t* digests,
+    const hash::SpongeHasher<scalar_t, scalar_t>* hasher,
+    const hash::SpongeHasher<scalar_t, scalar_t>* compression,
+    const TreeBuilderConfig& tree_config)
+  {
+    // Bind the hashers by reference first; the templated commit takes them as const references
+    const hash::SpongeHasher<scalar_t, scalar_t>& leaf_hasher = *hasher;
+    const hash::SpongeHasher<scalar_t, scalar_t>& compression_hasher = *compression;
+
+    const cudaError_t status =
+      mmcs_commit<scalar_t, scalar_t>(leaves, number_of_inputs, digests, leaf_hasher, compression_hasher, tree_config);
+    return status;
+  }
+} // namespace merkle_tree
--- a/icicle/src/merkle-tree/merkle.cu
+++ b/icicle/src/merkle-tree/merkle.cu
@@ -0,0 +1,336 @@
+#include "hash/hash.cuh"
+#include "merkle-tree/merkle.cuh"
+
+namespace merkle_tree {
+  /// Constructs merkle subtree without parallelization
+  /// The digests are aligned sequentially per row
+  /// Example:
+  ///
+  /// Big tree:
+  ///
+  ///        1      <- Root
+  ///       / \     <- Arity = 2
+  ///      2   3    <- Digests
+  ///     / \ / \   <- Height = 2 (as the number of edges)
+  ///    4  5 6  7  <- arity^height leaves
+  ///    |  | |  |  <- Bottom layer hash 1 to 1
+  ///    a  b c  d  <- Input vector 1x4
+  ///
+  /// Subtree 1    Subtree 2
+  ///    2            3
+  ///   / \          / \
+  ///  4   5        6   7
+  ///
+  /// Digests array for subtree 1:
+  /// [4 5 . . 2 . .]
+  /// |   |    |
+  /// -----    V
+  ///   |    Segment (offset = 4, subtree_idx = 0)
+  ///   v
+  /// Segment (offset = 0, subtree_idx = 0)
+  ///
+  /// Digests array for subtree 2:
+  /// [. . 6 7 . 3 .]
+  ///     |   |
+  ///     -----
+  ///       |
+  ///       v
+  ///    Segment (offset = 0, subtree_idx = 1)
+  ///
+  /// Total digests array:
+  /// [4 5 6 7 2 3 .]
+  ///
+  /// Example for custom config:
+  ///
+  /// arity = 2
+  /// input_block_len = 2
+  /// digest_elements = 2
+  /// bottom_layer hash width = 4
+  /// compression width = 4
+  /// height = 2
+  ///
+  ///                    [a, b]    <- Root of the tree
+  ///                     |  |
+  ///                    [a, b, c, d]
+  ///                     /  \  /  \ 
+  ///                    [i, j, m, n]
+  ///           ┌──┬──────┴──┴──┴──┴──────┬──┐
+  ///           |  |                      |  |
+  ///          [i, j, k, l]              [m, n, o, p]       <- compression states
+  ///           /  \  /  \                /  \  /  \        <- Running permutation
+  ///          [1, 2, 5, 6]              [9, 1, 4, 5]       <- compression states
+  ///    ┌──┬───┴──┴──┼──┤         ┌──┬───┴──┴──┼──┤
+  ///    |  |         |  |         |  |         |  |        <- digest_elements * arity^height leaves
+  ///   [1, 2, 3, 4] [5, 6, 7, 8] [9, 1, 2, 3] [4, 5, 6, 7] <- Permuted states
+  ///    /  \  /  \   /  \  /  \   /  \  /  \   /  \  /  \  <- Running permutation
+  ///   [a, b, 0, 0] [c, d, 0, 0] [e, f, 0, 0] [g, h, 0, 0] <- States of the bottom layer hash
+  ///    |  |         |  |         |  |         |  |        <- Bottom layer hash 2 to 2
+  ///    a  b         c  d         e  f         g  h        <- Input vector 2x4
+  ///
+  /// Input matrix:
+  ///   ┌     ┐
+  ///   | a b |
+  ///   | c d |
+  ///   | e f |
+  ///   | g h |
+  ///   └     ┘
+
+  /// Builds a single subtree: hashes its leaves with `bottom_layer`, then applies `compression`
+  /// layer by layer until one state is left, copying every kept layer into `big_tree_digests`.
+  ///
+  /// All copies are enqueued on `ctx.stream` and the hashers are configured as async;
+  /// this function does not synchronize the stream itself.
+  ///
+  /// @param leaves               Inputs of this subtree, `arity^subtree_height * input_block_len` elements.
+  ///                             Read from the host when `tree_config.are_inputs_on_device` is false.
+  /// @param states               Device scratch buffer; also receives the leaves when they come from the host.
+  /// @param digests              Device scratch buffer; swapped with `states` between layers.
+  /// @param subtree_idx          Index of this subtree, used to locate its slice inside every layer.
+  /// @param subtree_height       Height of this subtree.
+  /// @param big_tree_digests     Destination of the kept layers; written with device-to-host copies.
+  /// @param start_segment_size   Size, in elements, of the first layer's segment in `big_tree_digests`;
+  ///                             divided by `arity` for every following layer.
+  /// @param start_segment_offset Element offset of the first kept layer inside `big_tree_digests`.
+  /// @param keep_rows            0 keeps every layer; otherwise a layer is kept only while the remaining
+  ///                             subtree height is below `keep_rows`.
+  /// @param input_block_len      Number of leaf elements hashed into one bottom-layer digest.
+  /// @param bottom_layer         Hasher applied to the leaves.
+  /// @param compression          Hasher applied to every layer above the leaves.
+  /// @param tree_config          Supplies `arity`, `digest_elements` and `are_inputs_on_device`.
+  /// @param ctx                  Device context whose stream is used for all operations.
+  /// @return The first failing CUDA/hasher status, otherwise the result of `CHK_LAST()`.
+  template <typename L, typename D>
+  cudaError_t build_merkle_subtree(
+    const L* leaves,
+    D* states,
+    D* digests,
+    size_t subtree_idx,
+    size_t subtree_height,
+    L* big_tree_digests,
+    size_t start_segment_size,
+    size_t start_segment_offset,
+    uint64_t keep_rows,
+    uint64_t input_block_len,
+    const SpongeHasher<L, D>& bottom_layer,
+    const SpongeHasher<L, D>& compression,
+    const TreeBuilderConfig& tree_config,
+    device_context::DeviceContext& ctx)
+  {
+    uint64_t arity = tree_config.arity;
+
+    SpongeConfig sponge_config = default_sponge_config(ctx);
+    sponge_config.are_inputs_on_device = true;
+    sponge_config.are_outputs_on_device = true;
+    sponge_config.is_async = true;
+
+    size_t bottom_layer_states = pow(arity, subtree_height);
+
+    // Host leaves are staged in `states`, so the hashers below always read device memory
+    if (!tree_config.are_inputs_on_device) {
+      CHK_IF_RETURN(cudaMemcpyAsync(
+        states, leaves, bottom_layer_states * input_block_len * sizeof(L), cudaMemcpyHostToDevice, ctx.stream));
+    }
+
+    // A failure here must stop the build, exactly like a failing compress_many below
+    CHK_IF_RETURN(bottom_layer.hash_many(
+      tree_config.are_inputs_on_device ? leaves : states, digests, bottom_layer_states, input_block_len,
+      tree_config.digest_elements, sponge_config));
+
+    uint64_t number_of_states = bottom_layer_states / arity;
+    size_t segment_size = start_segment_size;
+    size_t segment_offset = start_segment_offset;
+
+    if (!keep_rows || subtree_height < keep_rows) {
+      // Offsets are in elements and each state contributes `digest_elements` of them, so the
+      // per-subtree stride has to include that factor; otherwise neighbouring subtrees overlap.
+      // NOTE(review): `big_tree_digests` is declared as L* but used as D*; this only compiles when L == D.
+      D* digests_with_offset =
+        big_tree_digests + segment_offset + subtree_idx * bottom_layer_states * tree_config.digest_elements;
+      CHK_IF_RETURN(cudaMemcpyAsync(
+        digests_with_offset, digests, bottom_layer_states * tree_config.digest_elements * sizeof(D),
+        cudaMemcpyDeviceToHost, ctx.stream));
+      segment_offset += segment_size;
+    }
+    segment_size /= arity;
+    subtree_height--;
+    // The digests of this layer become the input states of the next one
+    swap<D>(&digests, &states);
+
+    while (number_of_states > 0) {
+      CHK_IF_RETURN(
+        compression.compress_many(states, digests, number_of_states, tree_config.digest_elements, sponge_config));
+
+      if (!keep_rows || subtree_height < keep_rows) {
+        D* digests_with_offset =
+          big_tree_digests + segment_offset + subtree_idx * number_of_states * tree_config.digest_elements;
+        CHK_IF_RETURN(cudaMemcpyAsync(
+          digests_with_offset, digests, number_of_states * tree_config.digest_elements * sizeof(D),
+          cudaMemcpyDeviceToHost, ctx.stream));
+        segment_offset += segment_size;
+      }
+      // No swap after the last layer: nothing reads the buffers afterwards
+      if (number_of_states > 1) { swap<D>(&digests, &states); }
+      segment_size /= arity;
+      subtree_height--;
+      number_of_states /= arity;
+    }
+
+    return CHK_LAST();
+  }
+
+  template <typename L, typename D>
+  cudaError_t build_merkle_tree(
+    const L* leaves,
+    D* digests,
+    unsigned int height,
+    unsigned int input_block_len,
+    const SpongeHasher<L, D>& compression,
+    const SpongeHasher<L, D>& bottom_layer,
+    const TreeBuilderConfig& tree_config)
+  {
+    CHK_INIT_IF_RETURN();
+    cudaStream_t& stream = tree_config.ctx.stream;
+
+    if (input_block_len * sizeof(L) > bottom_layer.rate * sizeof(D))
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument,
+        "Sponge construction at the bottom of the tree doesn't support inputs bigger than hash rate");
+    if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument,
+        "Hash max preimage length does not match merkle tree arity multiplied by digest elements");
+
+    uint64_t number_of_bottom_layer_states = pow(tree_config.arity, height);
+
+    // This will determine how much splitting do we need to do
+    // `number_of_streams` subtrees should fit in the device
+    // This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
+    uint64_t number_of_subtrees = 1;
+    uint64_t subtree_height = height;
+    uint64_t subtree_bottom_layer_states = number_of_bottom_layer_states;
+    uint64_t subtree_states_size = subtree_bottom_layer_states * bottom_layer.width;
+
+    uint64_t subtree_digests_size;
+    if (compression.width != compression.preimage_max_length) {
+      // In that case, the states on layer 1 will require extending the states by (width / preimage_max_len) factor
+      subtree_digests_size =
+        subtree_states_size * bottom_layer.preimage_max_length / bottom_layer.width * tree_config.digest_elements;
+    } else {
+      subtree_digests_size = subtree_states_size / bottom_layer.width * tree_config.digest_elements;
+    }
+    size_t subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
+    while (subtree_memory_required > STREAM_CHUNK_SIZE) {
+      number_of_subtrees *= tree_config.arity;
+      subtree_height--;
+      subtree_bottom_layer_states /= tree_config.arity;
+      subtree_states_size /= tree_config.arity;
+      subtree_digests_size /= tree_config.arity;
+      subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
+    }
+    int cap_height = height - subtree_height;
+    size_t caps_len = pow(tree_config.arity, cap_height) * tree_config.digest_elements;
+
+    size_t available_memory, _total_memory;
+    CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
+    available_memory -= GIGA / 8; // Leave 128 MB just in case
+
+    // We can effectively parallelize memory copy with streams
+    // as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
+    const size_t number_of_streams = std::min((uint64_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
+    cudaStream_t* streams = static_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t) * number_of_streams));
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
+    }
+
+    bool caps_mode = tree_config.keep_rows && tree_config.keep_rows <= cap_height;
+    D* caps;
+    if (caps_mode) { caps = static_cast<D*>(malloc(caps_len * sizeof(D))); }
+
+#ifdef MERKLE_DEBUG
+    std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Number of streams = " << number_of_streams << std::endl;
+    std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
+    std::cout << "Height of a subtree = " << subtree_height << std::endl;
+    std::cout << "Cutoff height = " << height - subtree_height << std::endl;
+    std::cout << "Number of leaves in a subtree = " << subtree_bottom_layer_states << std::endl;
+    std::cout << "State of a subtree = " << subtree_states_size << std::endl;
+    std::cout << "Digest elements for a subtree = " << subtree_digests_size << std::endl;
+    std::cout << "Size of 1 subtree states = " << subtree_states_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Cap height = " << cap_height << std::endl;
+    std::cout << "Enabling caps mode? " << caps_mode << std::endl;
+#endif
+
+    // Allocate memory for the leaves and digests
+    // These are shared by streams in a pool
+    D *states_ptr, *digests_ptr;
+    CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_states_size * number_of_streams * sizeof(D), stream));
+    CHK_IF_RETURN(cudaMemsetAsync(states_ptr, 0, subtree_states_size * number_of_streams * sizeof(D), stream));
+    CHK_IF_RETURN(cudaMallocAsync(&digests_ptr, subtree_digests_size * number_of_streams * sizeof(D), stream));
+    // Wait for these allocations to finish
+    CHK_IF_RETURN(cudaStreamSynchronize(stream));
+
+    // Build subtrees in parallel. This for loop invokes kernels that can run in a pool of size `number_of_streams`
+    for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
+      size_t stream_idx = subtree_idx % number_of_streams;
+      cudaStream_t subtree_stream = streams[stream_idx];
+
+      const L* subtree_leaves = leaves + subtree_idx * subtree_bottom_layer_states * input_block_len;
+      D* subtree_state = states_ptr + stream_idx * subtree_states_size;
+      D* subtree_digests = digests_ptr + stream_idx * subtree_digests_size;
+
+      int subtree_keep_rows = 0;
+      if (tree_config.keep_rows) {
+        int diff = tree_config.keep_rows - cap_height;
+        subtree_keep_rows = std::max(1, diff);
+      }
+      device_context::DeviceContext subtree_context{subtree_stream, tree_config.ctx.device_id, tree_config.ctx.mempool};
+
+      uint64_t start_segment_size = number_of_bottom_layer_states * tree_config.digest_elements;
+      cudaError_t subtree_result = build_merkle_subtree<L, D>(
+        subtree_leaves,             // leaves
+        subtree_state,              // state
+        subtree_digests,            // digests
+        subtree_idx,                // subtree_idx
+        subtree_height,             // subtree_height
+        caps_mode ? caps : digests, // big_tree_digests
+        start_segment_size,         // start_segment_size
+        0,                          // start_segment_offset
+        subtree_keep_rows,          // keep_rows
+        input_block_len,            // input_block_len
+        bottom_layer,               // bottom_layer
+        compression,                // compression
+        tree_config,                // tree_config
+        subtree_context             // subtree_context
+      );
+      CHK_IF_RETURN(subtree_result);
+    }
+
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
+    }
+
+    SpongeConfig sponge_config = default_sponge_config(tree_config.ctx);
+    sponge_config.are_inputs_on_device = tree_config.are_inputs_on_device;
+    sponge_config.are_outputs_on_device = true;
+    sponge_config.is_async = true;
+    // Finish the top-level tree if any
+    if (cap_height > 0) {
+      size_t start_segment_size = caps_len / tree_config.arity;
+      size_t start_segment_offset = 0;
+      if (!caps_mode) { // Calculate offset
+        size_t keep_rows = tree_config.keep_rows ? tree_config.keep_rows : height + 1;
+        size_t layer_size = pow(tree_config.arity, keep_rows - 1) * tree_config.digest_elements;
+        for (int i = 0; i < keep_rows - cap_height; i++) {
+          start_segment_offset += layer_size;
+          layer_size /= tree_config.arity;
+        }
+      }
+      CHK_IF_RETURN(cudaMemcpyAsync(
+        states_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
+        (caps_mode || !tree_config.are_outputs_on_device) ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, stream));
+
+      uint64_t number_of_states = caps_len / tree_config.arity / tree_config.digest_elements;
+
+      size_t segment_size = start_segment_size;
+      size_t segment_offset = start_segment_offset;
+      while (number_of_states > 0) {
+        CHK_IF_RETURN(compression.compress_many(
+          states_ptr, digests_ptr, number_of_states, tree_config.digest_elements, sponge_config));
+        if (!tree_config.keep_rows || cap_height < tree_config.keep_rows + (int)caps_mode) {
+          D* digests_with_offset = digests + segment_offset;
+          CHK_IF_RETURN(cudaMemcpyAsync(
+            digests_with_offset, digests_ptr, number_of_states * tree_config.digest_elements * sizeof(D),
+            cudaMemcpyDeviceToHost, stream));
+          segment_offset += segment_size;
+        }
+
+        if (number_of_states > 1) { swap<D>(&digests_ptr, &states_ptr); }
+
+        segment_size /= tree_config.arity;
+        cap_height--;
+        number_of_states /= tree_config.arity;
+      }
+      if (caps_mode) { free(caps); }
+    }
+
+    CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
+    CHK_IF_RETURN(cudaFreeAsync(digests_ptr, stream));
+    if (!tree_config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
+      CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
+    }
+    free(streams);
+    return CHK_LAST();
+  }
+
+} // namespace merkle_tree
--- a/icicle/src/merkle-tree/mmcs.cu
+++ b/icicle/src/merkle-tree/mmcs.cu
@@ -0,0 +1,456 @@
+#include "hash/hash.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "matrix/matrix.cuh"
+#include "vec_ops/vec_ops.cuh"
+
+#include <algorithm>
+
+using matrix::Matrix;
+
+namespace merkle_tree {
+
+  /**
+   * Hashes matrix rows into bottom-layer digests and zero-fills the digests of the
+   * rows that exist only because the row count is padded to the next power of two.
+   *
+   * @param leaves Matrix descriptors, forwarded unchanged to `hasher.hash_2d`
+   * @param number_of_inputs Number of descriptors in `leaves`
+   * @param number_of_rows Number of real (non-padding) rows to hash
+   * @param digests Output buffer. It is written with `cudaMemsetAsync`, so it must be
+   *                device-accessible and hold `next_pow_of_two(number_of_rows) * digest_elements` elements
+   * @param digest_elements Number of output elements per row
+   * @param hasher Hasher whose `hash_2d` produces the row digests
+   * @param ctx Device context; its stream is used for the padding memset
+   * @return The error of the first failing call, otherwise the result of `CHK_LAST()`
+   */
+  template <typename L, typename D>
+  cudaError_t hash_leaves(
+    const Matrix<L>* leaves,
+    unsigned int number_of_inputs,
+    uint64_t number_of_rows,
+    D* digests,
+    unsigned int digest_elements,
+    const SpongeHasher<L, D>& hasher,
+    const device_context::DeviceContext& ctx)
+  {
+    CHK_IF_RETURN(hasher.hash_2d(leaves, digests, number_of_inputs, digest_elements, number_of_rows, ctx));
+
+    const uint64_t number_of_rows_padded = next_pow_of_two(number_of_rows);
+    const uint64_t padding_rows = number_of_rows_padded - number_of_rows;
+    if (padding_rows) {
+      // Pad with default (all-zero) digests.
+      // Every row owns `digest_elements` elements, so the padding starts right after the
+      // last real row, i.e. at element `number_of_rows * digest_elements`.
+      // NOTE(review): assumes hash_2d writes one contiguous digest per row - confirm.
+      CHK_IF_RETURN(cudaMemsetAsync(
+        (void*)(digests + number_of_rows * digest_elements), 0, padding_rows * digest_elements * sizeof(D),
+        ctx.stream));
+    }
+
+    return CHK_LAST();
+  }
+
+  /// Bookkeeping shared by the helpers that build one (sub)tree.
+  /// Several fields are updated in place by those helpers while the tree is folded
+  /// layer by layer, so one instance describes the layer currently being processed.
+  template <typename L, typename D>
+  struct SubtreeParams {
+    unsigned int number_of_inputs; // Number of input matrices
+    unsigned int arity;            // Arity of the tree
+    unsigned int digest_elements;  // Number of output elements per hash
+    size_t number_of_rows;         // Current number of input rows to operate on
+    // Row count of the current layer including padding; callers in this file derive the
+    // initial value from `next_pow_of_two` and divide it by `arity` after every layer
+    size_t number_of_rows_padded;
+    size_t subtree_idx;            // The subtree id
+    size_t number_of_subtrees;     // Total number of subtrees
+    uint64_t subtree_height;       // Height of one subtree; decremented after every layer
+
+    /// One segment corresponds to one layer of output digests
+    size_t segment_size;                     // The size of current segment.
+    size_t segment_offset;                   // An offset for the current segment
+    unsigned int leaves_offset;              // An offset in the sorted list of input matrices
+    unsigned int number_of_leaves_to_inject; // Number of leaves to inject in current level
+    // Number of rows to keep; 0 makes `maybe_copy_digests` copy out every layer
+    unsigned int keep_rows;
+    // When true the input matrices are used in place instead of being copied to the device
+    bool are_inputs_on_device;
+    // Adds one to the `keep_rows` threshold checked in `maybe_copy_digests`
+    bool caps_mode;
+    const SpongeHasher<L, D>* hasher = nullptr;        // Hasher used for the bottom layer
+    const SpongeHasher<L, D>* compression = nullptr;   // Hasher used to fold the upper layers
+    const device_context::DeviceContext* ctx = nullptr; // Context whose stream all work is queued on
+  };
+
+  /**
+   * Selects the matrices that belong to the current layer of the current subtree and
+   * publishes descriptors of their row slices to `d_leaves_info`.
+   *
+   * `leaves` is scanned starting from `params.leaves_offset`: every matrix whose height,
+   * rounded up to a power of two, is at least the padded height of the current layer is
+   * consumed, and those that match that height exactly are counted in
+   * `params.number_of_leaves_to_inject`. When at least one matrix matches,
+   * `params.number_of_rows` is set to the number of rows this subtree takes from them.
+   *
+   * @param leaves Input matrices; the scan assumes they are sorted by height, tallest first
+   * @param d_leaves Destination for the row slices when the inputs are not on the device
+   * @param d_leaves_info Destination for the slice descriptors (copied host to device)
+   * @param params Subtree state; `leaves_offset`, `number_of_leaves_to_inject` and
+   *               `number_of_rows` are updated
+   * @return The error of the first failing call, otherwise the result of `CHK_LAST()`
+   */
+  template <typename L, typename D>
+  cudaError_t slice_and_copy_leaves(
+    const std::vector<Matrix<L>>& leaves, L* d_leaves, Matrix<L>* d_leaves_info, SubtreeParams<L, D>& params)
+  {
+    uint64_t target_height = params.number_of_rows_padded * params.number_of_subtrees;
+    params.number_of_leaves_to_inject = 0;
+    while (params.leaves_offset < params.number_of_inputs &&
+           next_pow_of_two(leaves[params.leaves_offset].height) >= target_height) {
+      if (next_pow_of_two(leaves[params.leaves_offset].height) == target_height) params.number_of_leaves_to_inject++;
+      params.leaves_offset++;
+    }
+
+    if (params.number_of_leaves_to_inject) {
+      const size_t first_leaf = params.leaves_offset - params.number_of_leaves_to_inject;
+      size_t rows_offset = params.subtree_idx * params.number_of_rows_padded;
+      size_t actual_layer_rows = leaves[first_leaf].height;
+      // NOTE(review): if `rows_offset > actual_layer_rows` (a subtree that lies entirely in
+      // the padding) this subtraction wraps around and the slice below reads past the end
+      // of the matrix - confirm whether callers can produce that case.
+      params.number_of_rows = std::min(actual_layer_rows - rows_offset, params.number_of_rows_padded);
+
+      // Owned by a vector so the staging buffer is released on every return path,
+      // including the early returns taken by CHK_IF_RETURN.
+      std::vector<Matrix<L>> leaves_info(params.number_of_leaves_to_inject);
+      L* d_leaves_ptr = d_leaves;
+      for (unsigned int i = 0; i < params.number_of_leaves_to_inject; i++) {
+        const Matrix<L>& leaf = leaves[first_leaf + i];
+        if (!params.are_inputs_on_device) {
+          CHK_IF_RETURN(cudaMemcpyAsync(
+            d_leaves_ptr, leaf.values + rows_offset * leaf.width, params.number_of_rows * leaf.width * sizeof(L),
+            cudaMemcpyHostToDevice, params.ctx->stream));
+        } else {
+          // Inputs already live on the device: reference the slice in place
+          d_leaves_ptr = leaf.values + rows_offset * leaf.width;
+        }
+
+        leaves_info[i] = {d_leaves_ptr, leaf.width, params.number_of_rows};
+        d_leaves_ptr += params.number_of_rows * leaf.width;
+      }
+      CHK_IF_RETURN(cudaMemcpyAsync(
+        d_leaves_info, leaves_info.data(), params.number_of_leaves_to_inject * sizeof(Matrix<L>),
+        cudaMemcpyHostToDevice, params.ctx->stream));
+    }
+
+    return CHK_LAST();
+  }
+
+  /// Copies the current layer into the resulting digests array when that layer is kept.
+  /// A layer is kept when `keep_rows` is zero or when the remaining subtree height is
+  /// below `keep_rows` (plus one in caps mode). The destination is addressed with the
+  /// segments model: the current segment offset plus this subtree's slot in the layer.
+  /// After a copy the segment offset advances by one segment.
+  template <typename L, typename D>
+  cudaError_t maybe_copy_digests(D* digests, L* big_tree_digests, SubtreeParams<L, D>& params)
+  {
+    const bool layer_is_kept =
+      !params.keep_rows || params.subtree_height < params.keep_rows + (int)params.caps_mode;
+    if (!layer_is_kept) return CHK_LAST();
+
+    const size_t layer_elements = params.number_of_rows_padded * params.digest_elements;
+    D* destination = big_tree_digests + params.segment_offset + params.subtree_idx * layer_elements;
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      destination, digests, layer_elements * sizeof(D), cudaMemcpyDeviceToHost, params.ctx->stream));
+    params.segment_offset += params.segment_size;
+
+    return CHK_LAST();
+  }
+
+  /// Produces `next_layer` from `prev_layer` for the layer described by `params`.
+  /// Matrices whose padded height matches this layer are sliced first; if there are any,
+  /// they are injected while compressing, otherwise the layer is compressed on its own.
+  template <typename L, typename D>
+  cudaError_t fold_layer(
+    const std::vector<Matrix<L>>& leaves,
+    D* prev_layer,
+    D* next_layer,
+    L* aux_leaves_mem,
+    Matrix<L>* d_leaves_info,
+    SubtreeParams<L, D>& params)
+  {
+    CHK_IF_RETURN(slice_and_copy_leaves<L>(leaves, aux_leaves_mem, d_leaves_info, params));
+
+    const SpongeHasher<L, D>& compressor = *params.compression;
+    const device_context::DeviceContext& context = *params.ctx;
+
+    if (!params.number_of_leaves_to_inject) {
+      // Nothing to inject at this height: plain compression of the previous layer
+      CHK_IF_RETURN(compressor.run_hash_many_kernel(
+        prev_layer, next_layer, params.number_of_rows_padded, compressor.width, params.digest_elements, context));
+      return CHK_LAST();
+    }
+
+    CHK_IF_RETURN(compressor.compress_and_inject(
+      d_leaves_info, params.number_of_leaves_to_inject, params.number_of_rows, prev_layer, next_layer,
+      params.digest_elements, context));
+    return CHK_LAST();
+  }
+
+  /**
+   * Builds one subtree: hashes its bottom-layer rows and then folds the layers upwards
+   * until the padded row count reaches zero, copying out every layer that must be kept.
+   *
+   * @param leaves Input matrices, forwarded to `slice_and_copy_leaves`
+   * @param d_leaves Scratch buffer for the bottom-layer rows; reused as a digest buffer
+   *                 once the rows have been hashed
+   * @param states Buffer that receives the bottom-layer digests
+   * @param aux_leaves_mem Scratch buffer for the rows injected at upper layers
+   * @param big_tree_digests Destination of the kept layers
+   * @param params Subtree state; updated in place while the layers are folded
+   * @return The error of the first failing call, otherwise the result of `CHK_LAST()`
+   */
+  template <typename L, typename D>
+  cudaError_t build_mmcs_subtree(
+    const std::vector<Matrix<L>>& leaves,
+    L* d_leaves,
+    D* states,
+    L* aux_leaves_mem,
+    L* big_tree_digests,
+    SubtreeParams<L, D>& params)
+  {
+    // Leaves info
+    Matrix<L>* d_leaves_info;
+    CHK_IF_RETURN(cudaMallocAsync(&d_leaves_info, params.number_of_inputs * sizeof(Matrix<L>), params.ctx->stream));
+
+    CHK_IF_RETURN(slice_and_copy_leaves(leaves, d_leaves, d_leaves_info, params));
+
+    // Reuse leaves memory
+    D* digests = (D*)d_leaves;
+
+    CHK_IF_RETURN(hash_leaves(
+      d_leaves_info, params.number_of_leaves_to_inject, params.number_of_rows, states, params.digest_elements,
+      *params.hasher, *params.ctx));
+
+    // The bottom-layer digests were written to `states`, so that is the buffer to copy out.
+    // `digests` still aliases the raw leaves at this point.
+    CHK_IF_RETURN(maybe_copy_digests(states, big_tree_digests, params));
+
+    params.number_of_rows_padded /= params.arity;
+    params.segment_size /= params.arity;
+    params.subtree_height--;
+
+    // Ping-pong between the two buffers: the output of one layer is the input of the next
+    D* prev_layer = states;
+    D* next_layer = digests;
+    while (params.number_of_rows_padded > 0) {
+      CHK_IF_RETURN(fold_layer(leaves, prev_layer, next_layer, aux_leaves_mem, d_leaves_info, params));
+      CHK_IF_RETURN(maybe_copy_digests(next_layer, big_tree_digests, params));
+      swap<D>(&prev_layer, &next_layer);
+      params.segment_size /= params.arity;
+      params.subtree_height--;
+      params.number_of_rows_padded /= params.arity;
+    }
+
+    // Release the descriptors in stream order, after the work queued above.
+    // NOTE(review): assumes the hashers queue their work on `params.ctx->stream` - confirm.
+    CHK_IF_RETURN(cudaFreeAsync(d_leaves_info, params.ctx->stream));
+    return CHK_LAST();
+  }
+
+  /**
+   * Commits to a batch of matrices of possibly different heights with one Merkle tree.
+   *
+   * The matrices are sorted by height, tallest first. The tallest ones are hashed with
+   * `hasher` to form the bottom layer; shorter ones are injected while folding, at the
+   * layer whose padded row count matches their own. The tree is split into subtrees until
+   * one subtree fits in `STREAM_CHUNK_SIZE` bytes. The subtrees are built on a pool of
+   * streams and the remaining top of the tree is then finished on the stream of
+   * `tree_config.ctx`.
+   *
+   * @param inputs Array of `number_of_inputs` matrices
+   * @param number_of_inputs Number of matrices; must not be zero
+   * @param digests Destination of the kept tree layers
+   * @param hasher Hasher for the bottom layer
+   * @param compression Hasher for the upper layers; its `preimage_max_length` must be at
+   *                    least `arity * digest_elements`
+   * @param tree_config Tree configuration (arity, digest size, rows to keep, context, ...)
+   * @return The error of the first failing CUDA call, otherwise the result of `CHK_LAST()`
+   *         (or of the final stream synchronization when `tree_config.is_async` is false).
+   *         Invalid arguments and insufficient GPU memory are reported through
+   *         `THROW_ICICLE_ERR`.
+   */
+  template <typename L, typename D>
+  cudaError_t mmcs_commit(
+    const Matrix<L>* inputs,
+    const unsigned int number_of_inputs,
+    D* digests,
+    const SpongeHasher<L, D>& hasher,
+    const SpongeHasher<L, D>& compression,
+    const TreeBuilderConfig& tree_config)
+  {
+    CHK_INIT_IF_RETURN();
+    cudaStream_t& stream = tree_config.ctx.stream;
+
+    if (number_of_inputs == 0) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "No matrices provided");
+
+    if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument,
+        "Hash max preimage length does not match merkle tree arity multiplied by digest elements");
+
+    // Tallest matrices first: the layers are consumed from the bottom of the tree upwards
+    std::vector<Matrix<L>> sorted_inputs(number_of_inputs);
+    std::partial_sort_copy(
+      inputs, inputs + number_of_inputs, sorted_inputs.begin(), sorted_inputs.end(),
+      [](const Matrix<L>& left, const Matrix<L>& right) { return left.height > right.height; });
+
+    // Check that any two matrices whose heights round up to the same
+    // power of two have exactly the same height
+    for (unsigned int i = 0; i < number_of_inputs - 1; i++) {
+      unsigned int left = sorted_inputs[i].height;
+      unsigned int right = sorted_inputs[i + 1].height;
+
+      if (next_pow_of_two(left) == next_pow_of_two(right) && left != right)
+        THROW_ICICLE_ERR(
+          IcicleError_t::InvalidArgument, "Matrix heights that round up to the same power of two must be equal");
+    }
+
+    uint64_t max_height = sorted_inputs[0].height;
+
+    // Calculate maximum additional memory needed for injected matrices
+    uint64_t max_aux_total_elements = 0;
+    uint64_t current_aux_total_elements = 0;
+    uint64_t current_height = 0;
+    uint64_t bottom_layer_leaves_elements = 0;
+    if (!tree_config.are_inputs_on_device) {
+      for (auto it = sorted_inputs.begin(); it < sorted_inputs.end(); it++) {
+        if (it->height == max_height) {
+          bottom_layer_leaves_elements += it->height * it->width;
+          continue;
+        }
+
+        if (it->height != current_height) {
+          current_height = it->height;
+          current_aux_total_elements = 0;
+        }
+
+        current_aux_total_elements += it->width * it->height;
+        if (current_aux_total_elements > max_aux_total_elements) {
+          max_aux_total_elements = current_aux_total_elements;
+        }
+      }
+    }
+
+    uint64_t number_of_bottom_layer_rows = next_pow_of_two(max_height);
+    size_t leaves_info_memory = number_of_inputs * sizeof(Matrix<L>);
+
+    unsigned int tree_height = get_height(number_of_bottom_layer_rows);
+
+    // This will determine how much splitting do we need to do
+    // `number_of_streams` subtrees should fit in the device
+    // This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
+    uint64_t number_of_subtrees = 1;
+    uint64_t subtree_height = tree_height;
+    uint64_t subtree_bottom_layer_rows = number_of_bottom_layer_rows;
+    uint64_t subtree_states_size = subtree_bottom_layer_rows * hasher.width;
+    uint64_t subtree_digests_size = subtree_bottom_layer_rows * tree_config.digest_elements;
+    uint64_t subtree_leaves_elements = 0;
+    for (unsigned int i = 0; i < number_of_inputs && sorted_inputs[i].height == max_height; i++) {
+      subtree_leaves_elements += sorted_inputs[i].width * sorted_inputs[i].height;
+    }
+    uint64_t subtree_aux_elements = max_aux_total_elements;
+
+    // The leaves buffer is reused for digests, so it must fit whichever of the two is larger
+    size_t subtree_leaves_memory = std::max(subtree_leaves_elements * sizeof(L), subtree_digests_size * sizeof(D));
+    size_t subtree_memory_required =
+      sizeof(D) * subtree_states_size + subtree_leaves_memory + subtree_aux_elements * sizeof(L) + leaves_info_memory;
+    while (subtree_memory_required > STREAM_CHUNK_SIZE) {
+      number_of_subtrees *= tree_config.arity;
+      subtree_height--;
+      subtree_bottom_layer_rows /= tree_config.arity;
+      subtree_states_size /= tree_config.arity;
+      subtree_digests_size /= tree_config.arity;
+      subtree_leaves_elements /= tree_config.arity;
+      subtree_aux_elements /= tree_config.arity;
+      subtree_leaves_memory = std::max(subtree_leaves_elements * sizeof(L), subtree_digests_size * sizeof(D));
+      subtree_memory_required =
+        sizeof(D) * subtree_states_size + subtree_leaves_memory + subtree_aux_elements * sizeof(L) + leaves_info_memory;
+    }
+    unsigned int cap_height = tree_height - subtree_height;
+    size_t caps_len = pow(tree_config.arity, cap_height) * tree_config.digest_elements;
+
+    size_t available_memory, _total_memory;
+    CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
+    if (available_memory < (GIGA / 8 + STREAM_CHUNK_SIZE)) {
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument,
+        "Not enough GPU memory to build a tree. At least 1.125 GB of GPU memory required");
+    }
+    available_memory -= GIGA / 8; // Leave 128 MB just in case
+
+    // We can effectively parallelize memory copy with streams
+    // as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
+    const size_t number_of_streams = std::min((uint64_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
+    std::vector<cudaStream_t> streams(number_of_streams);
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
+    }
+
+    // If keep_rows is smaller than the remaining top-tree height
+    // we need to allocate additional memory to store the roots
+    // of subtrees, in order to proceed from there
+    bool caps_mode = tree_config.keep_rows && tree_config.keep_rows <= cap_height;
+    D* caps = nullptr;
+    if (caps_mode) { caps = static_cast<D*>(malloc(caps_len * sizeof(D))); }
+
+#ifdef MERKLE_DEBUG
+    std::cout << "MMCS DEBUG" << std::endl;
+    std::cout << "====================================" << std::endl;
+    std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Number of streams = " << number_of_streams << std::endl;
+    std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
+    std::cout << "Height of a subtree = " << subtree_height << std::endl;
+    std::cout << "Cutoff height = " << tree_height - subtree_height << std::endl;
+    std::cout << "Number of leaves in a subtree = " << subtree_bottom_layer_rows << std::endl;
+    std::cout << "State of a subtree = " << subtree_states_size << std::endl;
+    std::cout << "Digest elements for a subtree = " << subtree_digests_size << std::endl;
+    std::cout << "Size of 1 subtree states = " << subtree_states_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
+    std::cout << "Cap height = " << cap_height << std::endl;
+    std::cout << "Enabling caps mode? " << caps_mode << std::endl;
+
+    std::cout << "Allocating " << subtree_states_size * number_of_streams << " elements for states" << std::endl;
+    std::cout << "Allocating " << subtree_leaves_memory * number_of_streams << " bytes for leaves" << std::endl;
+    std::cout << "Allocating " << subtree_aux_elements * number_of_streams << " elements for aux leaves" << std::endl;
+    std::cout << std::endl;
+#endif
+
+    // Allocate memory for the states, injected leaves (aux) and digests
+    // These are shared by streams in a pool
+    D* states_ptr;
+    L *aux_ptr = nullptr, *leaves_ptr;
+    CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_states_size * number_of_streams * sizeof(D), stream));
+    CHK_IF_RETURN(cudaMemsetAsync(states_ptr, 0, subtree_states_size * number_of_streams * sizeof(D), stream));
+    CHK_IF_RETURN(cudaMallocAsync(&leaves_ptr, subtree_leaves_memory * number_of_streams, stream));
+    CHK_IF_RETURN(cudaMallocAsync(&aux_ptr, subtree_aux_elements * number_of_streams * sizeof(L), stream));
+    // Wait for these allocations to finish
+    CHK_IF_RETURN(cudaStreamSynchronize(stream));
+
+    // Build subtrees in parallel. This for loop invokes kernels that can run in a pool of size `number_of_streams`
+    for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
+      size_t stream_idx = subtree_idx % number_of_streams;
+      cudaStream_t subtree_stream = streams[stream_idx];
+
+      // Every stream of the pool owns one slot in each of the shared buffers
+      D* subtree_state = states_ptr + stream_idx * subtree_states_size;
+      L* subtree_leaves = (L*)((unsigned char*)leaves_ptr + stream_idx * subtree_leaves_memory);
+      L* subtree_aux = aux_ptr + stream_idx * subtree_aux_elements;
+
+      unsigned int subtree_keep_rows = 0;
+      if (tree_config.keep_rows) {
+        int diff = tree_config.keep_rows - cap_height;
+        subtree_keep_rows = std::max(1, diff);
+      }
+      device_context::DeviceContext subtree_context{subtree_stream, tree_config.ctx.device_id, tree_config.ctx.mempool};
+
+      SubtreeParams<L, D> params = {};
+
+      params.number_of_inputs = number_of_inputs;
+      params.arity = tree_config.arity;
+      params.digest_elements = tree_config.digest_elements;
+      params.number_of_rows = subtree_bottom_layer_rows;
+      params.number_of_rows_padded = subtree_bottom_layer_rows;
+
+      params.subtree_idx = subtree_idx;
+      params.subtree_height = subtree_height;
+      params.number_of_subtrees = number_of_subtrees;
+
+      params.segment_size = number_of_bottom_layer_rows * tree_config.digest_elements;
+      params.keep_rows = subtree_keep_rows;
+      params.are_inputs_on_device = tree_config.are_inputs_on_device;
+      params.hasher = &hasher;
+      params.compression = &compression;
+      params.ctx = &subtree_context;
+
+      cudaError_t subtree_result = build_mmcs_subtree<L, D>(
+        sorted_inputs,
+        subtree_leaves,             // d_leaves
+        subtree_state,              // states
+        subtree_aux,                // aux_leaves_mem
+        caps_mode ? caps : digests, // big_tree_digests
+        params                      // params
+      );
+      CHK_IF_RETURN(subtree_result);
+    }
+
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
+    }
+
+    // Finish the top-level tree if any
+    if (cap_height > 0) {
+      D* digests_ptr = (D*)leaves_ptr;
+      size_t start_segment_size = caps_len / tree_config.arity;
+      size_t start_segment_offset = 0;
+      if (!caps_mode) { // Calculate offset
+        size_t keep_rows = tree_config.keep_rows ? tree_config.keep_rows : tree_height + 1;
+        size_t layer_size = pow(tree_config.arity, keep_rows - 1) * tree_config.digest_elements;
+        for (int i = 0; i < keep_rows - cap_height; i++) {
+          start_segment_offset += layer_size;
+          layer_size /= tree_config.arity;
+        }
+      }
+
+      // Load the subtree roots back onto the device as the input of the top tree
+      CHK_IF_RETURN(cudaMemcpyAsync(
+        states_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
+        (caps_mode || !tree_config.are_outputs_on_device) ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, stream));
+
+      uint64_t number_of_states = caps_len / tree_config.arity / tree_config.digest_elements;
+      Matrix<L>* d_leaves_info;
+      CHK_IF_RETURN(cudaMallocAsync(&d_leaves_info, number_of_inputs * sizeof(Matrix<L>), tree_config.ctx.stream));
+
+      SubtreeParams<L, D> top_params = {};
+
+      top_params.number_of_inputs = number_of_inputs;
+      top_params.arity = tree_config.arity;
+      top_params.digest_elements = tree_config.digest_elements;
+      top_params.number_of_rows = number_of_states;
+      top_params.number_of_rows_padded = number_of_states;
+
+      top_params.subtree_height = cap_height;
+      top_params.number_of_subtrees = 1;
+
+      top_params.segment_offset = start_segment_offset;
+      top_params.segment_size = start_segment_size;
+      top_params.keep_rows = tree_config.keep_rows;
+      top_params.are_inputs_on_device = tree_config.are_inputs_on_device;
+      top_params.caps_mode = caps_mode;
+      top_params.hasher = &hasher;
+      top_params.compression = &compression;
+      top_params.ctx = &tree_config.ctx;
+
+      D* prev_layer = states_ptr;
+      D* next_layer = digests_ptr;
+      while (top_params.number_of_rows_padded > 0) {
+        CHK_IF_RETURN(fold_layer(sorted_inputs, prev_layer, next_layer, aux_ptr, d_leaves_info, top_params));
+        CHK_IF_RETURN(maybe_copy_digests(next_layer, digests, top_params));
+        swap<D>(&prev_layer, &next_layer);
+        top_params.segment_size /= top_params.arity;
+        top_params.subtree_height--;
+        top_params.number_of_rows_padded /= top_params.arity;
+      }
+
+      // Release the top-tree descriptors in stream order, after the folds queued above
+      CHK_IF_RETURN(cudaFreeAsync(d_leaves_info, tree_config.ctx.stream));
+    }
+
+    if (caps_mode) { free(caps); }
+    CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
+    CHK_IF_RETURN(cudaFreeAsync(leaves_ptr, stream));
+    // The aux buffer may have been requested with a size of zero, so only free a real allocation
+    if (aux_ptr != nullptr) { CHK_IF_RETURN(cudaFreeAsync(aux_ptr, stream)); }
+    for (size_t i = 0; i < number_of_streams; i++) {
+      CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
+    }
+    if (!tree_config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
+    return CHK_LAST();
+  }
+
+} // namespace merkle_tree
--- a/icicle/src/merkle-tree/tests/merkle/.gitignore
+++ b/icicle/src/merkle-tree/tests/merkle/.gitignore
@@ -0,0 +1,7 @@
+merkle.o
+poseidon2.o
+test_merkle_poseidon2
+merkle_bls.o
+poseidon.o
+test_merkle_poseidon
+test_merkle
--- a/icicle/src/merkle-tree/tests/merkle/Makefile
+++ b/icicle/src/merkle-tree/tests/merkle/Makefile
@@ -0,0 +1,23 @@
+# Builds and runs the Poseidon (bls12_381) merkle tree test
+test_merkle_poseidon: poseidon.o merkle_bls.o
+	nvcc -o test_merkle_poseidon -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -DMERKLE_DEBUG poseidon.o merkle_bls.o test.cu
+	./test_merkle_poseidon
+
+merkle_bls.o: ../../extern.cu ../../merkle.cu
+	nvcc -o merkle_bls.o -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -DMERKLE_DEBUG -c ../../extern.cu
+
+poseidon.o: ../../../poseidon/extern.cu
+	nvcc -o poseidon.o -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -c ../../../poseidon/extern.cu
+
+
+# Builds and runs the Poseidon2 (babybear) merkle tree test
+test_merkle: poseidon2.o merkle.o
+	nvcc -o test_merkle -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG poseidon2.o merkle.o test_poseidon2.cu
+	./test_merkle
+
+merkle.o: ../../extern.cu ../../merkle.cu
+	nvcc -o merkle.o -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG -c ../../extern.cu
+
+poseidon2.o: ../../../poseidon2/extern.cu
+	nvcc -o poseidon2.o -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -c ../../../poseidon2/extern.cu
+
+# Removes every object file and test binary produced by the targets above
+.PHONY: clear
+clear:
+	rm -f poseidon2.o merkle.o test_merkle merkle_bls.o poseidon.o test_merkle_poseidon
--- a/icicle/src/merkle-tree/tests/merkle/test.cu
+++ b/icicle/src/merkle-tree/tests/merkle/test.cu
@@ -1,10 +1,3 @@
-// #define DEBUG
-#define MERKLE_DEBUG
-
-#include "curves/curve_config.cuh"
-#include "../poseidon.cu"
-#include "merkle.cu"
-
 #ifndef __CUDA_ARCH__
 #include <cassert>
 #include <chrono>
@@ -12,15 +5,18 @@
 #include <iostream>
 #include <math.h>

-using namespace poseidon;
-using namespace merkle;
-using namespace curve_config;
-using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#include "merkle-tree/merkle.cuh"
+
+#include "poseidon/poseidon.cuh"
+
+#include "api/bls12_381.h"
+using namespace bls12_381;

 // Arity
 #define A 2
 #define T (A + 1)

+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
 #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
 #define END_TIMER(timer, msg)                                                                                          \
  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
@@ -30,24 +26,24 @@ int main(int argc, char* argv[])
  // Load poseidon constants
  START_TIMER(timer_const);
  device_context::DeviceContext ctx = device_context::get_default_device_context();
-  PoseidonConstants<scalar_t> constants;
-  init_optimized_poseidon_constants<scalar_t>(A, ctx, &constants);
+  poseidon::Poseidon<scalar_t> poseidon(A, ctx);
  END_TIMER(timer_const, "Load poseidon constants");

  /// Tree of height N and arity A contains \sum{A^i} for i in 0..N-1 elements
-  uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 28;
-  uint32_t number_of_leaves = pow(A, (tree_height - 1));
+  uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 26;
+  uint32_t number_of_leaves = pow(A, tree_height);
+  uint32_t total_number_of_leaves = number_of_leaves * A;

  /// Use keep_rows to specify how many rows do you want to store
  int keep_rows = argc > 2 ? atoi(argv[2]) : 7;
-  size_t digests_len = get_digests_len<scalar_t>(keep_rows + 1, A);
+  size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, A, 1);

-  /// Fill leaves with scalars [0, 1, ... 2^{tree_height - 1} - 1]
+  /// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
  START_TIMER(timer_allocation);
  scalar_t input = scalar_t::zero();
-  size_t leaves_mem = number_of_leaves * sizeof(scalar_t);
+  size_t leaves_mem = total_number_of_leaves * sizeof(scalar_t);
  scalar_t* leaves = static_cast<scalar_t*>(malloc(leaves_mem));
-  for (uint32_t i = 0; i < number_of_leaves; i++) {
+  for (uint32_t i = 0; i < total_number_of_leaves; i++) {
    leaves[i] = input;
    input = input + scalar_t::one();
  }
@@ -62,6 +58,7 @@ int main(int argc, char* argv[])
  std::cout << "Memory for leaves = " << leaves_mem / 1024 / 1024 << " MB; " << leaves_mem / 1024 / 1024 / 1024 << " GB"
            << std::endl;
  std::cout << "Number of leaves = " << number_of_leaves << std::endl;
+  std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
  std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
            << " GB" << std::endl;
  std::cout << "Number of digest elements = " << digests_len << std::endl;
@@ -69,12 +66,17 @@ int main(int argc, char* argv[])
  std::cout << "Total RAM consumption = " << (digests_mem + leaves_mem) / 1024 / 1024 << " MB; "
            << (digests_mem + leaves_mem) / 1024 / 1024 / 1024 << " GB" << std::endl;

-  TreeBuilderConfig config = default_merkle_config();
-  config.keep_rows = keep_rows;
+  merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
+  tree_config.arity = 2;
+  tree_config.keep_rows = keep_rows;
  START_TIMER(timer_merkle);
-  build_merkle_tree<scalar_t, T>(leaves, digests, tree_height, constants, config);
+  bls12_381_build_merkle_tree(leaves, digests, tree_height, A, &poseidon, &poseidon, tree_config);
  END_TIMER(timer_merkle, "Merkle tree built: ")

+  for (int i = 0; i < digests_len; i++) {
+    std::cout << digests[i] << std::endl;
+  }
+
  // Use this to generate test vectors
  // for (int i = 0; i < digests_len; i++) {
  //   std::cout << "{";
--- a/icicle/src/merkle-tree/tests/merkle/test_poseidon2.cu
+++ b/icicle/src/merkle-tree/tests/merkle/test_poseidon2.cu
@@ -0,0 +1,108 @@
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+
+#include "merkle-tree/merkle.cuh"
+
+#include "poseidon2/poseidon2.cuh"
+
+#include "api/babybear.h"
+using namespace babybear;
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+int main(int argc, char* argv[])
+{
+  // Builds a Poseidon2 Merkle tree over consecutive babybear scalars and checks the kept digests against a
+  // precomputed reference vector. Usage: <binary> [tree_height] [keep_rows]
+  /// Tree of height N and arity A contains \sum{A^i} for i in 0..N elements
+  uint32_t tree_arity = 2;
+  uint32_t width = 16;
+  uint32_t input_block_len = 8;
+  uint32_t digest_elements = 8;
+  uint64_t tree_height = argc > 1 ? atoi(argv[1]) : 23;
+  uint64_t number_of_leaves = pow(tree_arity, tree_height);
+  uint64_t total_number_of_leaves = number_of_leaves * input_block_len;
+
+  // Load poseidon constants
+  START_TIMER(timer_const);
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  poseidon2::Poseidon2<scalar_t> poseidon(
+    width, input_block_len, poseidon2::MdsType::DEFAULT_MDS, poseidon2::DiffusionStrategy::DEFAULT_DIFFUSION, ctx);
+  END_TIMER(timer_const, "Load poseidon constants");
+
+  /// Use keep_rows to specify how many rows do you want to store
+  int keep_rows = argc > 2 ? atoi(argv[2]) : 3;
+  size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, tree_arity, digest_elements);
+
+  /// Fill leaves with scalars [0, 1, ... total_number_of_leaves - 1] (input_block_len scalars per leaf)
+  START_TIMER(timer_allocation);
+  scalar_t input = scalar_t::zero();
+  size_t leaves_mem = total_number_of_leaves * sizeof(scalar_t);
+  scalar_t* leaves = static_cast<scalar_t*>(malloc(leaves_mem));
+  for (uint64_t i = 0; i < total_number_of_leaves; i++) {
+    leaves[i] = input;
+    input = input + scalar_t::one();
+  }
+  END_TIMER(timer_allocation, "Allocated memory for leaves: ");
+
+  /// Allocate memory for digests of {keep_rows} rows of a tree
+  START_TIMER(timer_digests);
+  size_t digests_mem = digests_len * sizeof(scalar_t);
+  scalar_t* digests = static_cast<scalar_t*>(malloc(digests_mem));
+  END_TIMER(timer_digests, "Allocated memory for digests");
+
+  std::cout << "Memory for leaves = " << leaves_mem / 1024 / 1024 << " MB; " << leaves_mem / 1024 / 1024 / 1024 << " GB"
+            << std::endl;
+  std::cout << "Number of leaves = " << number_of_leaves << std::endl;
+  std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
+  std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
+            << " GB" << std::endl;
+  std::cout << "Number of digest elements = " << digests_len << std::endl;
+
+  std::cout << "Total RAM consumption = " << (digests_mem + leaves_mem) / 1024 / 1024 << " MB; "
+            << (digests_mem + leaves_mem) / 1024 / 1024 / 1024 << " GB" << std::endl;
+
+  merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
+  tree_config.arity = tree_arity;
+  tree_config.keep_rows = keep_rows;
+  tree_config.digest_elements = digest_elements;
+  START_TIMER(timer_merkle);
+  babybear_build_merkle_tree(leaves, digests, tree_height, input_block_len, &poseidon, &poseidon, tree_config);
+  END_TIMER(timer_merkle, "Merkle tree built: ")
+
+  // Use this to generate test vectors
+  // for (int i = 0; i < digests_len; i++) {
+  //   std::cout << "{";
+  //   for (int j = 0; j < 1; j++) {
+  //     std::cout << ((uint32_t*)&digests[i].limbs_storage)[j];
+  //   }
+  //   std::cout << "}," << std::endl;
+  // }
+
+  scalar_t expected[64] = {
+    {1198029810}, {1114813365}, {241588005},  {735332587},  {201392606},  {623383436},  {60086186},   {1225304654},
+    {1501472115}, {891216097},  {184481194},  {855632748},  {1503541944}, {1483537725}, {1023563730}, {698957505},
+    {1322038939}, {1132881200}, {104782797},  {68847168},   {420051722},  {126069919},  {1350263697}, {1711085395},
+    {1322038939}, {1132881200}, {104782797},  {68847168},   {420051722},  {126069919},  {1350263697}, {1711085395},
+    {1019525203}, {127215304},  {1199733491}, {1473997036}, {548538385},  {364347137},  {570748364},  {426431873},
+    {926562920},  {6278762},    {1894248581}, {1304248433}, {1635020421}, {719342960},  {1373719279}, {700539301},
+    {708916911},  {925660920},  {994927540},  {1925434995}, {208534303},  {69614512},   {1701199215}, {1825115630}};
+
+  // keep_rows comes from argv, so digests_len can exceed the size of `expected`; clamp the comparison to the
+  // reference vector instead of reading past its end. (The values presumably match the default arguments only.)
+  const size_t expected_len = sizeof(expected) / sizeof(expected[0]);
+  for (size_t i = 0; i < digests_len && i < expected_len; i++) {
+    assert(digests[i] == expected[i]);
+  }
+  free(digests);
+  free(leaves);
+}
+
+#endif
--- a/icicle/src/merkle-tree/tests/mmcs/.gitignore
+++ b/icicle/src/merkle-tree/tests/mmcs/.gitignore
@@ -0,0 +1,4 @@
+mmcs.o
+poseidon2.o
+test_mmcs_poseidon2
+vec_ops.o
--- a/icicle/src/merkle-tree/tests/mmcs/Makefile
+++ b/icicle/src/merkle-tree/tests/mmcs/Makefile
@@ -0,0 +1,15 @@
+test_merkle: poseidon2.o mmcs.o vec_ops.o
+	nvcc -o test_mmcs_poseidon2 -lineinfo -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG poseidon2.o vec_ops.o mmcs.o test_poseidon2.cu
+	./test_mmcs_poseidon2
+
+mmcs.o: ../../extern_mmcs.cu ../../mmcs.cu
+	nvcc -o mmcs.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG -c ../../extern_mmcs.cu
+
+poseidon2.o: ../../../poseidon2/extern.cu
+	nvcc -o poseidon2.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -c ../../../poseidon2/extern.cu
+
+vec_ops.o:
+	nvcc -o vec_ops.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -c ../../../vec_ops/extern.cu
+
+clear:
+	rm -f poseidon2.o mmcs.o vec_ops.o test_mmcs_poseidon2
--- a/icicle/src/merkle-tree/tests/mmcs/test_poseidon2.cu
+++ b/icicle/src/merkle-tree/tests/mmcs/test_poseidon2.cu
@@ -0,0 +1,139 @@
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+
+#include "merkle-tree/merkle.cuh"
+
+#include "poseidon2/poseidon2.cuh"
+
+#include "api/babybear.h"
+using namespace babybear;
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+int main(int argc, char* argv[])
+{
+  // Commits matrices of consecutive babybear scalars with the Poseidon2 MMCS builder and prints the last digests.
+  /// Tree of height N and arity A contains \sum{A^i} for i in 0..N elements
+  uint32_t tree_arity = 2;
+  uint32_t width = 16;
+  uint32_t input_block_len = 600;
+  uint32_t rate = 8;
+  uint32_t digest_elements = 8;
+  uint32_t copied_matrices = 1;
+  uint64_t tree_height = argc > 1 ? atoi(argv[1]) : 3;
+  uint64_t number_of_leaves = pow(tree_arity, tree_height);
+  uint64_t total_number_of_leaves = number_of_leaves * input_block_len;
+
+  bool are_inputs_on_device = true;
+
+  // Load poseidon constants
+  START_TIMER(timer_const);
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  poseidon2::Poseidon2<scalar_t> poseidon(
+    width, rate, poseidon2::MdsType::PLONKY, poseidon2::DiffusionStrategy::MONTGOMERY, ctx);
+  END_TIMER(timer_const, "Load poseidon constants");
+
+  /// Use keep_rows to specify how many rows do you want to store
+  int keep_rows = argc > 2 ? atoi(argv[2]) : 3;
+  size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, tree_arity, digest_elements);
+
+  /// Fill every matrix with consecutive scalars 0, 1, 2, ... (the counter keeps running across matrices)
+  START_TIMER(timer_allocation);
+  scalar_t input = scalar_t::zero();
+  unsigned int number_of_inputs = 1;
+  // leaves[] is indexed by i * copied_matrices + j below, so it needs one slot per (input, copy) pair
+  unsigned int number_of_matrices = number_of_inputs * copied_matrices;
+  Matrix<scalar_t>* leaves = static_cast<Matrix<scalar_t>*>(malloc(number_of_matrices * sizeof(Matrix<scalar_t>)));
+  // Owning pointers of the matrix buffers, tracked separately so they can be released after the commit
+  scalar_t** buffers = static_cast<scalar_t**>(malloc(number_of_matrices * sizeof(scalar_t*)));
+  uint64_t current_matrix_rows = number_of_leaves;
+  for (unsigned int i = 0; i < number_of_inputs; i++) {
+    uint64_t current_matrix_size = current_matrix_rows * input_block_len;
+    const size_t matrix_bytes = current_matrix_size * sizeof(scalar_t);
+    for (unsigned int j = 0; j < copied_matrices; j++) {
+      scalar_t* matrix = static_cast<scalar_t*>(malloc(matrix_bytes));
+
+      for (uint64_t k = 0; k < current_matrix_size; k++) {
+        matrix[k] = input;
+        input = input + scalar_t::one();
+      }
+
+      if (are_inputs_on_device) {
+        scalar_t* d_matrix = nullptr;
+        cudaError_t err = cudaMalloc(&d_matrix, matrix_bytes);
+        if (err == cudaSuccess) err = cudaMemcpy(d_matrix, matrix, matrix_bytes, cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) {
+          std::cerr << "Failed to upload matrix: " << cudaGetErrorString(err) << std::endl;
+          return 1;
+        }
+        free(matrix); // the host staging copy is no longer needed once the data is on the device
+        matrix = d_matrix;
+      }
+
+      buffers[i * copied_matrices + j] = matrix;
+      leaves[i * copied_matrices + j] = {
+        matrix,
+        input_block_len,
+        current_matrix_rows,
+      };
+    }
+
+    current_matrix_rows /= tree_arity;
+  }
+
+  END_TIMER(timer_allocation, "Allocated memory for leaves: ");
+
+  /// Allocate memory for digests of {keep_rows} rows of a tree
+  START_TIMER(timer_digests);
+  size_t digests_mem = digests_len * sizeof(scalar_t);
+  scalar_t* digests = static_cast<scalar_t*>(malloc(digests_mem));
+  END_TIMER(timer_digests, "Allocated memory for digests");
+
+  std::cout << "Number of leaves = " << number_of_leaves << std::endl;
+  std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
+  std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
+            << " GB" << std::endl;
+  std::cout << "Number of digest elements = " << digests_len << std::endl;
+  std::cout << std::endl;
+
+  merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
+  tree_config.are_inputs_on_device = are_inputs_on_device;
+  tree_config.arity = tree_arity;
+  tree_config.keep_rows = keep_rows;
+  tree_config.digest_elements = digest_elements;
+  START_TIMER(timer_merkle);
+  babybear_mmcs_commit_cuda(leaves, number_of_matrices, digests, &poseidon, &poseidon, tree_config);
+  END_TIMER(timer_merkle, "Merkle tree built: ")
+
+  // Print the last (up to) 10 digest elements; the extra bound keeps the index from underflowing on tiny outputs
+  for (size_t i = 0; i < 10 && i < digests_len; i++) {
+    std::cout << digests[digests_len - i - 1] << std::endl;
+  }
+
+  // No reference vector exists for this configuration yet (nothing is asserted); use this to generate test vectors
+  // for (int i = 0; i < digests_len; i++) {
+  //   std::cout << "{";
+  //   for (int j = 0; j < 8; j++) {
+  //     std::cout << ((uint64_t*)&digests[i].limbs_storage)[j];
+  //     if (j != 7) { std::cout << ", "; }
+  //   }
+  //   std::cout << "}," << std::endl;
+  // }
+
+  for (unsigned int m = 0; m < number_of_matrices; m++) {
+    if (are_inputs_on_device) cudaFree(buffers[m]);
+    else free(buffers[m]);
+  }
+  free(buffers);
+  free(digests);
+  free(leaves);
+}
+
+#endif
--- a/icicle/src/poseidon/Makefile
+++ b/icicle/src/poseidon/Makefile
@@ -1,2 +1,5 @@
-test_poseidon : test.cu poseidon.cu kernels.cu constants.cu nvcc - o test_poseidon - I../../ include - DFIELD_ID =
-  2 - DCURVE_ID = 2 test.cu./ test_poseidon
+test_poseidon: test.cu
+	nvcc -o test_poseidon -I../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE_ID=2 -DDEVMODE -DDEBUG extern.cu test.cu
+
+test_poseidon_m31: test_m31.cu
+	nvcc -o test_poseidon_m31 -I../../include -DFIELD=m31 -DFIELD_ID=1003 -DDEVMODE -DDEBUG extern.cu test_m31.cu
--- a/icicle/src/poseidon/constants.cu
+++ b/icicle/src/poseidon/constants.cu
@@ -1,4 +1,5 @@
-#include "poseidon/poseidon.cuh"
+#include "poseidon/constants.cuh"
+#include "gpu-utils/device_context.cuh"

 /// These are pre-calculated constants for different curves
 #include "fields/id.h"
@@ -17,17 +18,25 @@ using namespace poseidon_constants_bw6_761;
 #elif FIELD_ID == GRUMPKIN
 #include "poseidon/constants/grumpkin_poseidon.h"
 using namespace poseidon_constants_grumpkin;
+#elif FIELD_ID == M31
+#include "poseidon/constants/m31_poseidon.h"
+using namespace poseidon_constants_m31;
 #endif

 namespace poseidon {
  template <typename S>
  cudaError_t create_optimized_poseidon_constants(
-    int arity,
-    int full_rounds_half,
-    int partial_rounds,
-    const S* constants,
-    device_context::DeviceContext& ctx,
-    PoseidonConstants<S>* poseidon_constants)
+    unsigned int arity,
+    unsigned int alpha,
+    unsigned int partial_rounds,
+    unsigned int full_rounds_half,
+    const S* round_constants,
+    const S* mds_matrix,
+    const S* non_sparse_matrix,
+    const S* sparse_matrices,
+    const S domain_tag,
+    PoseidonConstants<S>* poseidon_constants,
+    device_context::DeviceContext& ctx)
  {
    CHK_INIT_IF_RETURN();
    cudaStream_t& stream = ctx.stream;
@@ -41,24 +50,33 @@ namespace poseidon {
    S* d_constants;
    CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));

+    S* d_round_constants = d_constants;
+    S* d_mds_matrix = d_round_constants + round_constants_len;
+    S* d_non_sparse_matrix = d_mds_matrix + mds_matrix_len;
+    S* d_sparse_matrices = d_non_sparse_matrix + mds_matrix_len;
+
    // Copy constants
-    CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
-
-    S* round_constants = d_constants;
-    S* mds_matrix = round_constants + round_constants_len;
-    S* non_sparse_matrix = mds_matrix + mds_matrix_len;
-    S* sparse_matrices = non_sparse_matrix + mds_matrix_len;
-
-    // Pick the domain_tag accordinaly
-    // For now, we only support Merkle tree mode
-    uint32_t tree_domain_tag_value = 1;
-    tree_domain_tag_value = (tree_domain_tag_value << (width - 1)) - tree_domain_tag_value;
-    S domain_tag = S::from(tree_domain_tag_value);
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      d_round_constants, round_constants, sizeof(S) * round_constants_len, cudaMemcpyHostToDevice, stream));
+    CHK_IF_RETURN(
+      cudaMemcpyAsync(d_mds_matrix, mds_matrix, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice, stream));
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      d_non_sparse_matrix, non_sparse_matrix, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice, stream));
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      d_sparse_matrices, sparse_matrices, sizeof(S) * sparse_matrices_len, cudaMemcpyHostToDevice, stream));

    // Make sure all the constants have been copied
    CHK_IF_RETURN(cudaStreamSynchronize(stream));
-    *poseidon_constants = {arity,      partial_rounds,    full_rounds_half, round_constants,
-                           mds_matrix, non_sparse_matrix, sparse_matrices,  domain_tag};
+    *poseidon_constants = {
+      arity,
+      alpha,
+      partial_rounds,
+      full_rounds_half,
+      d_round_constants,
+      d_mds_matrix,
+      d_non_sparse_matrix,
+      d_sparse_matrices,
+      domain_tag};

    return CHK_LAST();
  }
@@ -68,8 +86,8 @@ namespace poseidon {
    int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
  {
    CHK_INIT_IF_RETURN();
-    int full_rounds_half = FULL_ROUNDS_DEFAULT;
-    int partial_rounds;
+    unsigned int full_rounds_half = FULL_ROUNDS_DEFAULT;
+    unsigned int partial_rounds;
    unsigned char* constants;
    switch (arity) {
    case 2:
@@ -94,8 +112,41 @@ namespace poseidon {
    }
    S* h_constants = reinterpret_cast<S*>(constants);

-    create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);
+    unsigned int width = arity + 1;
+    unsigned int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
+    unsigned int mds_matrix_len = width * width;
+
+    S* round_constants = h_constants;
+    S* mds_matrix = round_constants + round_constants_len;
+    S* non_sparse_matrix = mds_matrix + mds_matrix_len;
+    S* sparse_matrices = non_sparse_matrix + mds_matrix_len;
+
+    // Pick the domain_tag accordingly
+    // For now, we only support Merkle tree mode
+    uint32_t tree_domain_tag_value = 1;
+    tree_domain_tag_value = (tree_domain_tag_value << (width - 1)) - tree_domain_tag_value;
+    S domain_tag = S::from(tree_domain_tag_value);
+
+    create_optimized_poseidon_constants<S>(
+      arity, 5, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
+      domain_tag, poseidon_constants, ctx);

    return CHK_LAST();
  }
+
+  template <typename S>
+  cudaError_t release_optimized_poseidon_constants(PoseidonConstants<S>* constants, device_context::DeviceContext& ctx)
+  {
+    CHK_INIT_IF_RETURN();
+    CHK_IF_RETURN(cudaFreeAsync(constants->round_constants, ctx.stream));
+    // round_constants is the base of the single allocation holding all four arrays, so that one free covers them
+    constants->arity = 0;
+    constants->partial_rounds = 0;
+    constants->full_rounds_half = 0;
+    constants->round_constants = nullptr;
+    constants->mds_matrix = nullptr;
+    constants->non_sparse_matrix = nullptr;
+    constants->sparse_matrices = nullptr;
+    return CHK_LAST();
+  }
 } // namespace poseidon
--- a/icicle/src/poseidon/extern.cu
+++ b/icicle/src/poseidon/extern.cu
@@ -2,58 +2,68 @@

 using namespace field_config;

-#include "poseidon.cu"
+#include "poseidon/poseidon.cuh"
 #include "constants.cu"

 #include "gpu-utils/device_context.cuh"
 #include "utils/utils.h"

 namespace poseidon {
-  /**
-   * Extern "C" version of [poseidon_hash_cuda] function with the following
-   * value of template parameter (where the field is given by `-DFIELD` env variable during build):
-   *  - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
-   * stand-alone "STARK field";
-   * @return `cudaSuccess` if the execution was successful and an error code otherwise.
-   */
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
-    scalar_t* input,
-    scalar_t* output,
-    int number_of_states,
-    int arity,
-    const PoseidonConstants<scalar_t>& constants,
-    PoseidonConfig& config)
+  typedef class Poseidon<scalar_t> PoseidonInst;
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_create_cuda)(
+    PoseidonInst** poseidon,
+    unsigned int arity,
+    unsigned int alpha,
+    unsigned int partial_rounds,
+    unsigned int full_rounds_half,
+    const scalar_t* round_constants,
+    const scalar_t* mds_matrix,
+    const scalar_t* non_sparse_matrix,
+    const scalar_t* sparse_matrices,
+    const scalar_t& domain_tag,
+    device_context::DeviceContext& ctx)
  {
-    switch (arity) {
-    case 2:
-      return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
-    case 4:
-      return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
-    case 8:
-      return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
-    case 11:
-      return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
-    default:
-      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
+    try {
+      *poseidon = new PoseidonInst(
+        arity, alpha, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
+        domain_tag, ctx);
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown;
    }
-    return CHK_LAST();
  }

-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(
-    int arity,
-    int full_rounds_half,
-    int partial_rounds,
-    const scalar_t* constants,
-    device_context::DeviceContext& ctx,
-    PoseidonConstants<scalar_t>* poseidon_constants)
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_load_cuda)(
+    PoseidonInst** poseidon, unsigned int arity, device_context::DeviceContext& ctx)
  {
-    return create_optimized_poseidon_constants<scalar_t>(
-      arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
+    try {
+      *poseidon = new PoseidonInst(arity, ctx);
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown;
+    }
  }

-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon_constants_cuda)(
-    int arity, device_context::DeviceContext& ctx, PoseidonConstants<scalar_t>* constants)
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_many_cuda)(
+    const PoseidonInst* poseidon,
+    const scalar_t* inputs,
+    scalar_t* output,
+    unsigned int number_of_states,
+    unsigned int input_block_len,
+    unsigned int output_len,
+    const SpongeConfig& cfg)
  {
-    return init_optimized_poseidon_constants<scalar_t>(arity, ctx, constants);
+    return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
+  }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_delete_cuda)(PoseidonInst* poseidon)
+  {
+    try {
+      delete poseidon; // runs ~Poseidon and frees the heap object made by poseidon_create_cuda/poseidon_load_cuda
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown;
+    }
   }
 } // namespace poseidon
--- a/icicle/src/poseidon/poseidon.cu
+++ b/icicle/src/poseidon/poseidon.cu
@@ -1,90 +0,0 @@
-#include "fields/field_config.cuh"
-
-using namespace field_config;
-
-#include "poseidon/poseidon.cuh"
-#include "kernels.cu"
-
-namespace poseidon {
-  template <typename S, int T>
-  cudaError_t
-  permute_many(S* states, size_t number_of_states, const PoseidonConstants<S>& constants, cudaStream_t& stream)
-  {
-    size_t rc_offset = 0;
-
-    full_rounds<S, T><<<
-      PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads,
-      sizeof(S) * PKC<T>::hashes_per_block * T, stream>>>(
-      states, number_of_states, rc_offset, FIRST_FULL_ROUNDS, constants);
-    rc_offset += T * (constants.full_rounds_half + 1);
-
-    partial_rounds<S, T>
-      <<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
-        states, number_of_states, rc_offset, constants);
-    rc_offset += constants.partial_rounds;
-
-    full_rounds<S, T><<<
-      PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads,
-      sizeof(S) * PKC<T>::hashes_per_block * T, stream>>>(
-      states, number_of_states, rc_offset, SECOND_FULL_ROUNDS, constants);
-    return CHK_LAST();
-  }
-
-  template <typename S, int T>
-  cudaError_t poseidon_hash(
-    S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config)
-  {
-    CHK_INIT_IF_RETURN();
-    cudaStream_t& stream = config.ctx.stream;
-    S* states;
-    if (config.input_is_a_state) {
-      states = input;
-    } else {
-      // allocate memory for {number_of_states} states of {t} scalars each
-      CHK_IF_RETURN(cudaMallocAsync(&states, number_of_states * T * sizeof(S), stream))
-
-      // This is where the input matrix of size Arity x NumberOfBlocks is
-      // padded and copied to device in a T x NumberOfBlocks matrix
-      CHK_IF_RETURN(cudaMemcpy2DAsync(
-        states, T * sizeof(S),                 // Device pointer and device pitch
-        input, (T - 1) * sizeof(S),            // Host pointer and pitch
-        (T - 1) * sizeof(S), number_of_states, // Size of the source matrix (Arity x NumberOfBlocks)
-        cudaMemcpyHostToDevice, stream));
-    }
-
-    S* output_device;
-    if (config.are_outputs_on_device) {
-      output_device = output;
-    } else {
-      CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_states * sizeof(S), stream))
-    }
-
-    prepare_poseidon_states<S, T>
-      <<<PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads, 0, stream>>>(
-        states, number_of_states, constants.domain_tag, config.aligned);
-
-    cudaError_t hash_error = permute_many<S, T>(states, number_of_states, constants, stream);
-    CHK_IF_RETURN(hash_error);
-
-    get_hash_results<S, T>
-      <<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
-        states, number_of_states, output_device);
-
-    if (config.loop_state) {
-      copy_recursive<S, T>
-        <<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
-          states, number_of_states, output_device);
-    }
-
-    if (!config.input_is_a_state) CHK_IF_RETURN(cudaFreeAsync(states, stream));
-
-    if (!config.are_outputs_on_device) {
-      CHK_IF_RETURN(
-        cudaMemcpyAsync(output, output_device, number_of_states * sizeof(S), cudaMemcpyDeviceToHost, stream));
-      CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
-    }
-
-    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
-    return CHK_LAST();
-  }
-} // namespace poseidon
--- a/icicle/src/poseidon/test.cu
+++ b/icicle/src/poseidon/test.cu
@@ -4,7 +4,6 @@
 using namespace curve_config;

 #include "gpu-utils/device_context.cuh"
-#include "poseidon.cu"

 #ifndef __CUDA_ARCH__
 #include <cassert>
@@ -12,6 +11,10 @@ using namespace curve_config;
 #include <fstream>
 #include <iostream>

+#include "api/bls12_381.h"
+using namespace bls12_381;
+
+#include "poseidon/poseidon.cuh"
 using namespace poseidon;

 #define A 2
@@ -29,8 +32,7 @@ int main(int argc, char* argv[])
  // Load poseidon constants
  START_TIMER(timer_const);
  device_context::DeviceContext ctx = device_context::get_default_device_context();
-  PoseidonConstants<scalar_t> constants;
-  init_optimized_poseidon_constants<scalar_t>(A, ctx, &constants);
+  Poseidon<scalar_t> poseidon(A, ctx);
  END_TIMER(timer_const, "Load poseidon constants");

  START_TIMER(allocation_timer);
@@ -46,9 +48,10 @@ int main(int argc, char* argv[])

  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));

+  SpongeConfig cfg = default_sponge_config();
+
  START_TIMER(poseidon_timer);
-  PoseidonConfig config = default_poseidon_config(T);
-  poseidon_hash<curve_config::scalar_t, T>(in_ptr, out_ptr, number_of_blocks, constants, config);
+  poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, A, 1, cfg);
  END_TIMER(poseidon_timer, "Poseidon")

  scalar_t expected[1024] = {
@@ -1080,7 +1083,7 @@ int main(int argc, char* argv[])
  if (number_of_blocks == 1024) {
    for (int i = 0; i < number_of_blocks; i++) {
 #ifdef DEBUG
-      std::cout << out_ptr[i] << std::endl;
+      // std::cout << out_ptr[i] << std::endl;
 #endif
      assert((out_ptr[i] == expected[i]));
    }
--- a/icicle/src/poseidon/test_m31.cu
+++ b/icicle/src/poseidon/test_m31.cu
@@ -0,0 +1,70 @@
+// #define DEBUG
+
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+#include "gpu-utils/device_context.cuh"
+#include "poseidon/poseidon.cuh"
+
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+
+using namespace poseidon;
+
+#define A 11
+#define T (A + 1)
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+int main(int argc, char* argv[])
+{
+  // Smoke test: hashes number_of_blocks blocks of A m31 scalars each with Poseidon and prints the timings.
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+
+  // Load poseidon constants
+  START_TIMER(timer_const);
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  // The class-based API replaces init_optimized_poseidon_constants + poseidon_hash (poseidon.cu was removed)
+  Poseidon<scalar_t> poseidon(A, ctx);
+  END_TIMER(timer_const, "Load poseidon constants");
+
+  START_TIMER(allocation_timer);
+  // Prepare input data of [0, 1, 2 ... (number_of_blocks * arity) - 1]
+  int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
+  scalar_t input = scalar_t::zero();
+  scalar_t* in_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * A * sizeof(scalar_t)));
+  for (uint32_t i = 0; i < number_of_blocks * A; i++) {
+    in_ptr[i] = input;
+    input = input + scalar_t::one();
+  }
+  END_TIMER(allocation_timer, "Allocate mem and fill input");
+
+  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
+
+  START_TIMER(poseidon_timer);
+  SpongeConfig cfg = default_sponge_config();
+  poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, A, 1, cfg); // A input elements, 1 output element per hash
+  END_TIMER(poseidon_timer, "Poseidon")
+
+  // TODO: add m31 reference digests (scalar_t expected[...]); until then the block below verifies nothing
+
+  if (number_of_blocks == 1024) {
+    for (int i = 0; i < number_of_blocks; i++) {
+#ifdef DEBUG
+      // std::cout << out_ptr[i] << std::endl;
+#endif
+      // assert((out_ptr[i] == expected[i]));
+    }
+    printf("Expected output matches\n");
+  }
+
+  free(in_ptr);
+  free(out_ptr);
+}
+
+#endif
--- a/icicle/src/poseidon/tree/Makefile
+++ b/icicle/src/poseidon/tree/Makefile
@@ -1,3 +0,0 @@
-test_merkle:
-	nvcc -o test_merkle -I../../../include -DFIELD_ID=2 -DCURVE_ID=2 test.cu
-	./test_merkle
--- a/icicle/src/poseidon/tree/merkle.cu
+++ b/icicle/src/poseidon/tree/merkle.cu
@@ -1,284 +0,0 @@
-#include "fields/field_config.cuh"
-
-using namespace field_config;
-
-#include "poseidon/tree/merkle.cuh"
-
-namespace merkle {
-  /// Flattens the tree digests and sum them up to get
-  /// the memory needed to contain all the digests
-  template <typename S>
-  size_t get_digests_len(uint32_t height, uint32_t arity)
-  {
-    size_t digests_len = 0;
-    size_t row_length = 1;
-    for (int i = 1; i < height; i++) {
-      digests_len += row_length;
-      row_length *= arity;
-    }
-
-    return digests_len;
-  }
-
-  /// Constructs merkle subtree without parallelization
-  /// The digests are aligned sequentially per row
-  /// Example:
-  ///
-  /// Big tree:
-  ///
-  ///        1
-  ///       / \
-  ///      2   3
-  ///     / \ / \
-  ///    4  5 6  7
-  ///
-  /// Subtree 1    Subtree 2
-  ///    2            3
-  ///   / \          / \
-  ///  4   5        6   7
-  ///
-  /// Digests array for subtree 1:
-  /// [4 5 . . 2 . .]
-  /// |   |    |
-  /// -----    V
-  ///   |    Segment (offset = 4, subtree_idx = 0)
-  ///   v
-  /// Segment (offset = 0, subtree_idx = 0)
-  ///
-  /// Digests array for subtree 2:
-  /// [. . 6 7 . 3 .]
-  ///     |   |
-  ///     -----
-  ///       |
-  ///       v
-  ///    Segment (offset = 0, subtree_idx = 1)
-  ///
-  /// Total digests array:
-  /// [4 5 6 7 2 3 .]
-  template <typename S, int T>
-  cudaError_t build_merkle_subtree(
-    S* state,
-    S* digests,
-    size_t subtree_idx,
-    size_t subtree_height,
-    S* big_tree_digests,
-    size_t start_segment_size,
-    size_t start_segment_offset,
-    int keep_rows,
-    const PoseidonConstants<S>& poseidon,
-    cudaStream_t& stream)
-  {
-    int arity = T - 1;
-
-    PoseidonConfig config = default_poseidon_config(T);
-    config.are_inputs_on_device = true;
-    config.are_outputs_on_device = true;
-    config.input_is_a_state = true;
-    config.loop_state = true;
-    config.ctx.stream = stream;
-
-    size_t leaves_size = pow(arity, subtree_height - 1);
-    uint32_t number_of_blocks = leaves_size / arity;
-    size_t segment_size = start_segment_size;
-    size_t segment_offset = start_segment_offset;
-
-    while (number_of_blocks > 0) {
-      cudaError_t poseidon_res = poseidon_hash<S, T>(state, digests, number_of_blocks, poseidon, config);
-      CHK_IF_RETURN(poseidon_res);
-
-      if (!keep_rows || subtree_height <= keep_rows + 1) {
-        S* digests_with_offset = big_tree_digests + segment_offset + subtree_idx * number_of_blocks;
-        CHK_IF_RETURN(
-          cudaMemcpyAsync(digests_with_offset, digests, number_of_blocks * sizeof(S), cudaMemcpyDeviceToHost, stream));
-        segment_offset += segment_size;
-      }
-
-      segment_size /= arity;
-      subtree_height--;
-      number_of_blocks /= arity;
-      config.aligned = true;
-    }
-
-    return CHK_LAST();
-  }
-
-  template <typename S, int T>
-  cudaError_t build_merkle_tree(
-    const S* leaves,
-    S* digests,
-    uint32_t height,
-    const poseidon::PoseidonConstants<S>& poseidon,
-    const TreeBuilderConfig& config)
-  {
-    CHK_INIT_IF_RETURN();
-    cudaStream_t& stream = config.ctx.stream;
-
-    int arity = T - 1;
-    uint32_t number_of_leaves = pow(arity, (height - 1));
-
-    // This will determine how much splitting do we need to do
-    // `number_of_streams` subtrees should fit in the device
-    // This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
-    uint32_t number_of_subtrees = 1;
-    uint32_t subtree_height = height;
-    uint32_t subtree_leaves_size = pow(arity, height - 1);
-    uint32_t subtree_state_size = subtree_leaves_size / arity * T;
-    uint32_t subtree_digests_size = get_digests_len<S>(subtree_height, arity);
-    size_t subtree_memory_required = sizeof(S) * (subtree_state_size + subtree_digests_size);
-    while (subtree_memory_required > STREAM_CHUNK_SIZE) {
-      number_of_subtrees *= arity;
-      subtree_height--;
-      subtree_leaves_size /= arity;
-      subtree_state_size = subtree_leaves_size / arity * T;
-      subtree_digests_size = subtree_state_size / arity;
-      subtree_memory_required = sizeof(S) * (subtree_state_size + subtree_digests_size);
-    }
-    int cap_height = height - subtree_height + 1;
-    size_t caps_len = pow(arity, cap_height - 1);
-
-    size_t available_memory, _total_memory;
-    CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
-    available_memory -= GIGA / 8; // Leave 128 MB
-
-    // We can effectively parallelize memory copy with streams
-    // as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
-    const size_t number_of_streams = std::min((uint32_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
-    cudaStream_t* streams = static_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t) * number_of_streams));
-    for (size_t i = 0; i < number_of_streams; i++) {
-      CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
-    }
-
-#if !defined(__CUDA_ARCH__) && defined(MERKLE_DEBUG)
-    std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
-    std::cout << "Number of streams = " << number_of_streams << std::endl;
-    std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
-    std::cout << "Height of a subtree = " << subtree_height << std::endl;
-    std::cout << "Cutoff height = " << height - subtree_height + 1 << std::endl;
-    std::cout << "Number of leaves in a subtree = " << subtree_leaves_size << std::endl;
-    std::cout << "State of a subtree = " << subtree_state_size << std::endl;
-    std::cout << "Digest elements for a subtree = " << get_digests_len<S>(subtree_height, arity) << std::endl;
-    std::cout << "Size of 1 subtree states = " << subtree_state_size * sizeof(S) / 1024 / 1024 << " MB" << std::endl;
-    std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(S) / 1024 / 1024 << " MB" << std::endl;
-#endif
-
-    // Allocate memory for the leaves and digests
-    // These are shared by streams in a pool
-    S *states_ptr, *digests_ptr;
-    CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_state_size * number_of_streams * sizeof(S), stream))
-    CHK_IF_RETURN(cudaMallocAsync(&digests_ptr, subtree_digests_size * number_of_streams * sizeof(S), stream))
-    // Wait for these allocations to finish
-    CHK_IF_RETURN(cudaStreamSynchronize(stream));
-
-    bool caps_mode = config.keep_rows && config.keep_rows < cap_height;
-    S* caps;
-    if (caps_mode) { caps = static_cast<S*>(malloc(caps_len * sizeof(S))); }
-
-    for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
-      size_t stream_idx = subtree_idx % number_of_streams;
-      cudaStream_t subtree_stream = streams[stream_idx];
-
-      const S* subtree_leaves = leaves + subtree_idx * subtree_leaves_size;
-      S* subtree_state = states_ptr + stream_idx * subtree_state_size;
-      S* subtree_digests = digests_ptr + stream_idx * subtree_digests_size;
-
-      // Copy the first level from RAM / device to device
-      // The pitch property of cudaMemcpy2D resolves shape differences
-      CHK_IF_RETURN(cudaMemcpy2DAsync(
-        subtree_state, T * sizeof(S),      // Device pointer and device pitch
-        subtree_leaves, arity * sizeof(S), // Host pointer and pitch
-        arity * sizeof(S),                 // Size of the source matrix (Arity)
-        subtree_leaves_size / arity,       // Size of the source matrix (Number of blocks)
-        config.are_inputs_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, subtree_stream));
-
-      int subtree_keep_rows = 0;
-      if (config.keep_rows) {
-        int diff = config.keep_rows - cap_height + 1;
-        subtree_keep_rows = diff <= 0 ? 1 : diff;
-      }
-      size_t start_segment_size = number_of_leaves / arity;
-      cudaError_t subtree_result = build_merkle_subtree<S, T>(
-        subtree_state,              // state
-        subtree_digests,            // digests
-        subtree_idx,                // subtree_idx
-        subtree_height,             // subtree_height
-        caps_mode ? caps : digests, // big_tree_digests
-        start_segment_size,         // start_segment_size
-        0,                          // start_segment_offset
-        subtree_keep_rows,          // keep_rows
-        poseidon,                   // hash
-        subtree_stream              // stream
-      );
-      CHK_IF_RETURN(subtree_result);
-    }
-
-    for (size_t i = 0; i < number_of_streams; i++) {
-      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
-    }
-
-    // Finish the top-level tree if any
-    if (cap_height > 1) {
-      size_t start_segment_size = caps_len / arity;
-      size_t start_segment_offset = 0;
-      if (!caps_mode) {
-        size_t layer_size = pow(arity, config.keep_rows - 1);
-        for (int i = 0; i < config.keep_rows - cap_height + 1; i++) {
-          start_segment_offset += layer_size;
-          layer_size /= arity;
-        }
-      }
-      CHK_IF_RETURN(cudaMemcpy2DAsync(
-        states_ptr, T * sizeof(S), caps_mode ? caps : (digests + start_segment_offset - caps_len), arity * sizeof(S),
-        arity * sizeof(S),
-        caps_len / arity,                 // Size of the source
-        cudaMemcpyHostToDevice, stream)); // Direction and stream
-
-      cudaError_t top_tree_result = build_merkle_subtree<S, T>(
-        states_ptr,           // state
-        digests_ptr,          // digests
-        0,                    // subtree_idx
-        cap_height,           // subtree_height
-        digests,              // big_tree_digests
-        start_segment_size,   // start_segment_size
-        start_segment_offset, // start_segment_offset
-        config.keep_rows,     // keep_rows
-        poseidon,             // hash
-        stream                // stream
-      );
-      CHK_IF_RETURN(top_tree_result);
-      if (caps_mode) { free(caps); }
-    }
-
-    CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
-    CHK_IF_RETURN(cudaFreeAsync(digests_ptr, stream));
-    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
-    for (size_t i = 0; i < number_of_streams; i++) {
-      CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
-      CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
-    }
-    free(streams);
-    return CHK_LAST();
-  }
-
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, build_poseidon_merkle_tree)(
-    const scalar_t* leaves,
-    scalar_t* digests,
-    uint32_t height,
-    int arity,
-    PoseidonConstants<scalar_t>& constants,
-    TreeBuilderConfig& config)
-  {
-    switch (arity) {
-    case 2:
-      return build_merkle_tree<scalar_t, 3>(leaves, digests, height, constants, config);
-    case 4:
-      return build_merkle_tree<scalar_t, 5>(leaves, digests, height, constants, config);
-    case 8:
-      return build_merkle_tree<scalar_t, 9>(leaves, digests, height, constants, config);
-    case 11:
-      return build_merkle_tree<scalar_t, 12>(leaves, digests, height, constants, config);
-    default:
-      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "BuildPoseidonMerkleTree: #arity must be one of [2, 4, 8, 11]");
-    }
-    return CHK_LAST();
-  }
-} // namespace merkle
--- a/icicle/src/poseidon2/Makefile
+++ b/icicle/src/poseidon2/Makefile
@@ -1,7 +1,5 @@
-test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
-	nvcc -o test_poseidon -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 -DDEVMODE -DDEBUG extern.cu test.cu
-	./test_poseidon
+test_poseidon: test.cu extern.cu constants.cu # every source nvcc reads; extern.cu #includes ./constants.cu
+	nvcc -o test_poseidon -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 extern.cu test.cu

-test_poseidon_release: test.cu poseidon.cu kernels.cu constants.cu
-	nvcc -o test_poseidon_release -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 extern.cu test.cu
-	./test_poseidon_release
+test_poseidon_m31: test_m31.cu extern.cu constants.cu # every source nvcc reads; extern.cu #includes ./constants.cu
+	nvcc -o test_poseidon_m31 -I../../include -DFIELD=m31 -DFIELD_ID=1003 extern.cu test_m31.cu
--- a/icicle/src/poseidon2/constants.cu
+++ b/icicle/src/poseidon2/constants.cu
@@ -1,4 +1,5 @@
-#include "poseidon2/poseidon2.cuh"
+#include "poseidon2/constants.cuh"
+#include "gpu-utils/device_context.cuh"

 /// These are pre-calculated constants for different curves
 #include "fields/id.h"
@@ -20,6 +21,9 @@ using namespace poseidon2_constants_grumpkin;
 #elif FIELD_ID == BABY_BEAR
 #include "poseidon2/constants/babybear_poseidon2.h"
 using namespace poseidon2_constants_babybear;
+#elif FIELD_ID == M31
+#include "poseidon2/constants/m31_poseidon2.h"
+using namespace poseidon2_constants_m31;
 #endif

 namespace poseidon2 {
@@ -36,7 +40,6 @@ namespace poseidon2 {
    device_context::DeviceContext& ctx,
    Poseidon2Constants<S>* poseidon_constants)
  {
-    cudaFree(nullptr); // Temporary solution
    if (!(alpha == 3 || alpha == 5 || alpha == 7 || alpha == 11)) {
      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid alpha value");
    }
@@ -78,7 +81,6 @@ namespace poseidon2 {
    device_context::DeviceContext& ctx,
    Poseidon2Constants<S>* poseidon2_constants)
  {
-    cudaFree(nullptr); // Temporary solution
    CHK_INIT_IF_RETURN();

 #define P2_CONSTANTS_DEF(width)                                                                                        \
@@ -121,7 +123,6 @@ namespace poseidon2 {
  cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx)
  {
    CHK_INIT_IF_RETURN();
-    CHK_IF_RETURN(cudaFreeAsync(constants->round_constants, ctx.stream));
    CHK_IF_RETURN(cudaFreeAsync(constants->internal_matrix_diag, ctx.stream));

    constants->alpha = 0;
--- a/icicle/src/poseidon2/extern.cu
+++ b/icicle/src/poseidon2/extern.cu
@@ -3,67 +3,71 @@
 #include "fields/field_config.cuh"
 using namespace field_config;

-#include "poseidon.cu"
+#include "gpu-utils/error_handler.cuh"
+#include "poseidon2/poseidon2.cuh"
+#include "./constants.cu"

 namespace poseidon2 {
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_poseidon2_constants_cuda)(
-    int width,
-    int alpha,
-    int internal_rounds,
-    int external_rounds,
+  template class Poseidon2<scalar_t>;
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_create_cuda)(
+    Poseidon2<scalar_t>** poseidon,
+    unsigned int width,
+    unsigned int rate,
+    unsigned int alpha,
+    unsigned int internal_rounds,
+    unsigned int external_rounds,
    const scalar_t* round_constants,
    const scalar_t* internal_matrix_diag,
    MdsType mds_type,
    DiffusionStrategy diffusion,
-    device_context::DeviceContext& ctx,
-    Poseidon2Constants<scalar_t>* poseidon_constants)
+    device_context::DeviceContext& ctx)
  {
-    return create_poseidon2_constants<scalar_t>(
-      width, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type, diffusion, ctx,
-      poseidon_constants);
+    try {
+      *poseidon = new Poseidon2<scalar_t>(
+        width, rate, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type,
+        diffusion, ctx);
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown;
+    }
  }

-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_poseidon2_constants_cuda)(
-    int width,
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_load_cuda)(
+    Poseidon2<scalar_t>** poseidon,
+    unsigned int width,
+    unsigned int rate,
    MdsType mds_type,
    DiffusionStrategy diffusion,
-    device_context::DeviceContext& ctx,
-    Poseidon2Constants<scalar_t>* constants)
+    device_context::DeviceContext& ctx)
  {
-    return init_poseidon2_constants<scalar_t>(width, mds_type, diffusion, ctx, constants);
-  }
-
-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_hash_cuda)(
-    const scalar_t* input,
-    scalar_t* output,
-    int number_of_states,
-    int width,
-    const Poseidon2Constants<scalar_t>* constants,
-    Poseidon2Config* config)
-  {
-#define P2_HASH_T(width)                                                                                               \
-  case width:                                                                                                          \
-    return poseidon2_hash<scalar_t, width>(input, output, number_of_states, *constants, *config);
-
-    switch (width) {
-      P2_HASH_T(2)
-      P2_HASH_T(3)
-      P2_HASH_T(4)
-      P2_HASH_T(8)
-      P2_HASH_T(12)
-      P2_HASH_T(16)
-      P2_HASH_T(20)
-      P2_HASH_T(24)
-    default:
-      THROW_ICICLE_ERR(
-        IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+    try {
+      *poseidon = new Poseidon2<scalar_t>(width, rate, mds_type, diffusion, ctx);
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown;
    }
-    return CHK_LAST();
  }

-  extern "C" cudaError_t CONCAT_EXPAND(FIELD, release_poseidon2_constants_cuda)(
-    Poseidon2Constants<scalar_t>* constants, device_context::DeviceContext& ctx)
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_hash_many_cuda)(
+    const Poseidon2<scalar_t>* poseidon,
+    const scalar_t* inputs,
+    scalar_t* output,
+    unsigned int number_of_states,
+    unsigned int input_block_len,
+    unsigned int output_len,
+    hash::SpongeConfig& cfg)
  {
-    return release_poseidon2_constants<scalar_t>(constants, ctx);
+    return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
+  }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_delete_cuda)(Poseidon2<scalar_t>* poseidon)
+  {
+    try {
+      delete poseidon; // destroys the object AND frees the `new` allocation from poseidon2_create/load_cuda
+      return cudaError_t::cudaSuccess;
+    } catch (const IcicleError& _error) {
+      return cudaError_t::cudaErrorUnknown; // an IcicleError raised during teardown is reported as a generic failure
+    }
+  }
 } // namespace poseidon2
--- a/icicle/src/poseidon2/poseidon.cu
+++ b/icicle/src/poseidon2/poseidon.cu
@@ -1,80 +0,0 @@
-#include "poseidon2/poseidon2.cuh"
-#include "constants.cu"
-#include "kernels.cu"
-
-namespace poseidon2 {
-  static int poseidon_block_size = 128;
-
-  template <typename S, int T>
-  int poseidon_number_of_blocks(size_t number_of_states)
-  {
-    return number_of_states / poseidon_block_size + static_cast<bool>(number_of_states % poseidon_block_size);
-  }
-
-  template <typename S, int T>
-  cudaError_t permute_many(
-    const S* states,
-    S* states_out,
-    size_t number_of_states,
-    const Poseidon2Constants<S>& constants,
-    cudaStream_t& stream)
-  {
-    poseidon2_permutation_kernel<S, T>
-      <<<poseidon_number_of_blocks<S, T>(number_of_states), poseidon_block_size, 0, stream>>>(
-        states, states_out, number_of_states, constants);
-    CHK_IF_RETURN(cudaPeekAtLastError());
-    return CHK_LAST();
-  }
-
-  template <typename S, int T>
-  cudaError_t poseidon2_hash(
-    const S* states,
-    S* output,
-    size_t number_of_states,
-    const Poseidon2Constants<S>& constants,
-    const Poseidon2Config& config)
-  {
-    CHK_INIT_IF_RETURN();
-    cudaStream_t& stream = config.ctx.stream;
-    S* d_states;
-    if (config.are_states_on_device) {
-      d_states = const_cast<S*>(states);
-    } else {
-      // allocate memory for {number_of_states} states of {t} scalars each
-      CHK_IF_RETURN(cudaMallocAsync(&d_states, number_of_states * T * sizeof(S), stream))
-      CHK_IF_RETURN(cudaMemcpyAsync(d_states, states, number_of_states * T * sizeof(S), cudaMemcpyHostToDevice, stream))
-    }
-
-    cudaError_t hash_error = permute_many<S, T>(d_states, d_states, number_of_states, constants, stream);
-    CHK_IF_RETURN(hash_error);
-
-    if (config.mode == PoseidonMode::COMPRESSION) {
-      S* output_device;
-      if (config.are_outputs_on_device) {
-        output_device = output;
-      } else {
-        CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_states * sizeof(S), stream))
-      }
-
-      get_hash_results<S, T><<<poseidon_number_of_blocks<S, T>(number_of_states), poseidon_block_size, 0, stream>>>(
-        d_states, number_of_states, config.output_index, output_device);
-      CHK_IF_RETURN(cudaPeekAtLastError());
-
-      if (!config.are_outputs_on_device) {
-        CHK_IF_RETURN(
-          cudaMemcpyAsync(output, output_device, number_of_states * sizeof(S), cudaMemcpyDeviceToHost, stream));
-        CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
-      }
-    } else {
-      if (!config.are_states_on_device || !config.are_outputs_on_device) {
-        CHK_IF_RETURN(
-          cudaMemcpyAsync(output, d_states, number_of_states * T * sizeof(S), cudaMemcpyDeviceToHost, stream));
-      }
-    }
-
-    if (!config.are_states_on_device) CHK_IF_RETURN(cudaFreeAsync(d_states, stream));
-
-    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
-    return CHK_LAST();
-  }
-} // namespace poseidon2
--- a/icicle/src/poseidon2/test.cu
+++ b/icicle/src/poseidon2/test.cu
--- a/icicle/src/poseidon2/test_m31.cu
+++ b/icicle/src/poseidon2/test_m31.cu
@@ -0,0 +1,88 @@
+#include "gpu-utils/device_context.cuh"
+
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+
+#include "poseidon2/poseidon2.cuh"
+using namespace poseidon2;
+
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+#include "hash/hash.cuh"
+
+#define T 16
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+int main(int argc, char* argv[])
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  // Construct Poseidon2 with width T and rate T, default MDS and diffusion (construction is timed)
+  START_TIMER(timer_const);
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  Poseidon2<scalar_t> poseidon(T, T, MdsType::DEFAULT_MDS, DiffusionStrategy::DEFAULT_DIFFUSION, ctx);
+  END_TIMER(timer_const, "Load poseidon constants");
+
+  int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024; // 2^argv[1] blocks when given, otherwise 1024
+  scalar_t* in_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * T * sizeof(scalar_t)));
+  scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
+  scalar_t input = scalar_t::zero();
+
+  hash::SpongeConfig cfg = hash::default_sponge_config();
+
+  size_t number_of_repetitions = argc > 2 ? 1 << atoi(argv[2]) : 32; // 2^argv[2] timed runs, otherwise 32
+
+  // Prepare input data of [0, 1, 2 ... (number_of_blocks * T) - 1]
+  for (uint32_t i = 0; i < number_of_blocks * T; i++) {
+    in_ptr[i] = input;
+    input = input + scalar_t::one();
+  }
+
+  // Warm up: one untimed hash of every block (T inputs -> 1 output each); stop here if it already fails
+  if (poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, T, 1, cfg) != cudaError_t::cudaSuccess) return 1;
+
+  auto total_time_start = std::chrono::high_resolution_clock::now();
+  double avg_time = 0; // running sum in ms; floating point so sub-millisecond samples are not truncated to 0
+  for (size_t i = 0; i < number_of_repetitions; i++) {
+    auto poseidon_start = std::chrono::high_resolution_clock::now();
+    poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, T, 1, cfg);
+    avg_time += FpMilliseconds(std::chrono::high_resolution_clock::now() - poseidon_start).count();
+  }
+  auto total_time = FpMilliseconds(std::chrono::high_resolution_clock::now() - total_time_start).count();
+
+  std::cout << "Block size: " << number_of_blocks << std::endl;
+  std::cout << "Total time: " << total_time << " ms" << std::endl;
+  std::cout << "Avg time: " << avg_time / number_of_repetitions << " ms" << std::endl;
+
+  // for (int i = 0; i < number_of_blocks; i++) {
+  //   std::cout << "{";
+  //   for (int j = 0; j < 8; j++) {
+  //     std::cout << ((uint32_t*)&out_ptr[i].limbs_storage)[j];
+  //     if (j != 7) { std::cout << ", "; }
+  //   }
+  //   std::cout << "}," << std::endl;
+  // }
+
+  if (number_of_blocks == 1024) {
+    for (int i = 0; i < number_of_blocks; i++) {
+#ifdef DEBUG
+      // std::cout << out_ptr[i] << std::endl;
+#endif
+      // assert((out_ptr[i] == expected[i]));
+    }
+    printf("Expected output matches\n"); // NOTE(review): printed although the assert above is commented out
+  }
+
+  free(in_ptr);
+  free(out_ptr);
+}
+
+#endif
--- a/icicle/src/poseidon2/test_poseidon_m31
+++ b/icicle/src/poseidon2/test_poseidon_m31
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -165,7 +165,7 @@ namespace vec_ops {
    E* mat_out,
    uint32_t row_size,
    uint32_t column_size,
-    device_context::DeviceContext& ctx,
+    const device_context::DeviceContext& ctx,
    bool on_device,
    bool is_async)
  {
--- a/scripts/gen_c_api.py
+++ b/scripts/gen_c_api.py
@@ -77,6 +77,8 @@ FIELDS_CONFIG = {
 COMMON_INCLUDES = [
    '#include <cuda_runtime.h>',
    '#include "gpu-utils/device_context.cuh"',
+    '#include "merkle-tree/merkle.cuh"',
+    '#include "matrix/matrix.cuh"'
 ]

 WARN_TEXT = """\
@@ -114,10 +116,9 @@ if __name__ == "__main__":
            includes.append('#include "msm/msm.cuh"')
        if any(header.name.startswith("vec_ops") for header in headers):
            includes.append('#include "vec_ops/vec_ops.cuh"')
-        if any(header.name.startswith("poseidon") for header in headers):
+        if any(header.name.startswith("poseidon.h") for header in headers):
            includes.append('#include "poseidon/poseidon.cuh"')
-            includes.append('#include "poseidon/tree/merkle.cuh"')
-        if any(header.name.startswith("poseidon2") for header in headers):
+        if any(header.name.startswith("poseidon2.h") for header in headers):
            includes.append('#include "poseidon2/poseidon2.cuh"')

        contents = WARN_TEXT + INCLUDE_ONCE.format(curve.upper()) + "\n".join(includes) + "\n\n"
@@ -148,10 +149,9 @@ if __name__ == "__main__":
            includes.append('#include "ntt/ntt.cuh"')
        if any(header.name.startswith("vec_ops") for header in headers):
            includes.append('#include "vec_ops/vec_ops.cuh"')
-        if any(header.name.startswith("poseidon") for header in headers):
+        if any(header.name.startswith("poseidon.h") for header in headers):
            includes.append('#include "poseidon/poseidon.cuh"')
-            includes.append('#include "poseidon/tree/merkle.cuh"')
-        if any(header.name.startswith("poseidon2") for header in headers):
+        if any(header.name.startswith("poseidon2.h") for header in headers):
            includes.append('#include "poseidon2/poseidon2.cuh"')

        contents = WARN_TEXT + INCLUDE_ONCE.format(field.upper()) + "\n".join(includes) + "\n\n"
--- a/wrappers/golang/core/poseidon.go
+++ b/wrappers/golang/core/poseidon.go
@@ -1,94 +0,0 @@
-package core
-
-import (
-	"fmt"
-	"unsafe"
-
-	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-)
-
-type PoseidonConfig struct {
-	/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-	Ctx                cr.DeviceContext
-	areInputsOnDevice  bool
-	areOutputsOnDevice bool
-	///If true, input is considered to be a states vector, holding the preimages in aligned or not aligned format.
-	///Memory under the input pointer will be used for states. If false, fresh states memory will be allocated and input will be copied into it */
-	InputIsAState bool
-	/// If true - input should be already aligned for poseidon permutation.
-	///* Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
-	///* not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D) */
-	Aligned bool
-	///If true, hash results will also be copied in the input pointer in aligned format
-	LoopState bool
-	///Whether to run the Poseidon asynchronously. If set to `true`, the poseidon_hash function will be
-	///non-blocking and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
-	///If set to false, the poseidon_hash function will block the current CPU thread. */
-	IsAsync bool
-}
-
-type PoseidonConstants[T any] struct {
-	Arity           int32
-	PartialRounds   int32
-	FullRoundsHalf  int32
-	RoundConstants  unsafe.Pointer
-	MdsMatrix       unsafe.Pointer
-	NonSparseMatrix unsafe.Pointer
-	SparseMatrices  unsafe.Pointer
-	DomainTag       T
-}
-
-func GetDefaultPoseidonConfig() PoseidonConfig {
-	ctx, _ := cr.GetDefaultDeviceContext()
-	return PoseidonConfig{
-		ctx,   // Ctx
-		false, // areInputsOnDevice
-		false, // areOutputsOnDevice
-		false, // inputIsAState
-		false, // aligned
-		false, // loopState
-		false, // IsAsync
-	}
-}
-
-func PoseidonCheck[T any](input, output HostOrDeviceSlice, cfg *PoseidonConfig, constants *PoseidonConstants[T], numberOfStates int) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer) {
-	inputLen, outputLen := input.Len(), output.Len()
-	arity := int(constants.Arity)
-	expectedInputLen := arity * numberOfStates
-	if cfg.InputIsAState {
-		expectedInputLen += numberOfStates
-	}
-
-	if inputLen != expectedInputLen {
-		errorString := fmt.Sprintf(
-			"input is not the right length for the given parameters: %d, should be: %d",
-			inputLen,
-			arity*numberOfStates,
-		)
-		panic(errorString)
-	}
-
-	if outputLen != numberOfStates {
-		errorString := fmt.Sprintf(
-			"output is not the right length for the given parameters: %d, should be: %d",
-			outputLen,
-			numberOfStates,
-		)
-		panic(errorString)
-	}
-	cfg.areInputsOnDevice = input.IsOnDevice()
-	cfg.areOutputsOnDevice = output.IsOnDevice()
-
-	if input.IsOnDevice() {
-		input.(DeviceSlice).CheckDevice()
-
-	}
-
-	if output.IsOnDevice() {
-		output.(DeviceSlice).CheckDevice()
-	}
-
-	cfgPointer := unsafe.Pointer(cfg)
-
-	return input.AsUnsafePointer(), output.AsUnsafePointer(), cfgPointer
-}
--- a/wrappers/golang/core/sponge.go
+++ b/wrappers/golang/core/sponge.go
@@ -0,0 +1,105 @@
+package core
+
+import (
+	"fmt"
+
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+)
+
+type SpongeConfig struct {
+	/// Details related to the device such as its id and stream.
+	Ctx cr.DeviceContext
+
+	areInputsOnDevice  bool // unexported: can only be set from inside package core
+	areResultsOnDevice bool // unexported: can only be set from inside package core
+
+	InputRate  uint32 // NOTE(review): presumably the `inputRate` bound enforced by SpongeInputCheck; confirm
+	OutputRate uint32
+	Offset     uint32
+
+	/// NOTE(review): presumably, if true, hash results are also copied in the input pointer in aligned format;
+	/// confirm. SpongeOutputsCheck requires width*numberOfStates output elements when its `recursive` argument is
+	/// true and outputLen*numberOfStates otherwise.
+	RecursiveSqueeze bool
+
+	/// If true - input is already aligned: [0, A, B, 0, C, D, ...] rather than [A, B, 0, C, D, 0, ...] (TODO confirm)
+	Aligned bool
+
+	/// Whether to run the SpongeHash asynchronously. If set to `true`, the SpongeHash function will be non-blocking
+	/// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
+	/// If set to `false`, the SpongeHash function will block the current CPU thread.
+	IsAsync bool
+}
+
+func GetDefaultSpongeConfig() SpongeConfig {
+	defaultCtx, _ := cr.GetDefaultDeviceContext() // the lookup error is discarded
+	return SpongeConfig{ // default device context, zero rates and offset, every flag off
+		Ctx:                defaultCtx,
+		areInputsOnDevice:  false,
+		areResultsOnDevice: false,
+		InputRate:          0,
+		OutputRate:         0,
+		Offset:             0,
+		RecursiveSqueeze:   false,
+		Aligned:            false,
+		IsAsync:            false,
+	}
+}
+
+func SpongeInputCheck(inputs HostOrDeviceSlice, numberOfStates, inputBlockLength, inputRate uint32, ctx *cr.DeviceContext) {
+	if inputBlockLength > inputRate { // panics when a single input block is longer than the rate
+		errorString := fmt.Sprintf(
+			"Input block (%d) can't be greater than input rate (%d)",
+			inputBlockLength,
+			inputRate,
+		)
+		panic(errorString)
+	}
+	inputsSizeExpected := uint64(inputBlockLength) * uint64(numberOfStates) // 64-bit: a uint32 product could wrap
+	if uint64(inputs.Len()) < inputsSizeExpected { // panics when inputs holds fewer elements than required
+		errorString := fmt.Sprintf(
+			"inputs len is %d; but needs to be at least %d",
+			inputs.Len(),
+			inputsSizeExpected,
+		)
+		panic(errorString)
+	}
+	if inputs.IsOnDevice() { // device slices additionally go through CheckDevice; ctx is not consulted here
+		inputs.(DeviceSlice).CheckDevice()
+	}
+}
+
+func SpongeStatesCheck(states DeviceSlice, numberOfStates, width uint32, ctx *cr.DeviceContext) {
+	// Panics unless states holds at least width*numberOfStates elements, then calls states.CheckDevice().
+	statesSizeExpected := uint64(width) * uint64(numberOfStates) // 64-bit: a uint32 product could wrap
+	if uint64(states.Len()) < statesSizeExpected {
+		errorString := fmt.Sprintf(
+			"states len is %d; but needs to be at least %d",
+			states.Len(),
+			statesSizeExpected,
+		)
+		panic(errorString)
+	}
+	states.CheckDevice()
+}
+
+func SpongeOutputsCheck(outputs HostOrDeviceSlice, numberOfStates, outputLen, width uint32, recursive bool, ctx *cr.DeviceContext) {
+	var outputsSizeExpected uint64 // 64-bit: the uint32 products below could otherwise wrap
+	if recursive { // recursive mode needs room for full-width states
+		outputsSizeExpected = uint64(width) * uint64(numberOfStates)
+	} else { // otherwise only outputLen elements per state are required
+		outputsSizeExpected = uint64(outputLen) * uint64(numberOfStates)
+	}
+
+	if uint64(outputs.Len()) < outputsSizeExpected { // panics when outputs is too short
+		errorString := fmt.Sprintf(
+			"outputs len is %d; but needs to be at least %d",
+			outputs.Len(),
+			outputsSizeExpected,
+		)
+		panic(errorString)
+	}
+	if outputs.IsOnDevice() { // device slices additionally go through CheckDevice; ctx is not consulted here
+		outputs.(DeviceSlice).CheckDevice()
+	}
+}
--- a/wrappers/golang/curves/bls12377/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bls12377/poseidon/include/poseidon.h
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t bls12_377_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t bls12_377_poseidon_create_cuda( // invoked by Create in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds, // NOTE: declared BEFORE full_rounds_half; the Go Create signature names fullRoundsHalf first
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants, // the Go wrapper passes its `scalars` slice here
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag, // the Go wrapper passes the address of a single ScalarField
+  DeviceContext* ctx);
+
+cudaError_t bls12_377_poseidon_load_cuda( // invoked by Load in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t bls12_377_poseidon_hash_many_cuda( // invoked by HashMany in poseidon.go
+  const PoseidonInst* poseidon, // handle previously obtained through create/load
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg); // the Go wrapper passes a pointer to its core.SpongeConfig value
+
+cudaError_t bls12_377_poseidon_delete_cuda(PoseidonInst* poseidon); // invoked by Delete in poseidon.go with the stored handle

 #ifdef __cplusplus
 }
--- a/wrappers/golang/curves/bls12377/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bls12377/poseidon/poseidon.go
@@ -3,55 +3,85 @@ package poseidon
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst // alias for the C PoseidonInst struct, which poseidon.h only forward-declares
+type Poseidon struct {
+	width  uint32 // set to arity + 1 by Create and Load
+	handle *PoseidonHandler // filled in by the C create/load call; passed to the hash_many and delete calls
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+// Create builds a Poseidon instance from caller-supplied parameters and
+// constants by calling bls12_377_poseidon_create_cuda.
+//
+// scalars is forwarded as the round_constants argument; mdsMatrix,
+// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
+// arguments, and domainTag is passed by address. On failure it returns
+// (nil, err). On success the returned Poseidon has width arity+1 and carries a
+// finalizer that calls Delete.
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bls12_377.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	// poseidon.h declares partial_rounds before full_rounds_half, so the C call
+	// must pass them in that order even though this Go signature takes
+	// fullRoundsHalf first.
+	__ret := C.bls12_377_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+// Load obtains a Poseidon instance for the given arity by calling
+// bls12_377_poseidon_load_cuda. On failure it returns (nil, err). On success
+// the returned Poseidon has width arity+1 and carries a finalizer that calls
+// Delete.
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var rawHandle *PoseidonHandler
+	status := C.bls12_377_poseidon_load_cuda(&rawHandle, (C.uint)(arity), (*C.DeviceContext)(unsafe.Pointer(ctx)))
+	err := core.FromCudaError((cr.CudaError)(status))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	instance := &Poseidon{handle: rawHandle, width: arity + 1}
+	runtime.SetFinalizer(instance, func(dying *Poseidon) {
+		dying.Delete()
+	})
+	return instance, err
+}

-	__ret := C.bls12_377_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+// HashMany runs bls12_377_poseidon_hash_many_cuda on this instance's handle.
+//
+// Before the C call, inputs is checked with core.SpongeInputCheck (against
+// cfg.InputRate) and output with core.SpongeOutputsCheck (non-recursive, using
+// this instance's width). The CUDA status of the call is returned as a
+// core.IcicleError.
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	devCtx := &cfg.Ctx
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, devCtx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, devCtx)

+	inPtr := (*C.scalar_t)(inputs.AsUnsafePointer())
+	outPtr := (*C.scalar_t)(output.AsUnsafePointer())
+	cfgPtr := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.bls12_377_poseidon_hash_many_cuda(
+		poseidon.handle, inPtr, outPtr,
+		(C.uint)(numberOfStates), (C.uint)(inputBlockLen), (C.uint)(outputLen),
+		cfgPtr,
+	)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.bls12_377_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.bls12_377_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.bls12_377_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
+// filled in for this instance: InputRate = width - 1 and OutputRate = width.
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	spongeCfg := core.GetDefaultSpongeConfig()
+	stateWidth := poseidon.width
+	spongeCfg.InputRate = stateWidth - 1
+	spongeCfg.OutputRate = stateWidth
+	return spongeCfg
 }
--- a/wrappers/golang/curves/bls12377/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bls12377/tests/poseidon_test.go
@@ -7,6 +7,7 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/poseidon"
+	"github.com/stretchr/testify/assert"
 )

 func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[bls12_377.ScalarField]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := bls12_377.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

 	output := make(core.HostSlice[bls12_377.ScalarField], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/golang/curves/bls12381/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bls12381/poseidon/include/poseidon.h
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t bls12_381_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t bls12_381_poseidon_create_cuda( // invoked by Create in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds, // NOTE: declared BEFORE full_rounds_half; the Go Create signature names fullRoundsHalf first
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants, // the Go wrapper passes its `scalars` slice here
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag, // the Go wrapper passes the address of a single ScalarField
+  DeviceContext* ctx);
+
+cudaError_t bls12_381_poseidon_load_cuda( // invoked by Load in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t bls12_381_poseidon_hash_many_cuda( // invoked by HashMany in poseidon.go
+  const PoseidonInst* poseidon, // handle previously obtained through create/load
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg); // the Go wrapper passes a pointer to its core.SpongeConfig value
+
+cudaError_t bls12_381_poseidon_delete_cuda(PoseidonInst* poseidon); // invoked by Delete in poseidon.go with the stored handle

 #ifdef __cplusplus
 }
--- a/wrappers/golang/curves/bls12381/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bls12381/poseidon/poseidon.go
@@ -3,55 +3,85 @@ package poseidon
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bls12_381 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst // alias for the C PoseidonInst struct, which poseidon.h only forward-declares
+type Poseidon struct {
+	width  uint32 // set to arity + 1 by Create and Load
+	handle *PoseidonHandler // filled in by the C create/load call; passed to the hash_many and delete calls
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+// Create builds a Poseidon instance from caller-supplied parameters and
+// constants by calling bls12_381_poseidon_create_cuda.
+//
+// scalars is forwarded as the round_constants argument; mdsMatrix,
+// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
+// arguments, and domainTag is passed by address. On failure it returns
+// (nil, err). On success the returned Poseidon has width arity+1 and carries a
+// finalizer that calls Delete.
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bls12_381.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	// poseidon.h declares partial_rounds before full_rounds_half, so the C call
+	// must pass them in that order even though this Go signature takes
+	// fullRoundsHalf first.
+	__ret := C.bls12_381_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+// Load obtains a Poseidon instance for the given arity by calling
+// bls12_381_poseidon_load_cuda. On failure it returns (nil, err). On success
+// the returned Poseidon has width arity+1 and carries a finalizer that calls
+// Delete.
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var rawHandle *PoseidonHandler
+	status := C.bls12_381_poseidon_load_cuda(&rawHandle, (C.uint)(arity), (*C.DeviceContext)(unsafe.Pointer(ctx)))
+	err := core.FromCudaError((cr.CudaError)(status))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	instance := &Poseidon{handle: rawHandle, width: arity + 1}
+	runtime.SetFinalizer(instance, func(dying *Poseidon) {
+		dying.Delete()
+	})
+	return instance, err
+}

-	__ret := C.bls12_381_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+// HashMany runs bls12_381_poseidon_hash_many_cuda on this instance's handle.
+//
+// Before the C call, inputs is checked with core.SpongeInputCheck (against
+// cfg.InputRate) and output with core.SpongeOutputsCheck (non-recursive, using
+// this instance's width). The CUDA status of the call is returned as a
+// core.IcicleError.
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	devCtx := &cfg.Ctx
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, devCtx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, devCtx)

+	inPtr := (*C.scalar_t)(inputs.AsUnsafePointer())
+	outPtr := (*C.scalar_t)(output.AsUnsafePointer())
+	cfgPtr := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.bls12_381_poseidon_hash_many_cuda(
+		poseidon.handle, inPtr, outPtr,
+		(C.uint)(numberOfStates), (C.uint)(inputBlockLen), (C.uint)(outputLen),
+		cfgPtr,
+	)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.bls12_381_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.bls12_381_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.bls12_381_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
+// filled in for this instance: InputRate = width - 1 and OutputRate = width.
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	spongeCfg := core.GetDefaultSpongeConfig()
+	stateWidth := poseidon.width
+	spongeCfg.InputRate = stateWidth - 1
+	spongeCfg.OutputRate = stateWidth
+	return spongeCfg
 }
--- a/wrappers/golang/curves/bls12381/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bls12381/tests/poseidon_test.go
@@ -7,29 +7,19 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	bls12_381 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381/poseidon"
-
-	"fmt"
 	"github.com/stretchr/testify/assert"
 )

-func formatOutput(x bls12_381.ScalarField) string {
-	r := x.GetLimbs()
-	return fmt.Sprintf("%08x%08x%08x%08x%08x%08x%08x%08x", r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0])
-}
-
 func TestPoseidon(t *testing.T) {

 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[bls12_381.ScalarField]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := bls12_381.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -38,18 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

 	output := make(core.HostSlice[bls12_381.ScalarField], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
-	expectedString := "48fe0b1331196f6cdb33a7c6e5af61b76fd388e1ef1d3d418be5147f0e4613d4" //This result is from https://github.com/triplewz/poseidon
-	outputString := formatOutput(output[0])
-
-	assert.Equal(t, outputString, expectedString, "Poseidon hash does not match expected result")
-
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/golang/curves/bn254/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bn254/poseidon/include/poseidon.h
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t bn254_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t bn254_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t bn254_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t bn254_poseidon_create_cuda( // invoked by Create in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds, // NOTE: declared BEFORE full_rounds_half; the Go Create signature names fullRoundsHalf first
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants, // the Go wrapper passes its `scalars` slice here
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag, // the Go wrapper passes the address of a single ScalarField
+  DeviceContext* ctx);
+
+cudaError_t bn254_poseidon_load_cuda( // invoked by Load in poseidon.go
+  PoseidonInst** poseidon, // out-parameter: the Go wrapper reads the instance handle back from here
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t bn254_poseidon_hash_many_cuda( // invoked by HashMany in poseidon.go
+  const PoseidonInst* poseidon, // handle previously obtained through create/load
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg); // the Go wrapper passes a pointer to its core.SpongeConfig value
+
+cudaError_t bn254_poseidon_delete_cuda(PoseidonInst* poseidon); // invoked by Delete in poseidon.go with the stored handle

 #ifdef __cplusplus
 }
--- a/wrappers/golang/curves/bn254/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bn254/poseidon/poseidon.go
@@ -3,55 +3,85 @@ package poseidon
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst // alias for the C PoseidonInst struct, which poseidon.h only forward-declares
+type Poseidon struct {
+	width  uint32 // set to arity + 1 by Create and Load
+	handle *PoseidonHandler // filled in by the C create/load call; passed to the hash_many and delete calls
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+// Create builds a Poseidon instance from caller-supplied parameters and
+// constants by calling bn254_poseidon_create_cuda.
+//
+// scalars is forwarded as the round_constants argument; mdsMatrix,
+// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
+// arguments, and domainTag is passed by address. On failure it returns
+// (nil, err). On success the returned Poseidon has width arity+1 and carries a
+// finalizer that calls Delete.
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bn254.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	// poseidon.h declares partial_rounds before full_rounds_half, so the C call
+	// must pass them in that order even though this Go signature takes
+	// fullRoundsHalf first.
+	__ret := C.bn254_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+// Load obtains a Poseidon instance for the given arity by calling
+// bn254_poseidon_load_cuda. On failure it returns (nil, err). On success the
+// returned Poseidon has width arity+1 and carries a finalizer that calls
+// Delete.
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var rawHandle *PoseidonHandler
+	status := C.bn254_poseidon_load_cuda(&rawHandle, (C.uint)(arity), (*C.DeviceContext)(unsafe.Pointer(ctx)))
+	err := core.FromCudaError((cr.CudaError)(status))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	instance := &Poseidon{handle: rawHandle, width: arity + 1}
+	runtime.SetFinalizer(instance, func(dying *Poseidon) {
+		dying.Delete()
+	})
+	return instance, err
+}

-	__ret := C.bn254_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+// HashMany runs bn254_poseidon_hash_many_cuda on this instance's handle.
+//
+// Before the C call, inputs is checked with core.SpongeInputCheck (against
+// cfg.InputRate) and output with core.SpongeOutputsCheck (non-recursive, using
+// this instance's width). The CUDA status of the call is returned as a
+// core.IcicleError.
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	devCtx := &cfg.Ctx
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, devCtx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, devCtx)

+	inPtr := (*C.scalar_t)(inputs.AsUnsafePointer())
+	outPtr := (*C.scalar_t)(output.AsUnsafePointer())
+	cfgPtr := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.bn254_poseidon_hash_many_cuda(
+		poseidon.handle, inPtr, outPtr,
+		(C.uint)(numberOfStates), (C.uint)(inputBlockLen), (C.uint)(outputLen),
+		cfgPtr,
+	)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.bn254_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.bn254_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.bn254_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
+// filled in for this instance: InputRate = width - 1 and OutputRate = width.
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	spongeCfg := core.GetDefaultSpongeConfig()
+	stateWidth := poseidon.width
+	spongeCfg.InputRate = stateWidth - 1
+	spongeCfg.OutputRate = stateWidth
+	return spongeCfg
 }
--- a/wrappers/golang/curves/bn254/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bn254/tests/poseidon_test.go
@@ -7,6 +7,7 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/poseidon"
+	"github.com/stretchr/testify/assert"
 )

 func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[bn254.ScalarField]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := bn254.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

 	output := make(core.HostSlice[bn254.ScalarField], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/golang/curves/bw6761/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/bw6761/poseidon/include/poseidon.h
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t bw6_761_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t bw6_761_poseidon_create_cuda( // invoked by Create in poseidon.go
+  PoseidonInst** poseidon, // the Go wrapper passes the address of a *PoseidonHandler here
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds, // NOTE: declared BEFORE full_rounds_half; the Go Create signature names fullRoundsHalf first
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants, // the Go wrapper passes its `scalars` slice here
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag, // the Go wrapper passes the address of a single ScalarField
+  DeviceContext* ctx);
+
+cudaError_t bw6_761_poseidon_load_cuda( // NOTE(review): same parameter shape as the sibling curves' load call; Go caller not checked here
+  PoseidonInst** poseidon,
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t bw6_761_poseidon_hash_many_cuda( // NOTE(review): presumably hashes number_of_states inputs; confirm against the implementation
+  const PoseidonInst* poseidon,
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg);
+
+cudaError_t bw6_761_poseidon_delete_cuda(PoseidonInst* poseidon); // NOTE(review): presumably releases the instance; confirm against the implementation

 #ifdef __cplusplus
 }
--- a/wrappers/golang/curves/bw6761/poseidon/poseidon.go
+++ b/wrappers/golang/curves/bw6761/poseidon/poseidon.go
@@ -3,55 +3,85 @@ package poseidon
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bw6_761 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst
+type Poseidon struct {
+	width  uint32
+	handle *PoseidonHandler
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bw6_761.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler // out-parameter: set by the C constructor below
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.bw6_761_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx) // poseidon.h order: partial_rounds, then full_rounds_half
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err // no wrapper (and no finalizer) is created on failure
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) { // calls Delete when p is garbage-collected
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.bw6_761_poseidon_load_cuda(&poseidon, cArity, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	__ret := C.bw6_761_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

+	cInputs := (*C.scalar_t)(inputs.AsUnsafePointer())
+	cOutput := (*C.scalar_t)(output.AsUnsafePointer())
+	cNumberOfStates := (C.uint)(numberOfStates)
+	cInputBlockLen := (C.uint)(inputBlockLen)
+	cOutputLen := (C.uint)(outputLen)
+	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.bw6_761_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.bw6_761_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.bw6_761_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.bw6_761_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	cfg := core.GetDefaultSpongeConfig()
+	cfg.InputRate = poseidon.width - 1
+	cfg.OutputRate = poseidon.width
+	return cfg
 }
--- a/wrappers/golang/curves/bw6761/tests/poseidon_test.go
+++ b/wrappers/golang/curves/bw6761/tests/poseidon_test.go
@@ -7,6 +7,7 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	bw6_761 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761/poseidon"
+	"github.com/stretchr/testify/assert"
 )

 func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[bw6_761.ScalarField]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := bw6_761.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

 	output := make(core.HostSlice[bw6_761.ScalarField], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/golang/curves/grumpkin/poseidon/include/poseidon.h
+++ b/wrappers/golang/curves/grumpkin/poseidon/include/poseidon.h
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t grumpkin_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t grumpkin_poseidon_create_cuda(
+  PoseidonInst** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants,
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag,
+  DeviceContext* ctx);
+
+cudaError_t grumpkin_poseidon_load_cuda(
+  PoseidonInst** poseidon,
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t grumpkin_poseidon_hash_many_cuda(
+  const PoseidonInst* poseidon,
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg);
+
+cudaError_t grumpkin_poseidon_delete_cuda(PoseidonInst* poseidon);

 #ifdef __cplusplus
 }
--- a/wrappers/golang/curves/grumpkin/poseidon/poseidon.go
+++ b/wrappers/golang/curves/grumpkin/poseidon/poseidon.go
@@ -3,55 +3,85 @@ package poseidon
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	grumpkin "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst
+type Poseidon struct {
+	width  uint32
+	handle *PoseidonHandler
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag grumpkin.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler // out-parameter: set by the C constructor below
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.grumpkin_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx) // poseidon.h order: partial_rounds, then full_rounds_half
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err // no wrapper (and no finalizer) is created on failure
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) { // calls Delete when p is garbage-collected
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.grumpkin_poseidon_load_cuda(&poseidon, cArity, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	__ret := C.grumpkin_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

+	cInputs := (*C.scalar_t)(inputs.AsUnsafePointer())
+	cOutput := (*C.scalar_t)(output.AsUnsafePointer())
+	cNumberOfStates := (C.uint)(numberOfStates)
+	cInputBlockLen := (C.uint)(inputBlockLen)
+	cOutputLen := (C.uint)(outputLen)
+	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.grumpkin_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.grumpkin_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.grumpkin_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.grumpkin_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	cfg := core.GetDefaultSpongeConfig()
+	cfg.InputRate = poseidon.width - 1
+	cfg.OutputRate = poseidon.width
+	return cfg
 }
--- a/wrappers/golang/curves/grumpkin/tests/poseidon_test.go
+++ b/wrappers/golang/curves/grumpkin/tests/poseidon_test.go
@@ -7,6 +7,7 @@ import (
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	grumpkin "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin/poseidon"
+	"github.com/stretchr/testify/assert"
 )

 func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[grumpkin.ScalarField]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := grumpkin.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

 	output := make(core.HostSlice[grumpkin.ScalarField], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon.go.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon.go.tmpl
@@ -3,55 +3,85 @@ package {{.PackageName}}
 // #cgo CFLAGS: -I./include/
 // #include "poseidon.h"
 import "C"
-
 import (
+	"runtime"
 	"unsafe"

 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	{{.Field}} "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}"
 )

-func GetDefaultPoseidonConfig() core.PoseidonConfig {
-	return core.GetDefaultPoseidonConfig()
+type PoseidonHandler = C.struct_PoseidonInst
+type Poseidon struct {
+	width  uint32
+	handle *PoseidonHandler
 }

-func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
-	scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
+func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag {{.Field}}.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler // out-parameter: set by the C constructor below
+	cArity := (C.uint)(arity)
+	cAlpha := (C.uint)(alpha)
+	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
+	cPartialRounds := (C.uint)(partialRounds)
+	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
+	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
+	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
+	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
+	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.{{.Field}}_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx) // poseidon.h order: partial_rounds, then full_rounds_half
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err // no wrapper (and no finalizer) is created on failure
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) { // calls Delete when p is garbage-collected
+		p.Delete()
+	})
+	return &p, err
+}

-	cScalars := (*C.scalar_t)(scalarsPointer)
-	cResults := (*C.scalar_t)(resultsPointer)
-	cNumberOfStates := (C.int)(numberOfStates)
-	cArity := (C.int)(constants.Arity)
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-	cCfg := (*C.PoseidonConfig)(cfgPointer)
+func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
+	var poseidon *PoseidonHandler
+	cArity := (C.uint)(arity)
+	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
+	__ret := C.{{.Field}}_poseidon_load_cuda(&poseidon, cArity, cCtx)
+	err := core.FromCudaError((cr.CudaError)(__ret))
+	if err.IcicleErrorCode != core.IcicleSuccess {
+		return nil, err
+	}
+	p := Poseidon{handle: poseidon, width: arity + 1}
+	runtime.SetFinalizer(&p, func(p *Poseidon) {
+		p.Delete()
+	})
+	return &p, err
+}

-	__ret := C.{{.Field}}_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
+func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
+	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, &cfg.Ctx)
+	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, &cfg.Ctx)

+	cInputs := (*C.scalar_t)(inputs.AsUnsafePointer())
+	cOutput := (*C.scalar_t)(output.AsUnsafePointer())
+	cNumberOfStates := (C.uint)(numberOfStates)
+	cInputBlockLen := (C.uint)(inputBlockLen)
+	cOutputLen := (C.uint)(outputLen)
+	cCfg := (*C.SpongeConfig)(unsafe.Pointer(cfg))
+	__ret := C.{{.Field}}_poseidon_hash_many_cuda(poseidon.handle, cInputs, cOutput, cNumberOfStates, cInputBlockLen, cOutputLen, cCfg)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
-	cPartialRounds := (C.int)(partialRounds)
-	cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
-
-	__ret := C.{{.Field}}_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
+func (poseidon *Poseidon) Delete() core.IcicleError {
+	__ret := C.{{.Field}}_poseidon_delete_cuda(poseidon.handle)
 	err := (cr.CudaError)(__ret)
 	return core.FromCudaError(err)
 }

-func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
-
-	cArity := (C.int)(arity)
-	cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
-	cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
-
-	__ret := C.{{.Field}}_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
-	err := (cr.CudaError)(__ret)
-	return core.FromCudaError(err)
+func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
+	cfg := core.GetDefaultSpongeConfig()
+	cfg.InputRate = poseidon.width - 1
+	cfg.OutputRate = poseidon.width
+	return cfg
 }
--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon.h.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon.h.tmpl
@@ -9,14 +9,40 @@ extern "C" {
 #endif

 typedef struct scalar_t scalar_t;
-typedef struct PoseidonConfig PoseidonConfig;
 typedef struct DeviceContext DeviceContext;
-typedef struct PoseidonConstants PoseidonConstants;
+typedef struct TreeBuilderConfig TreeBuilderConfig;
+typedef struct PoseidonInst PoseidonInst;
+typedef struct SpongeConfig SpongeConfig;


-cudaError_t {{.Field}}_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
-cudaError_t {{.Field}}_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
-cudaError_t {{.Field}}_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
+cudaError_t {{.Field}}_poseidon_create_cuda(
+  PoseidonInst** poseidon,
+  unsigned int arity,
+  unsigned int alpha,
+  unsigned int partial_rounds,
+  unsigned int full_rounds_half,
+  const scalar_t* round_constants,
+  const scalar_t* mds_matrix,
+  const scalar_t* non_sparse_matrix,
+  const scalar_t* sparse_matrices,
+  const scalar_t* domain_tag,
+  DeviceContext* ctx);
+
+cudaError_t {{.Field}}_poseidon_load_cuda(
+  PoseidonInst** poseidon,
+  unsigned int arity,
+  DeviceContext* ctx);
+
+cudaError_t {{.Field}}_poseidon_hash_many_cuda(
+  const PoseidonInst* poseidon,
+  const scalar_t* inputs,
+  scalar_t* output,
+  unsigned int number_of_states,
+  unsigned int input_block_len,
+  unsigned int output_len,
+  SpongeConfig* cfg);
+
+cudaError_t {{.Field}}_poseidon_delete_cuda(PoseidonInst* poseidon);

 #ifdef __cplusplus
 }
--- a/wrappers/golang/internal/generator/poseidon/templates/poseidon_test.go.tmpl
+++ b/wrappers/golang/internal/generator/poseidon/templates/poseidon_test.go.tmpl
@@ -2,37 +2,24 @@ package tests

 import (
 	"testing"
-	
+
 	core "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	{{.Field}} "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}"
 	poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}/poseidon"
-
-	{{if eq .Field "bls12_381"}}
-	"fmt"
 	"github.com/stretchr/testify/assert"
-	{{end}}
 )
-{{if eq .Field "bls12_381"}}
-func formatOutput(x {{.Field}}.{{.FieldPrefix}}Field) string {
-	r := x.GetLimbs()
-	return fmt.Sprintf("%08x%08x%08x%08x%08x%08x%08x%08x", r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0])
-}
-{{end}}

 func TestPoseidon(t *testing.T) {

 	arity := 2
 	numberOfStates := 1

-	cfg := poseidon.GetDefaultPoseidonConfig()
-	cfg.IsAsync = true
-	stream, _ := cr.CreateStream()
-	cfg.Ctx.Stream = &stream
+	ctx, _ := cr.GetDefaultDeviceContext()
+	p, err := poseidon.Load(uint32(arity), &ctx)
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	var constants core.PoseidonConstants[{{.Field}}.{{.FieldPrefix}}Field]
-
-	poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
+	cfg := p.GetDefaultSpongeConfig()

 	scalars := {{.Field}}.GenerateScalars(numberOfStates * arity)
 	scalars[0] = scalars[0].Zero()
@@ -41,19 +28,13 @@ func TestPoseidon(t *testing.T) {
 	scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])

 	var deviceInput core.DeviceSlice
-	scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
+	scalarsCopy.CopyToDevice(&deviceInput, true)
 	var deviceOutput core.DeviceSlice
-	deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
+	deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())

-	poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
+	err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
+	assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)

-	output := make(core.HostSlice[{{.Field}}.{{.FieldPrefix}}Field], numberOfStates)
-	output.CopyFromDeviceAsync(&deviceOutput, stream)
-
-	{{if eq .Field "bls12_381"}}
-	expectedString := "48fe0b1331196f6cdb33a7c6e5af61b76fd388e1ef1d3d418be5147f0e4613d4" //This result is from https://github.com/triplewz/poseidon
-	outputString := formatOutput(output[0])
-
-	assert.Equal(t, outputString, expectedString, "Poseidon hash does not match expected result")
-	{{end}}
+	output := make(core.HostSlice[{{.Field}}.ScalarField], numberOfStates)
+	output.CopyFromDevice(&deviceOutput)
 }
--- a/wrappers/rust/icicle-core/src/hash.rs
+++ b/wrappers/rust/icicle-core/src/hash.rs
@@ -0,0 +1,136 @@
+use std::ffi::c_void;
+
+use icicle_cuda_runtime::{
+    device::check_device,
+    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
+    memory::HostOrDeviceSlice,
+};
+
+use crate::ntt::IcicleResult;
+
+/// Struct that encodes Sponge hash parameters.
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub struct SpongeConfig<'a> {
+    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
+    pub ctx: DeviceContext<'a>,
+    pub(crate) are_inputs_on_device: bool,
+    pub(crate) are_outputs_on_device: bool,
+    pub input_rate: u32,
+    pub output_rate: u32,
+    pub offset: u32,
+
+    /// If true - input should be already aligned for poseidon permutation.
+    /// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
+    /// not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
+    pub recursive_squeeze: bool,
+
+    /// If true, hash results will also be copied in the input pointer in aligned format
+    pub aligned: bool,
+    /// Whether to run the sponge operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
+    /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
+    pub is_async: bool,
+}
+
+impl<'a> Default for SpongeConfig<'a> {
+    fn default() -> Self {
+        Self::default_for_device(DEFAULT_DEVICE_ID)
+    }
+}
+
+impl<'a> SpongeConfig<'a> {
+    pub(crate) fn default_for_device(device_id: usize) -> Self {
+        SpongeConfig {
+            ctx: DeviceContext::default_for_device(device_id),
+            are_inputs_on_device: false,
+            are_outputs_on_device: false,
+            input_rate: 0,
+            output_rate: 0,
+            offset: 0,
+            recursive_squeeze: false,
+            aligned: false,
+            is_async: false,
+        }
+    }
+}
+
+pub trait SpongeHash<PreImage, Image> {
+    fn hash_many(
+        &self,
+        inputs: &(impl HostOrDeviceSlice<PreImage> + ?Sized),
+        output: &mut (impl HostOrDeviceSlice<Image> + ?Sized),
+        number_of_states: usize,
+        input_block_len: usize,
+        output_len: usize,
+        cfg: &SpongeConfig,
+    ) -> IcicleResult<()>;
+
+    fn default_config<'a>(&self) -> SpongeConfig<'a>;
+
+    fn get_handle(&self) -> *const c_void;
+}
+
+pub(crate) fn sponge_check_input<T>(
+    inputs: &(impl HostOrDeviceSlice<T> + ?Sized),
+    number_of_states: usize,
+    input_block_len: usize,
+    input_rate: usize,
+    ctx: &DeviceContext,
+) {
+    if input_block_len > input_rate {
+        panic!(
+            "input block len ({}) can't be greater than input rate ({})",
+            input_block_len, input_rate
+        );
+    }
+
+    let inputs_size_expected = input_block_len * number_of_states;
+    if inputs.len() < inputs_size_expected {
+        panic!(
+            "inputs len is {}; but needs to be at least {}",
+            inputs.len(),
+            inputs_size_expected,
+        );
+    }
+
+    let ctx_device_id = ctx.device_id;
+    if let Some(device_id) = inputs.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in inputs and context are different"
+        );
+    }
+    check_device(ctx_device_id);
+}
+
+pub(crate) fn sponge_check_outputs<T>(
+    outputs: &(impl HostOrDeviceSlice<T> + ?Sized),
+    number_of_states: usize,
+    output_len: usize,
+    width: usize,
+    recursive: bool,
+    ctx: &DeviceContext,
+) {
+    let outputs_size_expected = if recursive {
+        width * number_of_states
+    } else {
+        output_len * number_of_states
+    };
+
+    if outputs.len() < outputs_size_expected {
+        panic!(
+            "outputs len is {}; but needs to be at least {}",
+            outputs.len(),
+            outputs_size_expected,
+        );
+    }
+
+    let ctx_device_id = ctx.device_id;
+    if let Some(device_id) = outputs.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in outputs and context are different"
+        );
+    }
+    check_device(ctx_device_id);
+}
--- a/wrappers/rust/icicle-core/src/lib.rs
+++ b/wrappers/rust/icicle-core/src/lib.rs
@@ -1,7 +1,10 @@
+use std::ffi::c_void;
+
 pub mod curve;
 pub mod ecntt;
 pub mod error;
 pub mod field;
+pub mod hash;
 pub mod msm;
 pub mod ntt;
 pub mod polynomials;
@@ -18,3 +21,11 @@ where
    <Self::ScalarField as traits::FieldImpl>::Config: ntt::NTT<Self::ScalarField, Self::ScalarField>,
 {
 }
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct Matrix {
+    pub values: *const c_void,
+    pub width: usize,
+    pub height: usize,
+}
--- a/wrappers/rust/icicle-core/src/poseidon/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon/mod.rs
@@ -1,212 +1,157 @@
 #[doc(hidden)]
 pub mod tests;

-use icicle_cuda_runtime::{
-    device::check_device,
-    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
-    memory::{DeviceSlice, HostOrDeviceSlice},
+use std::{ffi::c_void, marker::PhantomData};
+
+use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice};
+
+use crate::{
+    error::IcicleResult,
+    hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
+    traits::FieldImpl,
 };

-use crate::{error::IcicleResult, traits::FieldImpl};
-
-#[repr(C)]
-pub struct PoseidonConstants<'a, F: FieldImpl> {
-    arity: u32,
-
-    partial_rounds: u32,
-
-    full_rounds_half: u32,
-
-    /// These should be pointers to data allocated on device
-    round_constants: &'a DeviceSlice<F>,
-    mds_matrix: &'a DeviceSlice<F>,
-    non_sparse_matrix: &'a DeviceSlice<F>,
-    sparse_matrices: &'a DeviceSlice<F>,
-
-    /// Domain tag is the first element in the Poseidon state.
-    /// For the Merkle tree mode it should equal 2^arity - 1
-    domain_tag: F,
+pub type PoseidonHandle = *const c_void; // opaque pointer produced by `PoseidonImpl::create` / `load`
+pub struct Poseidon<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
+{
+    width: usize, // set to `arity + 1` by both constructors
+    handle: PoseidonHandle, // passed to `hash_many`; released via `PoseidonImpl::delete` on drop
+    phantom: PhantomData<F>, // ties the instance to field `F` without storing a value of it
 }

-/// Struct that encodes Poseidon parameters to be passed into the [poseidon_hash_many](poseidon_hash_many) function.
-#[repr(C)]
-#[derive(Debug, Clone)]
-pub struct PoseidonConfig<'a> {
-    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-    pub ctx: DeviceContext<'a>,
-
-    are_inputs_on_device: bool,
-
-    are_outputs_on_device: bool,
-
-    /// If true, input is considered to be a states vector, holding the preimages
-    /// in aligned or not aligned format. Memory under the input pointer will be used for states
-    /// If false, fresh states memory will be allocated and input will be copied into it
-    pub input_is_a_state: bool,
-
-    /// If true - input should be already aligned for poseidon permutation.
-    /// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
-    /// not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
-    pub aligned: bool,
-
-    /// If true, hash results will also be copied in the input pointer in aligned format
-    pub loop_state: bool,
-
-    /// Whether to run Poseidon asynchronously. If set to `true`, Poseidon will be non-blocking
-    /// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
-    /// If set to `false`, Poseidon will block the current CPU thread.
-    pub is_async: bool,
-}
-
-impl<'a> Default for PoseidonConfig<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
+impl<F> Poseidon<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
+{
+    /// Builds a hasher from `PoseidonImpl::load` for `arity`; the stored width is `arity + 1`.
+    pub fn load(arity: usize, ctx: &DeviceContext) -> IcicleResult<Self> {
+        let loaded = <<F as FieldImpl>::Config as PoseidonImpl<F>>::load(arity as u32, ctx);
+        loaded.map(|handle| Self {
+            width: arity + 1,
+            handle,
+            phantom: PhantomData,
+        })
    }
-}

-impl<'a> PoseidonConfig<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        Self {
-            ctx: DeviceContext::default_for_device(device_id),
-            are_inputs_on_device: false,
-            are_outputs_on_device: false,
-            input_is_a_state: false,
-            aligned: false,
-            loop_state: false,
-            is_async: false,
-        }
-    }
-}
-
-pub trait Poseidon<F: FieldImpl> {
-    fn create_optimized_constants<'a>(
-        arity: u32,
+    /// Creates a hasher from caller-supplied parameters via `PoseidonImpl::create`.
+    pub fn new(
+        arity: usize,
+        alpha: u32,
        full_rounds_half: u32,
        partial_rounds: u32,
-        constants: &mut [F],
+        round_constants: &[F],
+        mds_matrix: &[F],
+        non_sparse_matrix: &[F],
+        sparse_matrices: &[F],
+        domain_tag: F,
        ctx: &DeviceContext,
-    ) -> IcicleResult<PoseidonConstants<'a, F>>;
-    fn load_optimized_constants<'a>(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonConstants<'a, F>>;
-    fn poseidon_unchecked(
-        input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    ) -> IcicleResult<Self> {
+        // Every argument goes to the backend; only the returned handle and the width are kept.
+        let created = <<F as FieldImpl>::Config as PoseidonImpl<F>>::create(
+            arity as u32,
+            alpha,
+            full_rounds_half,
+            partial_rounds,
+            round_constants,
+            mds_matrix,
+            non_sparse_matrix,
+            sparse_matrices,
+            domain_tag,
+            ctx,
+        );
+        created.map(|handle| Self {
+            width: arity + 1,
+            handle,
+            phantom: PhantomData,
+        })
+    }
+}
+
+impl<F> SpongeHash<F, F> for Poseidon<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
+{
+    fn get_handle(&self) -> *const c_void {
+        self.handle
+    }
+
+    /// Checks both buffers (panicking on a violation), then calls the backend `hash_many`.
+    fn hash_many(
+        &self,
+        inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
+        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        number_of_states: usize,
+        input_block_len: usize,
+        output_len: usize,
+        cfg: &SpongeConfig,
+    ) -> IcicleResult<()> {
+        sponge_check_input(inputs, number_of_states, input_block_len, self.width - 1, &cfg.ctx);
+        sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);
+        // The on-device flags are derived from the buffers on a private copy of `cfg`.
+        let mut device_cfg = cfg.clone();
+        device_cfg.are_inputs_on_device = inputs.is_on_device();
+        device_cfg.are_outputs_on_device = output.is_on_device();
+        <<F as FieldImpl>::Config as PoseidonImpl<F>>::hash_many(
+            inputs,
+            output,
+            number_of_states as u32,
+            input_block_len as u32,
+            output_len as u32,
+            self.handle,
+            &device_cfg,
+        )
+    }
+
+    fn default_config<'a>(&self) -> SpongeConfig<'a> {
+        let (width, mut config) = (self.width as u32, SpongeConfig::default());
+        config.input_rate = width - 1;
+        config.output_rate = width;
+        config
+    }
+}
+
+impl<F> Drop for Poseidon<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
+{
+    fn drop(&mut self) { // passes `handle` to `PoseidonImpl::delete`; `unwrap` panics if that returns an error
+        <<F as FieldImpl>::Config as PoseidonImpl<F>>::delete(self.handle).unwrap();
+    } // NOTE(review): a panic here while the thread is already unwinding aborts the process
+}
+
+pub trait PoseidonImpl<F: FieldImpl> {
+    fn create(
+        arity: u32,
+        alpha: u32,
+        full_rounds_half: u32,
+        partial_rounds: u32,
+        round_constants: &[F],
+        mds_matrix: &[F],
+        non_sparse_matrix: &[F],
+        sparse_matrices: &[F],
+        domain_tag: F,
+        ctx: &DeviceContext,
+    ) -> IcicleResult<PoseidonHandle>;
+
+    fn load(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonHandle>;
+
+    fn hash_many(
+        inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
        number_of_states: u32,
-        arity: u32,
-        constants: &PoseidonConstants<F>,
-        config: &PoseidonConfig,
+        input_block_len: u32,
+        output_len: u32,
+        poseidon: PoseidonHandle,
+        cfg: &SpongeConfig,
    ) -> IcicleResult<()>;
-}

-/// Loads pre-calculated poseidon constants on the GPU.
-pub fn load_optimized_poseidon_constants<'a, F>(
-    arity: u32,
-    ctx: &DeviceContext,
-) -> IcicleResult<PoseidonConstants<'a, F>>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon<F>,
-{
-    <<F as FieldImpl>::Config as Poseidon<F>>::load_optimized_constants(arity, ctx)
-}
-
-/// Creates new instance of poseidon constants on the GPU.
-pub fn create_optimized_poseidon_constants<'a, F>(
-    arity: u32,
-    ctx: &DeviceContext,
-    full_rounds_half: u32,
-    partial_rounds: u32,
-    constants: &mut [F],
-) -> IcicleResult<PoseidonConstants<'a, F>>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon<F>,
-{
-    <<F as FieldImpl>::Config as Poseidon<F>>::create_optimized_constants(
-        arity,
-        full_rounds_half,
-        partial_rounds,
-        constants,
-        ctx,
-    )
-}
-
-/// Computes the poseidon hashes for multiple preimages.
-///
-/// # Arguments
-///
-/// * `input` - a pointer to the input data. May point to a vector of preimages or a vector of states filled with preimages.
-///
-/// * `output` - a pointer to the output data. Must be at least of size [number_of_states](number_of_states)
-///
-/// * `number_of_states` - number of input blocks of size `arity`
-///
-/// * `arity` - the arity of the hash function (the size of 1 preimage)
-///
-/// * `constants` - Poseidon constants.
-///
-/// * `config` - config used to specify extra arguments of the Poseidon.
-pub fn poseidon_hash_many<F>(
-    input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
-    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
-    number_of_states: u32,
-    arity: u32,
-    constants: &PoseidonConstants<F>,
-    config: &PoseidonConfig,
-) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon<F>,
-{
-    let input_len_required = if config.input_is_a_state {
-        number_of_states * (arity + 1)
-    } else {
-        number_of_states * arity
-    };
-
-    if input.len() < input_len_required as usize {
-        panic!(
-            "input len is {}; but needs to be at least {}",
-            input.len(),
-            input_len_required
-        );
-    }
-
-    if output.len() < number_of_states as usize {
-        panic!(
-            "output len is {}; but needs to be at least {}",
-            output.len(),
-            number_of_states
-        );
-    }
-
-    let ctx_device_id = config
-        .ctx
-        .device_id;
-    if let Some(device_id) = input.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in input and context are different"
-        );
-    }
-    if let Some(device_id) = output.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in output and context are different"
-        );
-    }
-    check_device(ctx_device_id);
-    let mut local_cfg = config.clone();
-    local_cfg.are_inputs_on_device = input.is_on_device();
-    local_cfg.are_outputs_on_device = output.is_on_device();
-
-    <<F as FieldImpl>::Config as Poseidon<F>>::poseidon_unchecked(
-        input,
-        output,
-        number_of_states,
-        arity,
-        constants,
-        &local_cfg,
-    )
+    fn delete(poseidon: PoseidonHandle) -> IcicleResult<()>;
 }

 #[macro_export]
@@ -218,91 +163,110 @@ macro_rules! impl_poseidon {
      $field_config:ident
    ) => {
        mod $field_prefix_ident {
-            use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, PoseidonConfig, PoseidonConstants};
+            use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, PoseidonHandle, SpongeConfig};
            extern "C" {
-                #[link_name = concat!($field_prefix, "_create_optimized_poseidon_constants_cuda")]
-                pub(crate) fn _create_optimized_constants(
+                #[link_name = concat!($field_prefix, "_poseidon_create_cuda")]
+                pub(crate) fn create(
+                    poseidon: *mut PoseidonHandle,
                    arity: u32,
+                    alpha: u32,
                    full_rounds_half: u32,
                    partial_rounds: u32,
-                    constants: *mut $field,
+                    round_constants: *const $field,
+                    mds_matrix: *const $field,
+                    non_sparse_matrix: *const $field,
+                    sparse_matrices: *const $field,
+                    domain_tag: $field,
                    ctx: &DeviceContext,
-                    poseidon_constants: *mut PoseidonConstants<$field>,
                ) -> CudaError;

-                #[link_name = concat!($field_prefix, "_init_optimized_poseidon_constants_cuda")]
-                pub(crate) fn _load_optimized_constants(
-                    arity: u32,
-                    ctx: &DeviceContext,
-                    constants: *mut PoseidonConstants<$field>,
-                ) -> CudaError;
+                #[link_name = concat!($field_prefix, "_poseidon_load_cuda")]
+                pub(crate) fn load(poseidon: *mut PoseidonHandle, arity: u32, ctx: &DeviceContext) -> CudaError;

-                #[link_name = concat!($field_prefix, "_poseidon_hash_cuda")]
+                #[link_name = concat!($field_prefix, "_poseidon_delete_cuda")]
+                pub(crate) fn delete(poseidon: PoseidonHandle) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "_poseidon_hash_many_cuda")]
                pub(crate) fn hash_many(
-                    input: *mut $field,
+                    poseidon: PoseidonHandle,
+                    inputs: *const $field,
                    output: *mut $field,
                    number_of_states: u32,
-                    arity: u32,
-                    constants: &PoseidonConstants<$field>,
-                    config: &PoseidonConfig,
+                    input_block_len: u32,
+                    output_len: u32,
+                    cfg: &SpongeConfig,
                ) -> CudaError;
            }
        }

-        impl Poseidon<$field> for $field_config {
-            fn create_optimized_constants<'a>(
+        impl PoseidonImpl<$field> for $field_config {
+            fn create(
                arity: u32,
+                alpha: u32,
                full_rounds_half: u32,
                partial_rounds: u32,
-                constants: &mut [$field],
+                round_constants: &[$field],
+                mds_matrix: &[$field],
+                non_sparse_matrix: &[$field],
+                sparse_matrices: &[$field],
+                domain_tag: $field,
                ctx: &DeviceContext,
-            ) -> IcicleResult<PoseidonConstants<'a, $field>> {
+            ) -> IcicleResult<PoseidonHandle> {
                unsafe {
-                    let mut poseidon_constants = MaybeUninit::<PoseidonConstants<'a, $field>>::uninit();
-                    let err = $field_prefix_ident::_create_optimized_constants(
+                    let mut poseidon = MaybeUninit::<PoseidonHandle>::uninit();
+                    $field_prefix_ident::create(
+                        poseidon.as_mut_ptr(),
                        arity,
+                        alpha,
                        full_rounds_half,
                        partial_rounds,
-                        constants as *mut _ as *mut $field,
+                        round_constants as *const _ as *const $field,
+                        mds_matrix as *const _ as *const $field,
+                        non_sparse_matrix as *const _ as *const $field,
+                        sparse_matrices as *const _ as *const $field,
+                        domain_tag,
                        ctx,
-                        poseidon_constants.as_mut_ptr(),
                    )
-                    .wrap();
-                    err.and(Ok(poseidon_constants.assume_init()))
+                    .wrap()
+                    .map(|_| poseidon.assume_init()) // lazy: the handle is read only after `create` succeeded
                }
            }

-            fn load_optimized_constants<'a>(
-                arity: u32,
-                ctx: &DeviceContext,
-            ) -> IcicleResult<PoseidonConstants<'a, $field>> {
+            fn load(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonHandle> {
                unsafe {
-                    let mut constants = MaybeUninit::<PoseidonConstants<'a, $field>>::uninit();
-                    let err = $field_prefix_ident::_load_optimized_constants(arity, ctx, constants.as_mut_ptr()).wrap();
-                    err.and(Ok(constants.assume_init()))
+                    let mut poseidon = MaybeUninit::<PoseidonHandle>::uninit();
+                    $field_prefix_ident::load(poseidon.as_mut_ptr(), arity, ctx)
+                        .wrap()
+                        .map(|_| poseidon.assume_init()) // lazy: the handle is read only after `load` succeeded
                }
            }

-            fn poseidon_unchecked(
-                input: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+            fn hash_many(
+                inputs: &(impl HostOrDeviceSlice<$field> + ?Sized),
                output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                number_of_states: u32,
-                arity: u32,
-                constants: &PoseidonConstants<$field>,
-                config: &PoseidonConfig,
+                input_block_len: u32,
+                output_len: u32,
+                poseidon: PoseidonHandle,
+                cfg: &SpongeConfig,
            ) -> IcicleResult<()> {
                unsafe {
                    $field_prefix_ident::hash_many(
-                        input.as_mut_ptr(),
+                        poseidon,
+                        inputs.as_ptr(),
                        output.as_mut_ptr(),
                        number_of_states,
-                        arity,
-                        constants,
-                        config,
+                        input_block_len,
+                        output_len,
+                        cfg,
                    )
                    .wrap()
                }
            }
+
+            fn delete(poseidon: PoseidonHandle) -> IcicleResult<()> { // forwards the handle to the extern `delete`
+                unsafe { $field_prefix_ident::delete(poseidon).wrap() } // linked as `<prefix>_poseidon_delete_cuda`
+            }
        }
    };
 }
@@ -318,18 +282,3 @@ macro_rules! impl_poseidon_tests {
        }
    };
 }
-
-#[macro_export]
-macro_rules! impl_poseidon_custom_config_test {
-    (
-      $field:ident,
-      $field_bytes:literal,
-      $field_prefix:literal,
-      $partial_rounds:literal
-    ) => {
-        #[test]
-        fn test_poseidon_custom_config() {
-            check_poseidon_custom_config::<$field>($field_bytes, $field_prefix, $partial_rounds)
-        }
-    };
-}
--- a/wrappers/rust/icicle-core/src/poseidon/tests.rs
+++ b/wrappers/rust/icicle-core/src/poseidon/tests.rs
@@ -1,105 +1,48 @@
+use crate::hash::SpongeHash;
 use crate::traits::FieldImpl;
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::memory::{HostOrDeviceSlice, HostSlice};

-use std::io::Read;
-use std::path::PathBuf;
-use std::{env, fs::File};
+use super::{Poseidon, PoseidonImpl};

-use super::{
-    create_optimized_poseidon_constants, load_optimized_poseidon_constants, poseidon_hash_many, Poseidon,
-    PoseidonConfig, PoseidonConstants,
-};
-
-pub fn init_poseidon<'a, F: FieldImpl>(arity: u32) -> PoseidonConstants<'a, F>
+pub fn init_poseidon<F: FieldImpl>(arity: usize) -> Poseidon<F>
 where
-    <F as FieldImpl>::Config: Poseidon<F>,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
 {
    let ctx = DeviceContext::default();
-
-    load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
+    Poseidon::load(arity, &ctx).unwrap()
 }

-pub fn _check_poseidon_hash_many<F: FieldImpl>(constants: PoseidonConstants<F>) -> (F, F)
+pub fn _check_poseidon_hash_many<F: FieldImpl>(poseidon: Poseidon<F>)
 where
-    <F as FieldImpl>::Config: Poseidon<F>,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
 {
    let test_size = 1 << 10;
-    let arity = 2u32;
-    let mut inputs = vec![F::one(); test_size * arity as usize];
+    let arity = poseidon.width - 1;
+    let mut inputs = vec![F::one(); test_size * arity];
    let mut outputs = vec![F::zero(); test_size];

    let input_slice = HostSlice::from_mut_slice(&mut inputs);
    let output_slice = HostSlice::from_mut_slice(&mut outputs);

-    let config = PoseidonConfig::default();
-    poseidon_hash_many::<F>(
-        input_slice,
-        output_slice,
-        test_size as u32,
-        arity as u32,
-        &constants,
-        &config,
-    )
-    .unwrap();
+    let cfg = poseidon.default_config();
+    poseidon
+        .hash_many(input_slice, output_slice, test_size, arity, 1, &cfg)
+        .unwrap();

    let a1 = output_slice[0];
-    let a2 = output_slice[output_slice.len() - 2];
+    let a2 = output_slice[output_slice.len() - 1];

-    println!("first: {:?}, last: {:?}", a1, a2);
    assert_eq!(a1, a2);
-
-    (a1, a2)
 }

 pub fn check_poseidon_hash_many<F: FieldImpl>()
 where
-    <F as FieldImpl>::Config: Poseidon<F>,
+    <F as FieldImpl>::Config: PoseidonImpl<F>,
 {
-    for arity in [2, 4] {
-        let constants = init_poseidon::<F>(arity as u32);
+    for arity in [2, 4, 8, 11] {
+        let poseidon = init_poseidon::<F>(arity);

-        _check_poseidon_hash_many(constants);
+        _check_poseidon_hash_many(poseidon);
    }
 }
-
-pub fn check_poseidon_custom_config<F: FieldImpl>(field_bytes: usize, field_prefix: &str, partial_rounds: u32)
-where
-    <F as FieldImpl>::Config: Poseidon<F>,
-{
-    let arity = 2u32;
-    let constants = init_poseidon::<F>(arity as u32);
-
-    let full_rounds_half = 4;
-
-    let ctx = DeviceContext::default();
-    let cargo_manifest_dir = env!("CARGO_MANIFEST_DIR");
-    let constants_file = PathBuf::from(cargo_manifest_dir)
-        .join("tests")
-        .join(format!("{}_constants.bin", field_prefix));
-    let mut constants_buf = vec![];
-    File::open(constants_file)
-        .unwrap()
-        .read_to_end(&mut constants_buf)
-        .unwrap();
-
-    let mut custom_constants = vec![];
-    for chunk in constants_buf.chunks(field_bytes) {
-        custom_constants.push(F::from_bytes_le(chunk));
-    }
-
-    let custom_constants = create_optimized_poseidon_constants::<F>(
-        arity as u32,
-        &ctx,
-        full_rounds_half,
-        partial_rounds,
-        &mut custom_constants,
-    )
-    .unwrap();
-
-    let (a1, a2) = _check_poseidon_hash_many(constants);
-    let (b1, b2) = _check_poseidon_hash_many(custom_constants);
-
-    assert_eq!(a1, b1);
-    assert_eq!(a2, b2);
-}
--- a/wrappers/rust/icicle-core/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon2/mod.rs
@@ -1,107 +1,66 @@
 #[doc(hidden)]
 pub mod tests;

-use icicle_cuda_runtime::{
-    device::check_device,
-    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
-    memory::{DeviceSlice, HostOrDeviceSlice},
+use std::{ffi::c_void, marker::PhantomData};
+
+use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice};
+
+use crate::{
+    error::IcicleResult,
+    hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
+    traits::FieldImpl,
 };

-use crate::{error::IcicleResult, traits::FieldImpl};
-
 #[repr(C)]
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub enum DiffusionStrategy {
    Default,
    Montgomery,
 }

 #[repr(C)]
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub enum MdsType {
    Default,
    Plonky,
 }

-#[repr(C)]
-#[derive(Debug, Clone)]
-pub enum PoseidonMode {
-    Compression,
-    Permutation,
+pub type Poseidon2Handle = *const c_void; // opaque pointer produced by `Poseidon2Impl::create` / `load`
+pub struct Poseidon2<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
+{
+    width: usize, // the `width` argument given to `load` / `new`
+    handle: Poseidon2Handle, // passed to `hash_many`; released via `Poseidon2Impl::delete` on drop
+    phantom: PhantomData<F>, // ties the instance to field `F` without storing a value of it
 }

-#[repr(C)]
-pub struct Poseidon2Constants<'a, F: FieldImpl> {
-    width: u32,
-
-    alpha: u32,
-
-    internal_rounds: u32,
-
-    external_rounds: u32,
-
-    round_constants: &'a DeviceSlice<F>,
-
-    inernal_matrix_diag: &'a DeviceSlice<F>,
-
-    pub mds_type: MdsType,
-
-    pub diffusion: DiffusionStrategy,
-}
-
-impl<F: FieldImpl> std::fmt::Debug for Poseidon2Constants<'_, F> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}, {}, {}, {}",
-            self.width, self.alpha, self.internal_rounds, self.external_rounds
-        )
+impl<F> Poseidon2<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
+{
+    /// Builds a hasher from `Poseidon2Impl::load`; `rate` is forwarded to the backend and
+    /// only `width` is stored on the returned value.
+    pub fn load(
+        width: usize,
+        rate: usize,
+        mds_type: MdsType,
+        diffusion: DiffusionStrategy,
+        ctx: &DeviceContext,
+    ) -> IcicleResult<Self> {
+        <<F as FieldImpl>::Config as Poseidon2Impl<F>>::load(width as u32, rate as u32, mds_type, diffusion, ctx)
+            .map(|handle| Self {
+                width,
+                handle,
+                phantom: PhantomData,
+            })
    }
-}

-/// Struct that encodes Poseidon parameters to be passed into the [poseidon_hash_many](poseidon_hash_many) function.
-#[repr(C)]
-#[derive(Debug, Clone)]
-pub struct Poseidon2Config<'a> {
-    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
-    pub ctx: DeviceContext<'a>,
-
-    are_states_on_device: bool,
-
-    are_outputs_on_device: bool,
-
-    pub mode: PoseidonMode,
-
-    pub output_index: u32,
-
-    /// Whether to run Poseidon asynchronously. If set to `true`, Poseidon will be non-blocking
-    /// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
-    /// If set to `false`, Poseidon will block the current CPU thread.
-    pub is_async: bool,
-}
-
-impl<'a> Default for Poseidon2Config<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
-    }
-}
-
-impl<'a> Poseidon2Config<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        Self {
-            ctx: DeviceContext::default_for_device(device_id),
-            are_states_on_device: false,
-            are_outputs_on_device: false,
-            mode: PoseidonMode::Compression,
-            output_index: 1,
-            is_async: false,
-        }
-    }
-}
-
-pub trait Poseidon2<F: FieldImpl> {
-    fn create_constants<'a>(
-        width: u32,
+    pub fn new(
+        width: usize,
+        rate: usize,
        alpha: u32,
        internal_rounds: u32,
        external_rounds: u32,
@@ -110,191 +69,122 @@ pub trait Poseidon2<F: FieldImpl> {
        mds_type: MdsType,
        diffusion: DiffusionStrategy,
        ctx: &DeviceContext,
-    ) -> IcicleResult<Poseidon2Constants<'a, F>>;
-    fn load_constants<'a>(
+    ) -> IcicleResult<Self> {
+        <<F as FieldImpl>::Config as Poseidon2Impl<F>>::create(
+            width as u32,
+            rate as u32,
+            alpha,
+            internal_rounds,
+            external_rounds,
+            round_constants,
+            internal_matrix_diag,
+            mds_type,
+            diffusion,
+            ctx,
+        )
+        .and_then(|handle| {
+            Ok(Self {
+                width,
+                handle,
+                phantom: PhantomData,
+            })
+        })
+    }
+}
+
+impl<F> SpongeHash<F, F> for Poseidon2<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
+{
+    fn get_handle(&self) -> *const c_void {
+        self.handle
+    }
+
+    /// Checks both buffers (panicking on a violation), then calls the backend `hash_many`.
+    ///
+    /// The input check receives `cfg.input_rate`; the output check receives `self.width`.
+    fn hash_many(
+        &self,
+        inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
+        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        number_of_states: usize,
+        input_block_len: usize,
+        output_len: usize,
+        cfg: &SpongeConfig,
+    ) -> IcicleResult<()> {
+        let input_rate = cfg.input_rate as usize;
+        sponge_check_input(inputs, number_of_states, input_block_len, input_rate, &cfg.ctx);
+        sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);
+
+        // The on-device flags are derived from the buffers on a private copy of `cfg`.
+        let mut device_cfg = cfg.clone();
+        device_cfg.are_inputs_on_device = inputs.is_on_device();
+        device_cfg.are_outputs_on_device = output.is_on_device();
+
+        <<F as FieldImpl>::Config as Poseidon2Impl<F>>::hash_many(
+            inputs,
+            output,
+            number_of_states as u32,
+            input_block_len as u32,
+            output_len as u32,
+            self.handle,
+            &device_cfg,
+        )
+    }
+
+    /// Default sponge configuration with both rates set to `self.width`.
+    fn default_config<'a>(&self) -> SpongeConfig<'a> {
+        let (width, mut config) = (self.width as u32, SpongeConfig::default());
+        config.input_rate = width;
+        config.output_rate = width;
+        config
+    }
+}
+
+impl<F> Drop for Poseidon2<F>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
+{
+    fn drop(&mut self) { // passes `handle` to `Poseidon2Impl::delete`; `unwrap` panics if that returns an error
+        <<F as FieldImpl>::Config as Poseidon2Impl<F>>::delete(self.handle).unwrap();
+    } // NOTE(review): a panic here while the thread is already unwinding aborts the process
+}
+
+pub trait Poseidon2Impl<F: FieldImpl> {
+    fn create(
        width: u32,
+        rate: u32,
+        alpha: u32,
+        internal_rounds: u32,
+        external_rounds: u32,
+        round_constants: &[F],
+        internal_matrix_diag: &[F],
        mds_type: MdsType,
        diffusion: DiffusionStrategy,
        ctx: &DeviceContext,
-    ) -> IcicleResult<Poseidon2Constants<'a, F>>;
-    fn poseidon_unchecked(
-        states: &(impl HostOrDeviceSlice<F> + ?Sized),
+    ) -> IcicleResult<Poseidon2Handle>;
+
+    fn load(
+        width: u32,
+        rate: u32,
+        mds_type: MdsType,
+        diffusion: DiffusionStrategy,
+        ctx: &DeviceContext,
+    ) -> IcicleResult<Poseidon2Handle>;
+
+    fn hash_many(
+        inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
        number_of_states: u32,
-        width: u32,
-        constants: &Poseidon2Constants<F>,
-        config: &Poseidon2Config,
+        input_block_len: u32,
+        output_len: u32,
+        poseidon: Poseidon2Handle,
+        cfg: &SpongeConfig,
    ) -> IcicleResult<()>;
-    fn poseidon_unchecked_inplace(
-        states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
-        number_of_states: u32,
-        width: u32,
-        constants: &Poseidon2Constants<F>,
-        config: &Poseidon2Config,
-    ) -> IcicleResult<()>;
-    fn release_constants(constants: &Poseidon2Constants<F>, ctx: &DeviceContext) -> IcicleResult<()>;
-}

-/// Loads pre-calculated poseidon constants on the GPU.
-pub fn load_poseidon2_constants<'a, F>(
-    width: u32,
-    mds_type: MdsType,
-    diffusion: DiffusionStrategy,
-    ctx: &DeviceContext,
-) -> IcicleResult<Poseidon2Constants<'a, F>>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    <<F as FieldImpl>::Config as Poseidon2<F>>::load_constants(width, mds_type, diffusion, ctx)
-}
-
-/// Creates new instance of poseidon constants on the GPU.
-pub fn create_poseidon2_constants<'a, F>(
-    width: u32,
-    alpha: u32,
-    ctx: &DeviceContext,
-    internal_rounds: u32,
-    external_rounds: u32,
-    round_constants: &mut [F],
-    internal_matrix_diag: &mut [F],
-    mds_type: MdsType,
-    diffusion: DiffusionStrategy,
-) -> IcicleResult<Poseidon2Constants<'a, F>>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    <<F as FieldImpl>::Config as Poseidon2<F>>::create_constants(
-        width,
-        alpha,
-        internal_rounds,
-        external_rounds,
-        round_constants,
-        internal_matrix_diag,
-        mds_type,
-        diffusion,
-        ctx,
-    )
-}
-
-fn poseidon_checks<F>(
-    states: &(impl HostOrDeviceSlice<F> + ?Sized),
-    output: &(impl HostOrDeviceSlice<F> + ?Sized),
-    number_of_states: u32,
-    width: u32,
-    config: &Poseidon2Config,
-) where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    if states.len() < (number_of_states * width) as usize {
-        panic!(
-            "input len is {}; but needs to be at least {}",
-            states.len(),
-            number_of_states * width
-        );
-    }
-    if output.len() < number_of_states as usize {
-        panic!(
-            "output len is {}; but needs to be at least {}",
-            output.len(),
-            number_of_states
-        );
-    }
-
-    let ctx_device_id = config
-        .ctx
-        .device_id;
-    if let Some(device_id) = states.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in input and context are different"
-        );
-    }
-
-    if let Some(device_id) = output.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in output and context are different"
-        );
-    }
-    check_device(ctx_device_id);
-}
-
-/// Computes the poseidon hashes for multiple preimages.
-///
-/// # Arguments
-///
-/// * `input` - a pointer to the input data. May point to a vector of preimages or a vector of states filled with preimages.
-///
-/// * `output` - a pointer to the output data. Must be at least of size [number_of_states](number_of_states)
-///
-/// * `number_of_states` - number of input blocks of size `arity`
-///
-/// * `arity` - the arity of the hash function (the size of 1 preimage)
-///
-/// * `constants` - Poseidon constants.
-///
-/// * `config` - config used to specify extra arguments of the Poseidon.
-pub fn poseidon2_hash_many<F>(
-    states: &(impl HostOrDeviceSlice<F> + ?Sized),
-    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
-    number_of_states: u32,
-    width: u32,
-    constants: &Poseidon2Constants<F>,
-    config: &Poseidon2Config,
-) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    poseidon_checks(states, output, number_of_states, width, config);
-    let mut local_cfg = config.clone();
-    local_cfg.are_states_on_device = states.is_on_device();
-    local_cfg.are_outputs_on_device = output.is_on_device();
-
-    <<F as FieldImpl>::Config as Poseidon2<F>>::poseidon_unchecked(
-        states,
-        output,
-        number_of_states,
-        width,
-        constants,
-        &local_cfg,
-    )
-}
-
-pub fn poseidon2_hash_many_inplace<F>(
-    states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
-    number_of_states: u32,
-    width: u32,
-    constants: &Poseidon2Constants<F>,
-    config: &Poseidon2Config,
-) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    poseidon_checks(states, states, number_of_states, width, config);
-    let mut local_cfg = config.clone();
-    local_cfg.are_states_on_device = states.is_on_device();
-    local_cfg.are_outputs_on_device = states.is_on_device();
-
-    <<F as FieldImpl>::Config as Poseidon2<F>>::poseidon_unchecked_inplace(
-        states,
-        number_of_states,
-        width,
-        constants,
-        &local_cfg,
-    )
-}
-
-pub fn release_poseidon2_constants<'a, F>(constants: &Poseidon2Constants<F>, ctx: &DeviceContext) -> IcicleResult<()>
-where
-    F: FieldImpl,
-    <F as FieldImpl>::Config: Poseidon2<F>,
-{
-    <<F as FieldImpl>::Config as Poseidon2<F>>::release_constants(constants, ctx)
+    fn delete(poseidon: Poseidon2Handle) -> IcicleResult<()>;
 }

 #[macro_export]
@@ -307,140 +197,125 @@ macro_rules! impl_poseidon2 {
    ) => {
        mod $field_prefix_ident {
            use crate::poseidon2::{
-                $field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Config,
-                Poseidon2Constants,
+                $field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Handle,
+                SpongeConfig,
            };
+            use icicle_core::error::IcicleError;
            extern "C" {
-                #[link_name = concat!($field_prefix, "_create_poseidon2_constants_cuda")]
-                pub(crate) fn _create_constants(
+                #[link_name = concat!($field_prefix, "_poseidon2_create_cuda")]
+                pub(crate) fn create(
+                    poseidon: *mut Poseidon2Handle,
                    width: u32,
+                    rate: u32,
                    alpha: u32,
                    internal_rounds: u32,
                    external_rounds: u32,
-                    constants: *mut $field,
-                    internal_matrix_diag: *mut $field,
+                    constants: *const $field,
+                    internal_matrix_diag: *const $field,
                    mds_type: MdsType,
                    diffusion: DiffusionStrategy,
                    ctx: &DeviceContext,
-                    poseidon_constants: *mut Poseidon2Constants<$field>,
                ) -> CudaError;

-                #[link_name = concat!($field_prefix, "_init_poseidon2_constants_cuda")]
-                pub(crate) fn _load_constants(
+                #[link_name = concat!($field_prefix, "_poseidon2_load_cuda")]
+                pub(crate) fn load(
+                    poseidon: *mut Poseidon2Handle,
                    width: u32,
+                    rate: u32,
                    mds_type: MdsType,
                    diffusion: DiffusionStrategy,
                    ctx: &DeviceContext,
-                    constants: *mut Poseidon2Constants<$field>,
                ) -> CudaError;

-                #[link_name = concat!($field_prefix, "_release_poseidon2_constants_cuda")]
-                pub(crate) fn _release_constants(
-                    constants: &Poseidon2Constants<$field>,
-                    ctx: &DeviceContext,
-                ) -> CudaError;
+                #[link_name = concat!($field_prefix, "_poseidon2_delete_cuda")]
+                pub(crate) fn delete(poseidon: Poseidon2Handle) -> CudaError;

-                #[link_name = concat!($field_prefix, "_poseidon2_hash_cuda")]
+                #[link_name = concat!($field_prefix, "_poseidon2_hash_many_cuda")]
                pub(crate) fn hash_many(
-                    states: *const $field,
+                    poseidon: Poseidon2Handle,
+                    inputs: *const $field,
                    output: *mut $field,
                    number_of_states: u32,
-                    width: u32,
-                    constants: &Poseidon2Constants<$field>,
-                    config: &Poseidon2Config,
+                    input_block_len: u32,
+                    output_len: u32,
+                    cfg: &SpongeConfig,
                ) -> CudaError;
            }
        }

-        impl Poseidon2<$field> for $field_config {
-            fn create_constants<'a>(
+        impl Poseidon2Impl<$field> for $field_config {
+            fn create( // builds a Poseidon2 instance through the `_poseidon2_create_cuda` FFI entry point and returns its handle
                width: u32,
+                rate: u32,
                alpha: u32,
                internal_rounds: u32,
                external_rounds: u32,
-                round_constants: &mut [$field],
-                internal_matrix_diag: &mut [$field],
+                round_constants: &[$field],
+                internal_matrix_diag: &[$field],
                mds_type: MdsType,
                diffusion: DiffusionStrategy,
                ctx: &DeviceContext,
-            ) -> IcicleResult<Poseidon2Constants<'a, $field>> {
+            ) -> IcicleResult<Poseidon2Handle> {
                unsafe {
-                    let mut poseidon_constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
-                    let err = $field_prefix_ident::_create_constants(
+                    let mut poseidon = MaybeUninit::<Poseidon2Handle>::uninit(); // out-slot the FFI call is expected to fill on success
+                    $field_prefix_ident::create(
+                        poseidon.as_mut_ptr(),
                        width,
+                        rate,
                        alpha,
                        internal_rounds,
                        external_rounds,
-                        round_constants as *mut _ as *mut $field,
-                        internal_matrix_diag as *mut _ as *mut $field,
+                        round_constants as *const _ as *const $field,
+                        internal_matrix_diag as *const _ as *const $field,
                        mds_type,
                        diffusion,
                        ctx,
-                        poseidon_constants.as_mut_ptr(),
                    )
-                    .wrap();
-                    err.and(Ok(poseidon_constants.assume_init()))
+                    .wrap()
+                    .map(|_| poseidon.assume_init()) // lazy: the out-slot is read only after success; `.and(Ok(..))` read it eagerly even on error
                }
            }

-            fn load_constants<'a>(
+            fn load( // obtains a Poseidon2 handle through the `_poseidon2_load_cuda` FFI entry point
                width: u32,
+                rate: u32,
                mds_type: MdsType,
                diffusion: DiffusionStrategy,
                ctx: &DeviceContext,
-            ) -> IcicleResult<Poseidon2Constants<'a, $field>> {
+            ) -> IcicleResult<Poseidon2Handle> {
                unsafe {
-                    let mut constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
-                    let err =
-                        $field_prefix_ident::_load_constants(width, mds_type, diffusion, ctx, constants.as_mut_ptr())
-                            .wrap();
-                    err.and(Ok(constants.assume_init()))
+                    let mut poseidon = MaybeUninit::<Poseidon2Handle>::uninit(); // out-slot the FFI call is expected to fill on success
+                    $field_prefix_ident::load(poseidon.as_mut_ptr(), width, rate, mds_type, diffusion, ctx)
+                        .wrap()
+                        .map(|_| poseidon.assume_init()) // lazy: the out-slot is read only after success; `.and(Ok(..))` read it eagerly even on error
                }
            }

-            fn poseidon_unchecked(
-                states: &(impl HostOrDeviceSlice<$field> + ?Sized),
+            fn hash_many(
+                inputs: &(impl HostOrDeviceSlice<$field> + ?Sized),
                output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                number_of_states: u32,
-                width: u32,
-                constants: &Poseidon2Constants<$field>,
-                config: &Poseidon2Config,
+                input_block_len: u32,
+                output_len: u32,
+                poseidon: Poseidon2Handle,
+                cfg: &SpongeConfig,
            ) -> IcicleResult<()> {
                unsafe {
                    $field_prefix_ident::hash_many(
-                        states.as_ptr(),
+                        poseidon,
+                        inputs.as_ptr(),
                        output.as_mut_ptr(),
                        number_of_states,
-                        width,
-                        constants,
-                        config,
+                        input_block_len,
+                        output_len,
+                        cfg,
                    )
                    .wrap()
                }
            }

-            fn poseidon_unchecked_inplace(
-                states: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
-                number_of_states: u32,
-                width: u32,
-                constants: &Poseidon2Constants<$field>,
-                config: &Poseidon2Config,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::hash_many(
-                        states.as_ptr(),
-                        states.as_mut_ptr(),
-                        number_of_states,
-                        width,
-                        constants,
-                        config,
-                    )
-                    .wrap()
-                }
-            }
-
-            fn release_constants<'a>(constants: &Poseidon2Constants<$field>, ctx: &DeviceContext) -> IcicleResult<()> {
-                unsafe { $field_prefix_ident::_release_constants(constants, ctx).wrap() }
+            fn delete(poseidon: Poseidon2Handle) -> IcicleResult<()> {
+                unsafe { $field_prefix_ident::delete(poseidon).wrap() }
            }
        }
    };
@@ -466,42 +341,41 @@ pub mod bench {
    };

    use crate::{
+        hash::SpongeHash,
        ntt::FieldImpl,
-        poseidon2::{load_poseidon2_constants, DiffusionStrategy, MdsType},
+        poseidon2::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Impl},
        traits::GenerateRandom,
        vec_ops::VecOps,
    };

-    use super::{poseidon2_hash_many, Poseidon2, Poseidon2Config, Poseidon2Constants};
-
    #[allow(unused)]
-    fn poseidon2_for_bench<'a, F: FieldImpl>(
+    fn poseidon2_for_bench<F: FieldImpl>(
+        poseidon: &Poseidon2<F>,
        states: &(impl HostOrDeviceSlice<F> + ?Sized),
        poseidon2_result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
        number_of_states: usize,
-        width: usize,
-        constants: &Poseidon2Constants<'a, F>,
-        config: &Poseidon2Config,
+        ctx: &DeviceContext,
        _seed: u32,
    ) where
-        <F as FieldImpl>::Config: Poseidon2<F> + GenerateRandom<F>,
-        <F as FieldImpl>::Config: VecOps<F>,
+        <F as FieldImpl>::Config: Poseidon2Impl<F> + GenerateRandom<F>,
    {
-        poseidon2_hash_many(
-            states,
-            poseidon2_result,
-            number_of_states as u32,
-            width as u32,
-            constants,
-            config,
-        )
-        .unwrap();
+        let cfg = poseidon.default_config();
+        poseidon
+            .hash_many(
+                states,
+                poseidon2_result,
+                number_of_states,
+                poseidon.width,
+                poseidon.width,
+                &cfg,
+            )
+            .unwrap();
    }

    #[allow(unused)]
    pub fn benchmark_poseidon2<F: FieldImpl>(c: &mut Criterion)
    where
-        <F as FieldImpl>::Config: Poseidon2<F> + GenerateRandom<F>,
+        <F as FieldImpl>::Config: Poseidon2Impl<F> + GenerateRandom<F>,
        <F as FieldImpl>::Config: VecOps<F>,
    {
        use criterion::SamplingMode;
@@ -519,7 +393,7 @@ pub mod bench {
            .parse::<u32>()
            .unwrap_or(MAX_LOG2);

-        for test_size_log2 in 13u32..max_log2 + 1 {
+        for test_size_log2 in 18u32..max_log2 + 1 {
            for t in [2, 3, 4, 8, 16, 20, 24] {
                let number_of_states = 1 << test_size_log2;
                let full_size = t * number_of_states;
@@ -531,31 +405,27 @@ pub mod bench {
                let permutation_result_slice = HostSlice::from_mut_slice(&mut permutation_result);

                let ctx = DeviceContext::default();
-                let config = Poseidon2Config::default();
-                for mds in [MdsType::Default, MdsType::Plonky] {
-                    for diffusion in [DiffusionStrategy::Default, DiffusionStrategy::Montgomery] {
-                        let constants =
-                            load_poseidon2_constants(t as u32, mds.clone(), diffusion.clone(), &ctx).unwrap();
-                        let bench_descr = format!(
-                            "Mds::{:?}; Diffusion::{:?}; Number of states: {}; Width: {}",
-                            mds, diffusion, number_of_states, t
-                        );
-                        group.bench_function(&bench_descr, |b| {
-                            b.iter(|| {
-                                poseidon2_for_bench::<F>(
-                                    input,
-                                    permutation_result_slice,
-                                    number_of_states,
-                                    t,
-                                    &constants,
-                                    &config,
-                                    black_box(1),
-                                )
-                            })
-                        });
-
-                        // }
-                    }
+                for (mds, diffusion) in [
+                    (MdsType::Default, DiffusionStrategy::Default),
+                    (MdsType::Plonky, DiffusionStrategy::Montgomery),
+                ] {
+                    let poseidon = Poseidon2::<F>::load(t, t, mds, diffusion, &ctx).unwrap();
+                    let bench_descr = format!(
+                        "TestSize: 2**{}, Mds::{:?}, Diffusion::{:?}, Width: {}",
+                        test_size_log2, mds, diffusion, t
+                    );
+                    group.bench_function(&bench_descr, |b| {
+                        b.iter(|| {
+                            poseidon2_for_bench::<F>(
+                                &poseidon,
+                                input,
+                                permutation_result_slice,
+                                number_of_states,
+                                &ctx,
+                                black_box(1),
+                            )
+                        })
+                    });
                }
            }
        }
--- a/wrappers/rust/icicle-core/src/poseidon2/tests.rs
+++ b/wrappers/rust/icicle-core/src/poseidon2/tests.rs
@@ -1,27 +1,21 @@
-use crate::poseidon2::{MdsType, PoseidonMode};
+use crate::hash::SpongeHash;
 use crate::traits::FieldImpl;
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::memory::{HostOrDeviceSlice, HostSlice};

-use super::{
-    load_poseidon2_constants, poseidon2_hash_many, DiffusionStrategy, Poseidon2, Poseidon2Config, Poseidon2Constants,
-};
+use super::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Impl};

-pub fn init_poseidon<'a, F: FieldImpl>(
-    width: u32,
-    mds_type: MdsType,
-    diffusion: DiffusionStrategy,
-) -> Poseidon2Constants<'a, F>
+pub fn init_poseidon<F: FieldImpl>(width: usize, mds_type: MdsType, diffusion: DiffusionStrategy) -> Poseidon2<F>
 where
-    <F as FieldImpl>::Config: Poseidon2<F>,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
 {
    let ctx = DeviceContext::default();
-    load_poseidon2_constants::<F>(width, mds_type, diffusion, &ctx).unwrap()
+    Poseidon2::load(width, width, mds_type, diffusion, &ctx).unwrap()
 }

-fn _check_poseidon_hash_many<F: FieldImpl>(width: u32, constants: Poseidon2Constants<F>) -> (F, F)
+fn _check_poseidon_hash_many<F: FieldImpl>(width: usize, poseidon: &Poseidon2<F>) -> (F, F)
 where
-    <F as FieldImpl>::Config: Poseidon2<F>,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
 {
    let test_size = 1 << 10;
    let mut inputs = vec![F::one(); test_size * width as usize];
@@ -30,16 +24,10 @@ where
    let input_slice = HostSlice::from_mut_slice(&mut inputs);
    let output_slice = HostSlice::from_mut_slice(&mut outputs);

-    let config = Poseidon2Config::default();
-    poseidon2_hash_many::<F>(
-        input_slice,
-        output_slice,
-        test_size as u32,
-        width as u32,
-        &constants,
-        &config,
-    )
-    .unwrap();
+    let cfg = poseidon.default_config();
+    poseidon
+        .hash_many(input_slice, output_slice, test_size, width, 1, &cfg)
+        .unwrap();

    let a1 = output_slice[0];
    let a2 = output_slice[output_slice.len() - 2];
@@ -49,21 +37,22 @@ where
    (a1, a2)
 }

-pub fn check_poseidon_hash_many<'a, F: FieldImpl + 'a>()
+pub fn check_poseidon_hash_many<F: FieldImpl>()
 where
-    <F as FieldImpl>::Config: Poseidon2<F>,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
 {
    let widths = [2, 3, 4, 8, 12, 16, 20, 24];
+    let ctx = DeviceContext::default();
    for width in widths {
-        let constants = init_poseidon::<'a, F>(width as u32, MdsType::Default, DiffusionStrategy::Default);
+        let poseidon = Poseidon2::<F>::load(width, width, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();

-        _check_poseidon_hash_many(width, constants);
+        _check_poseidon_hash_many(width, &poseidon);
    }
 }

-pub fn check_poseidon_kats<'a, F: FieldImpl>(width: usize, kats: &[F], constants: &Poseidon2Constants<'a, F>)
+pub fn check_poseidon_kats<F: FieldImpl>(width: usize, kats: &[F], poseidon: &Poseidon2<F>)
 where
-    <F as FieldImpl>::Config: Poseidon2<F>,
+    <F as FieldImpl>::Config: Poseidon2Impl<F>,
 {
    assert_eq!(width, kats.len());

@@ -83,17 +72,11 @@ where
    let input_slice = HostSlice::from_mut_slice(&mut inputs);
    let output_slice = HostSlice::from_mut_slice(&mut outputs);

-    let mut config = Poseidon2Config::default();
-    config.mode = PoseidonMode::Permutation;
-    poseidon2_hash_many::<F>(
-        input_slice,
-        output_slice,
-        batch_size as u32,
-        width as u32,
-        &constants,
-        &config,
-    )
-    .unwrap();
+    let cfg = poseidon.default_config();
+
+    poseidon
+        .hash_many(input_slice, output_slice, batch_size, width, width, &cfg)
+        .unwrap();

    for (i, val) in output_slice
        .iter()
--- a/wrappers/rust/icicle-core/src/tree/mmcs.rs
+++ b/wrappers/rust/icicle-core/src/tree/mmcs.rs
@@ -0,0 +1,79 @@
+use icicle_cuda_runtime::memory::HostSlice;
+
+use crate::{error::IcicleResult, ntt::FieldImpl};
+use crate::{hash::SpongeHash, Matrix};
+
+use super::TreeBuilderConfig;
+
+pub trait FieldMmcs<F, Compression, Hasher>
+where
+    F: FieldImpl,
+    Compression: SpongeHash<F, F>,
+    Hasher: SpongeHash<F, F>,
+{
+    fn mmcs_commit(
+        leaves: Vec<Matrix>,
+        digests: &mut HostSlice<F>,
+        hasher: &Hasher,
+        compression: &Compression,
+        config: &TreeBuilderConfig,
+    ) -> IcicleResult<()>;
+}
+
+#[macro_export]
+macro_rules! impl_mmcs {
+    (
+      $field_prefix:literal,
+      $field_prefix_ident:ident,
+      $field:ident,
+      $field_config:ident,
+      $mmcs:ident
+    ) => {
+        mod $field_prefix_ident {
+            use super::*;
+            use icicle_cuda_runtime::error::CudaError;
+
+            extern "C" {
+                #[link_name = concat!($field_prefix, "_mmcs_commit_cuda")]
+                pub(crate) fn mmcs_commit_cuda(
+                    leaves: *const Matrix,
+                    number_of_inputs: u32,
+                    digests: *mut $field,
+                    hasher: *const c_void,
+                    compression: *const c_void,
+                    config: &TreeBuilderConfig,
+                ) -> CudaError;
+            }
+        }
+
+        struct $mmcs;
+
+        impl<Compression, Hasher> FieldMmcs<$field, Compression, Hasher> for $mmcs
+        where
+            Compression: SpongeHash<$field, $field>,
+            Hasher: SpongeHash<$field, $field>,
+        {
+            fn mmcs_commit( // forwards the matrices and both hasher handles to the `_mmcs_commit_cuda` FFI entry point
+                leaves: Vec<Matrix>,
+                digests: &mut HostSlice<$field>,
+                hasher: &Hasher,
+                compression: &Compression,
+                config: &TreeBuilderConfig,
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::mmcs_commit_cuda(
+                        leaves
+                            .as_slice()
+                            .as_ptr(),
+                        leaves.len() as u32,
+                        digests.as_mut_ptr(),
+                        hasher.get_handle(), // 4th FFI parameter is declared as `hasher` in the extern block
+                        compression.get_handle(), // 5th FFI parameter is declared as `compression`; the two were previously swapped
+                        config,
+                    )
+                    .wrap()
+                }
+            }
+        }
+    };
+}
--- a/Show More
+++ b/Show More