Feat/roman/poseidon2 (#507)

This PR adds support for poseidon2 permutation function as described in https://eprint.iacr.org/2023/323.pdf Reference implementations used (and compared against): https://github.com/HorizenLabs/poseidon2/tree/main https://github.com/Plonky3/Plonky3/tree/main Tasks: - [x] Remove commented code and prints - [ ] Add doc-comments to functions and structs - [x] Fix possible issue with Plonky3 imports - [x] Update NTT/Plonky3 test - [x] Add Plonky3-bn254 test (impossible)
2026-01-06 22:24:06 -05:00 · 2024-05-09 15:13:43 +07:00
parent c30e333819
commit 094683d291
27 changed files with 6070 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 *.so
 *.nsys-rep
 *.ncu-rep
+*.sage.py
 **/target
 **/.vscode
 **/.*lock*csv#
@@ -17,5 +18,3 @@
 **/icicle/build/
 **/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
 **/build*
-**/icicle/appUtils/large_ntt/work
-icicle/appUtils/large_ntt/work/test_ntt
--- a/icicle/include/fields/field.cuh
+++ b/icicle/include/fields/field.cuh
@@ -125,6 +125,17 @@ public:
  struct Wide {
    ff_wide_storage limbs_storage;

+    static constexpr Wide HOST_DEVICE_INLINE from_field(const Field& xs)
+    {
+      Wide out{};
+#ifdef __CUDA_ARCH__
+      UNROLL
+#endif
+      for (unsigned i = 0; i < TLC; i++)
+        out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
+      return out;
+    }
+
    static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
    {
      Field out{};
--- a/icicle/include/poseidon2/constants/babybear_poseidon2.h
+++ b/icicle/include/poseidon2/constants/babybear_poseidon2.h
--- a/icicle/include/poseidon2/constants/bn254_poseidon2.h
+++ b/icicle/include/poseidon2/constants/bn254_poseidon2.h
--- a/icicle/include/poseidon2/constants/poseidon2_rust_params.sage
+++ b/icicle/include/poseidon2/constants/poseidon2_rust_params.sage
@@ -0,0 +1,705 @@
+# Remark: This script contains functionality for GF(2^n), but currently works only over GF(p)! A few small adaptations are needed for GF(2^n).
+from sage.rings.polynomial.polynomial_gf2x import GF2X_BuildIrred_list
+from math import *
+import itertools
+
+CURVE_NAME = "bn254"
+
+###########################################################################
+# p = 18446744069414584321 # GoldiLocks
+# p = 2013265921 # BabyBear
+# p = 52435875175126190479447740508185965837690552500527637822603658699938581184513 # BLS12-381
+p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
+# p = 28948022309329048855892746252171976963363056481941560715954676764349967630337 # Pasta (Pallas)
+# p = 28948022309329048855892746252171976963363056481941647379679742748393362948097 # Pasta (Vesta)
+
+n = len(p.bits()) # bit
+# t = 12 # GoldiLocks (t = 12 for sponge, t = 8 for compression)
+# t = 16 # BabyBear (t = 24 for sponge, t = 16 for compression)
+# t = 3 # BN254/BN256, BLS12-381, Pallas, Vesta (t = 3 for sponge, t = 2 for compression)
+
+TS = [2, 3, 4, 8, 12, 16, 20, 24]
+
+FIELD = 1
+SBOX = 0
+FIELD_SIZE = n
+
+def get_alpha(p):
+    for alpha in range(3, p):
+        if gcd(alpha, p-1) == 1:
+            break
+    return alpha
+
+alpha = get_alpha(p)
+
+def get_sbox_cost(R_F, R_P, N, t):
+    return int(t * R_F + R_P)
+
+def get_size_cost(R_F, R_P, N, t):
+    n = ceil(float(N) / t)
+    return int((N * R_F) + (n * R_P))
+
+def poseidon_calc_final_numbers_fixed(p, t, alpha, M, security_margin):
+    # [Min. S-boxes] Find best possible for t and N
+    n = ceil(log(p, 2))
+    N = int(n * t)
+    cost_function = get_sbox_cost
+    ret_list = []
+    (R_F, R_P) = find_FD_round_numbers(p, t, alpha, M, cost_function, security_margin)
+    min_sbox_cost = cost_function(R_F, R_P, N, t)
+    ret_list.append(R_F)
+    ret_list.append(R_P)
+    ret_list.append(min_sbox_cost)
+
+    # [Min. Size] Find best possible for t and N
+    # Minimum number of S-boxes for fixed n results in minimum size also (round numbers are the same)!
+    min_size_cost = get_size_cost(R_F, R_P, N, t)
+    ret_list.append(min_size_cost)
+
+    return ret_list # [R_F, R_P, min_sbox_cost, min_size_cost]
+
+def find_FD_round_numbers(p, t, alpha, M, cost_function, security_margin):
+    n = ceil(log(p, 2))
+    N = int(n * t)
+
+    sat_inequiv = sat_inequiv_alpha
+
+    R_P = 0
+    R_F = 0
+    min_cost = float("inf")
+    max_cost_rf = 0
+    # Brute-force approach
+    for R_P_t in range(1, 500):
+        for R_F_t in range(4, 100):
+            if R_F_t % 2 == 0:
+                if (sat_inequiv(p, t, R_F_t, R_P_t, alpha, M) == True):
+                    if security_margin == True:
+                        R_F_t += 2
+                        R_P_t = int(ceil(float(R_P_t) * 1.075))
+                    cost = cost_function(R_F_t, R_P_t, N, t)
+                    if (cost < min_cost) or ((cost == min_cost) and (R_F_t < max_cost_rf)):
+                        R_P = ceil(R_P_t)
+                        R_F = ceil(R_F_t)
+                        min_cost = cost
+                        max_cost_rf = R_F
+    return (int(R_F), int(R_P))
+
+def sat_inequiv_alpha(p, t, R_F, R_P, alpha, M):
+    N = int(FIELD_SIZE * NUM_CELLS)
+    
+    if alpha > 0:
+        R_F_1 = 6 if M <= ((floor(log(p, 2) - ((alpha-1)/2.0))) * (t + 1)) else 10 # Statistical
+        R_F_2 = 1 + ceil(log(2, alpha) * min(M, FIELD_SIZE)) + ceil(log(t, alpha)) - R_P # Interpolation
+        R_F_3 = (log(2, alpha) * min(M, log(p, 2))) - R_P # Groebner 1
+        R_F_4 = t - 1 + log(2, alpha) * min(M / float(t + 1), log(p, 2) / float(2)) - R_P # Groebner 2
+        R_F_5 = (t - 2 + (M / float(2 * log(alpha, 2))) - R_P) / float(t - 1) # Groebner 3
+        R_F_max = max(ceil(R_F_1), ceil(R_F_2), ceil(R_F_3), ceil(R_F_4), ceil(R_F_5))
+        
+        # Addition due to https://eprint.iacr.org/2023/537.pdf
+        r_temp = floor(t / 3.0)
+        over = (R_F - 1) * t + R_P + r_temp + r_temp * (R_F / 2.0) + R_P + alpha
+        under = r_temp * (R_F / 2.0) + R_P + alpha
+        binom_log = log(binomial(over, under), 2)
+        if binom_log == inf:
+            binom_log = M + 1
+        cost_gb4 = ceil(2 * binom_log) # Paper uses 2.3727, we are more conservative here
+
+        return ((R_F >= R_F_max) and (cost_gb4 >= M))
+    else:
+        print("Invalid value for alpha!")
+        exit(1)
+
+def grain_sr_generator():
+    bit_sequence = INIT_SEQUENCE
+    for _ in range(0, 160):
+        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
+        bit_sequence.pop(0)
+        bit_sequence.append(new_bit)
+
+    while True:
+        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
+        bit_sequence.pop(0)
+        bit_sequence.append(new_bit)
+        while new_bit == 0:
+            new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
+            bit_sequence.pop(0)
+            bit_sequence.append(new_bit)
+            new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
+            bit_sequence.pop(0)
+            bit_sequence.append(new_bit)
+        new_bit = bit_sequence[62] ^^ bit_sequence[51] ^^ bit_sequence[38] ^^ bit_sequence[23] ^^ bit_sequence[13] ^^ bit_sequence[0]
+        bit_sequence.pop(0)
+        bit_sequence.append(new_bit)
+        yield new_bit
+
+def grain_random_bits(num_bits):
+    random_bits = [next(grain_gen) for i in range(0, num_bits)]
+    # random_bits.reverse() ## Remove comment to start from least significant bit
+    random_int = int("".join(str(i) for i in random_bits), 2)
+    return random_int
+
+def init_generator(field, sbox, n, t, R_F, R_P):
+    # Generate initial sequence based on parameters
+    bit_list_field = [_ for _ in (bin(FIELD)[2:].zfill(2))]
+    bit_list_sbox = [_ for _ in (bin(SBOX)[2:].zfill(4))]
+    bit_list_n = [_ for _ in (bin(FIELD_SIZE)[2:].zfill(12))]
+    bit_list_t = [_ for _ in (bin(NUM_CELLS)[2:].zfill(12))]
+    bit_list_R_F = [_ for _ in (bin(R_F)[2:].zfill(10))]
+    bit_list_R_P = [_ for _ in (bin(R_P)[2:].zfill(10))]
+    bit_list_1 = [1] * 30
+    global INIT_SEQUENCE
+    INIT_SEQUENCE = bit_list_field + bit_list_sbox + bit_list_n + bit_list_t + bit_list_R_F + bit_list_R_P + bit_list_1
+    INIT_SEQUENCE = [int(_) for _ in INIT_SEQUENCE]
+
+def generate_constants(field, n, t, R_F, R_P, prime_number):
+    round_constants = []
+    # num_constants = (R_F + R_P) * t # Poseidon
+    num_constants = (R_F * t) + R_P # Poseidon2
+
+    if field == 0:
+        for i in range(0, num_constants):
+            random_int = grain_random_bits(n)
+            round_constants.append(random_int)
+    elif field == 1:
+        for i in range(0, num_constants):
+            random_int = grain_random_bits(n)
+            while random_int >= prime_number:
+                # print("[Info] Round constant is not in prime field! Taking next one.")
+                random_int = grain_random_bits(n)
+            round_constants.append(random_int)
+            # Add (t-1) zeroes for Poseidon2 if partial round
+            if i >= ((R_F/2) * t) and i < (((R_F/2) * t) + R_P):
+                round_constants.extend([0] * (t-1))
+    return round_constants
+
+def print_round_constants(round_constants, n, field):
+    print("Number of round constants:", len(round_constants))
+
+    if field == 0:
+        print("Round constants for GF(2^n):")
+    elif field == 1:
+        print("Round constants for GF(p):")
+    hex_length = int(ceil(float(n) / 4)) + 2 # +2 for "0x"
+    print(["{0:#0{1}x}".format(entry, hex_length) for entry in round_constants])
+
+def create_mds_p(n, t):
+    M = matrix(F, t, t)
+
+    # Sample random distinct indices and assign to xs and ys
+    while True:
+        flag = True
+        rand_list = [F(grain_random_bits(n)) for _ in range(0, 2*t)]
+        while len(rand_list) != len(set(rand_list)): # Check for duplicates
+            rand_list = [F(grain_random_bits(n)) for _ in range(0, 2*t)]
+        xs = rand_list[:t]
+        ys = rand_list[t:]
+        # xs = [F(ele) for ele in range(0, t)]
+        # ys = [F(ele) for ele in range(t, 2*t)]
+        for i in range(0, t):
+            for j in range(0, t):
+                if (flag == False) or ((xs[i] + ys[j]) == 0):
+                    flag = False
+                else:
+                    entry = (xs[i] + ys[j])^(-1)
+                    M[i, j] = entry
+        if flag == False:
+            continue
+        return M
+
+def generate_vectorspace(round_num, M, M_round, NUM_CELLS):
+    t = NUM_CELLS
+    s = 1
+    V = VectorSpace(F, t)
+    if round_num == 0:
+        return V
+    elif round_num == 1:
+        return V.subspace(V.basis()[s:])
+    else:
+        mat_temp = matrix(F)
+        for i in range(0, round_num-1):
+            add_rows = []
+            for j in range(0, s):
+                add_rows.append(M_round[i].rows()[j][s:])
+            mat_temp = matrix(mat_temp.rows() + add_rows)
+        r_k = mat_temp.right_kernel()
+        extended_basis_vectors = []
+        for vec in r_k.basis():
+            extended_basis_vectors.append(vector([0]*s + list(vec)))
+        S = V.subspace(extended_basis_vectors)
+
+        return S
+
+def subspace_times_matrix(subspace, M, NUM_CELLS):
+    t = NUM_CELLS
+    V = VectorSpace(F, t)
+    subspace_basis = subspace.basis()
+    new_basis = []
+    for vec in subspace_basis:
+        new_basis.append(M * vec)
+    new_subspace = V.subspace(new_basis)
+    return new_subspace
+
+# Returns True if the matrix is considered secure, False otherwise
+def algorithm_1(M, NUM_CELLS):
+    t = NUM_CELLS
+    s = 1
+    r = floor((t - s) / float(s))
+
+    # Generate round matrices
+    M_round = []
+    for j in range(0, t+1):
+        M_round.append(M^(j+1))
+
+    for i in range(1, r+1):
+        mat_test = M^i
+        entry = mat_test[0, 0]
+        mat_target = matrix.circulant(vector([entry] + ([F(0)] * (t-1))))
+
+        if (mat_test - mat_target) == matrix.circulant(vector([F(0)] * (t))):
+            return [False, 1]
+
+        S = generate_vectorspace(i, M, M_round, t)
+        V = VectorSpace(F, t)
+
+        basis_vectors= []
+        for eigenspace in mat_test.eigenspaces_right(format='galois'):
+            if (eigenspace[0] not in F):
+                continue
+            vector_subspace = eigenspace[1]
+            intersection = S.intersection(vector_subspace)
+            basis_vectors += intersection.basis()
+        IS = V.subspace(basis_vectors)
+
+        if IS.dimension() >= 1 and IS != V:
+            return [False, 2]
+        for j in range(1, i+1):
+            S_mat_mul = subspace_times_matrix(S, M^j, t)
+            if S == S_mat_mul:
+                print("S.basis():\n", S.basis())
+                return [False, 3]
+
+    return [True, 0]
+
+# Returns True if the matrix is considered secure, False otherwise
+def algorithm_2(M, NUM_CELLS):
+    t = NUM_CELLS
+    s = 1
+
+    V = VectorSpace(F, t)
+    trail = [None, None]
+    test_next = False
+    I = range(0, s)
+    I_powerset = list(sage.misc.misc.powerset(I))[1:]
+    for I_s in I_powerset:
+        test_next = False
+        new_basis = []
+        for l in I_s:
+            new_basis.append(V.basis()[l])
+        IS = V.subspace(new_basis)
+        for i in range(s, t):
+            new_basis.append(V.basis()[i])
+        full_iota_space = V.subspace(new_basis)
+        for l in I_s:
+            v = V.basis()[l]
+            while True:
+                delta = IS.dimension()
+                v = M * v
+                IS = V.subspace(IS.basis() + [v])
+                if IS.dimension() == t or IS.intersection(full_iota_space) != IS:
+                    test_next = True
+                    break
+                if IS.dimension() <= delta:
+                    break
+            if test_next == True:
+                break
+        if test_next == True:
+            continue
+        return [False, [IS, I_s]]
+
+    return [True, None]
+
+# Returns True if the matrix is considered secure, False otherwise
+def algorithm_3(M, NUM_CELLS):
+    t = NUM_CELLS
+    s = 1
+
+    V = VectorSpace(F, t)
+
+    l = 4*t
+    for r in range(2, l+1):
+        next_r = False
+        res_alg_2 = algorithm_2(M^r, t)
+        if res_alg_2[0] == False:
+            return [False, None]
+
+        # if res_alg_2[1] == None:
+        #     continue
+        # IS = res_alg_2[1][0]
+        # I_s = res_alg_2[1][1]
+        # for j in range(1, r):
+        #     IS = subspace_times_matrix(IS, M, t)
+        #     I_j = []
+        #     for i in range(0, s):
+        #         new_basis = []
+        #         for k in range(0, t):
+        #             if k != i:
+        #                 new_basis.append(V.basis()[k])
+        #         iota_space = V.subspace(new_basis)
+        #         if IS.intersection(iota_space) != iota_space:
+        #             single_iota_space = V.subspace([V.basis()[i]])
+        #             if IS.intersection(single_iota_space) == single_iota_space:
+        #                 I_j.append(i)
+        #             else:
+        #                 next_r = True
+        #                 break
+        #     if next_r == True:
+        #         break
+        # if next_r == True:
+        #     continue
+        # return [False, [IS, I_j, r]]
+
+    return [True, None]
+
+def check_minpoly_condition(M, NUM_CELLS):
+    max_period = 2*NUM_CELLS
+    all_fulfilled = True
+    M_temp = M
+    for i in range(1, max_period + 1):
+        if not ((M_temp.minimal_polynomial().degree() == NUM_CELLS) and (M_temp.minimal_polynomial().is_irreducible() == True)):
+            all_fulfilled = False
+            break
+        M_temp = M * M_temp
+    return all_fulfilled
+
+def generate_matrix(FIELD, FIELD_SIZE, NUM_CELLS):
+    if FIELD == 0:
+        print("Matrix generation not implemented for GF(2^n).")
+        exit(1)
+    elif FIELD == 1:
+        mds_matrix = create_mds_p(FIELD_SIZE, NUM_CELLS)
+        result_1 = algorithm_1(mds_matrix, NUM_CELLS)
+        result_2 = algorithm_2(mds_matrix, NUM_CELLS)
+        result_3 = algorithm_3(mds_matrix, NUM_CELLS)
+        while result_1[0] == False or result_2[0] == False or result_3[0] == False:
+            mds_matrix = create_mds_p(FIELD_SIZE, NUM_CELLS)
+            result_1 = algorithm_1(mds_matrix, NUM_CELLS)
+            result_2 = algorithm_2(mds_matrix, NUM_CELLS)
+            result_3 = algorithm_3(mds_matrix, NUM_CELLS)
+        return mds_matrix
+
+def generate_matrix_full(NUM_CELLS):
+    M = None
+    if t == 2:
+        M = matrix.circulant(vector([F(2), F(1)]))
+    elif t == 3:
+        M = matrix.circulant(vector([F(2), F(1), F(1)]))
+    elif t == 4:
+        M = matrix(F, [[F(5), F(7), F(1), F(3)], [F(4), F(6), F(1), F(1)], [F(1), F(3), F(5), F(7)], [F(1), F(1), F(4), F(6)]])
+    elif (t % 4) == 0:
+        M = matrix(F, t, t)
+        # M_small = matrix.circulant(vector([F(3), F(2), F(1), F(1)]))
+        M_small = matrix(F, [[F(5), F(7), F(1), F(3)], [F(4), F(6), F(1), F(1)], [F(1), F(3), F(5), F(7)], [F(1), F(1), F(4), F(6)]])
+        small_num = t // 4
+        for i in range(0, small_num):
+            for j in range(0, small_num):
+                if i == j:
+                    M[i*4:(i+1)*4,j*4:(j+1)*4] = 2* M_small
+                else:
+                    M[i*4:(i+1)*4,j*4:(j+1)*4] = M_small
+    else:
+        print("Error: No matrix for these parameters.")
+        exit()
+    return M
+
+def generate_matrix_partial(FIELD, FIELD_SIZE, NUM_CELLS): ## TODO: Prioritize small entries
+    entry_max_bit_size = FIELD_SIZE
+    if FIELD == 0:
+        print("Matrix generation not implemented for GF(2^n).")
+        exit(1)
+    elif FIELD == 1:
+        M = None
+        if t == 2:
+            M = matrix(F, [[F(2), F(1)], [F(1), F(3)]])
+        elif t == 3:
+            M = matrix(F, [[F(2), F(1), F(1)], [F(1), F(2), F(1)], [F(1), F(1), F(3)]])
+        else:
+            M_circulant = matrix.circulant(vector([F(0)] + [F(1) for _ in range(0, NUM_CELLS - 1)]))
+            M_diagonal = matrix.diagonal([F(grain_random_bits(entry_max_bit_size)) for _ in range(0, NUM_CELLS)])
+            M = M_circulant + M_diagonal
+            # while algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False:
+            while check_minpoly_condition(M, NUM_CELLS) == False:
+                M_diagonal = matrix.diagonal([F(grain_random_bits(entry_max_bit_size)) for _ in range(0, NUM_CELLS)])
+                M = M_circulant + M_diagonal
+        
+        if(algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False):
+            print("Error: Generated partial matrix is not secure w.r.t. subspace trails.")
+            exit()
+        return M
+
+def generate_matrix_partial_small_entries(FIELD, FIELD_SIZE, NUM_CELLS):
+    if FIELD == 0:
+        print("Matrix generation not implemented for GF(2^n).")
+        exit(1)
+    elif FIELD == 1:
+        M_circulant = matrix.circulant(vector([F(0)] + [F(1) for _ in range(0, NUM_CELLS - 1)]))
+        combinations = list(itertools.product(range(2, 6), repeat=NUM_CELLS))
+        for entry in combinations:
+            M = M_circulant + matrix.diagonal(vector(F, list(entry)))
+            print(M)
+            # if M.is_invertible() == False or algorithm_1(M, NUM_CELLS)[0] == False or algorithm_2(M, NUM_CELLS)[0] == False or algorithm_3(M, NUM_CELLS)[0] == False:
+            if M.is_invertible() == False or check_minpoly_condition(M, NUM_CELLS) == False:
+                continue
+            return M
+
+def matrix_partial_m_1(matrix_partial, NUM_CELLS):
+    M_circulant = matrix.identity(F, NUM_CELLS)
+    return matrix_partial - M_circulant
+
+def print_linear_layer(M, n, t):
+    print("n:", n)
+    print("t:", t)
+    print("N:", (n * t))
+    print("Result Algorithm 1:\n", algorithm_1(M, NUM_CELLS))
+    print("Result Algorithm 2:\n", algorithm_2(M, NUM_CELLS))
+    print("Result Algorithm 3:\n", algorithm_3(M, NUM_CELLS))
+    hex_length = int(ceil(float(n) / 4)) + 2 # +2 for "0x"
+    print("Prime number:", "0x" + hex(PRIME_NUMBER))
+    matrix_string = "["
+    for i in range(0, t):
+        matrix_string += str(["{0:#0{1}x}".format(int(entry), hex_length) for entry in M[i]])
+        if i < (t-1):
+            matrix_string += ","
+    matrix_string += "]"
+    print("MDS matrix:\n", matrix_string)
+
+def calc_equivalent_matrices(MDS_matrix_field):
+    # Following idea: Split M into M' * M'', where M'' is "cheap" and M' can move before the partial nonlinear layer
+    # The "previous" matrix layer is then M * M'. Due to the construction of M', the M[0,0] and v values will be the same for the new M' (and I also, obviously)
+    # Thus: Compute the matrices, store the w_hat and v_hat values
+
+    MDS_matrix_field_transpose = MDS_matrix_field.transpose()
+
+    w_hat_collection = []
+    v_collection = []
+    v = MDS_matrix_field_transpose[[0], list(range(1,t))]
+
+    M_mul = MDS_matrix_field_transpose
+    M_i = matrix(F, t, t)
+    for i in range(R_P_FIXED - 1, -1, -1):
+        M_hat = M_mul[list(range(1,t)), list(range(1,t))]
+        w = M_mul[list(range(1,t)), [0]]
+        v = M_mul[[0], list(range(1,t))]
+        v_collection.append(v.list())
+        w_hat = M_hat.inverse() * w
+        w_hat_collection.append(w_hat.list())
+
+        # Generate new M_i, and multiplication M * M_i for "previous" round
+        M_i = matrix.identity(t)
+        M_i[list(range(1,t)), list(range(1,t))] = M_hat
+        M_mul = MDS_matrix_field_transpose * M_i
+
+    return M_i, v_collection, w_hat_collection, MDS_matrix_field_transpose[0, 0]
+
+def calc_equivalent_constants(constants, MDS_matrix_field):
+    constants_temp = [constants[index:index+t] for index in range(0, len(constants), t)]
+
+    MDS_matrix_field_transpose = MDS_matrix_field.transpose()
+
+    # Start moving round constants up
+    # Calculate c_i' = M^(-1) * c_(i+1)
+    # Split c_i': Add c_i'[0] AFTER the S-box, add the rest to c_i
+    # I.e.: Store c_i'[0] for each of the partial rounds, and make c_i = c_i + c_i' (where now c_i'[0] = 0)
+    num_rounds = R_F_FIXED + R_P_FIXED
+    R_f = R_F_FIXED / 2
+    for i in range(num_rounds - 2 - R_f, R_f - 1, -1):
+        inv_cip1 = list(vector(constants_temp[i+1]) * MDS_matrix_field_transpose.inverse())
+        constants_temp[i] = list(vector(constants_temp[i]) + vector([0] + inv_cip1[1:]))
+        constants_temp[i+1] = [inv_cip1[0]] + [0] * (t-1)
+
+    return constants_temp
+
+def poseidon(input_words, matrix, round_constants):
+
+    R_f = int(R_F_FIXED / 2)
+
+    round_constants_counter = 0
+
+    state_words = list(input_words)
+
+    # First full rounds
+    for r in range(0, R_f):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        for i in range(0, t):
+            state_words[i] = (state_words[i])^alpha
+        state_words = list(matrix * vector(state_words))
+
+    # Middle partial rounds
+    for r in range(0, R_P_FIXED):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        state_words[0] = (state_words[0])^alpha
+        state_words = list(matrix * vector(state_words))
+
+    # Last full rounds
+    for r in range(0, R_f):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        for i in range(0, t):
+            state_words[i] = (state_words[i])^alpha
+        state_words = list(matrix * vector(state_words))
+
+    return state_words
+
+def poseidon2(input_words, matrix_full, matrix_partial, round_constants):
+
+    R_f = int(R_F_FIXED / 2)
+
+    round_constants_counter = 0
+
+    state_words = list(input_words)
+
+    # First matrix mul
+    state_words = list(matrix_full * vector(state_words))
+
+    # First full rounds
+    for r in range(0, R_f):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        for i in range(0, t):
+            state_words[i] = (state_words[i])^alpha
+        state_words = list(matrix_full * vector(state_words))
+
+    # Middle partial rounds
+    for r in range(0, R_P_FIXED):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        state_words[0] = (state_words[0])^alpha
+        state_words = list(matrix_partial * vector(state_words))
+
+    # Last full rounds
+    for r in range(0, R_f):
+        # Round constants, nonlinear layer, matrix multiplication
+        for i in range(0, t):
+            state_words[i] = state_words[i] + round_constants[round_constants_counter]
+            round_constants_counter += 1
+        for i in range(0, t):
+            state_words[i] = (state_words[i])^alpha
+        state_words = list(matrix_full * vector(state_words))
+
+    return state_words
+
+def to_bytes(value, indent):
+    l = len(hex(p - 1))
+    if l % 2 == 1:
+        l = l + 1
+    value = hex(int(value))[2:]
+    value = value.zfill(l - 2)
+    value_bytes = reversed(["0x" + value[i:i + 2] for i in range(0, len(value), 2)])
+    print(" " * indent +  ", ".join(value_bytes) + ",")
+
+print("#pragma once")
+print(f"#ifndef {CURVE_NAME.upper()}_POSEIDON2_H")
+print(f"#define {CURVE_NAME.upper()}_POSEIDON2_H")
+print()
+print(f"namespace poseidon2_constants_{CURVE_NAME} {{")
+
+for t in TS:
+    NUM_CELLS = t
+    R_F_FIXED, R_P_FIXED, _, _ = poseidon_calc_final_numbers_fixed(p, t, alpha, 128, True)
+
+    INIT_SEQUENCE = []
+
+    PRIME_NUMBER = p
+
+    F = GF(PRIME_NUMBER)
+
+    grain_gen = grain_sr_generator()
+
+    # Init
+    init_generator(FIELD, SBOX, FIELD_SIZE, NUM_CELLS, R_F_FIXED, R_P_FIXED)
+
+    # Round constants
+    round_constants = generate_constants(FIELD, FIELD_SIZE, NUM_CELLS, R_F_FIXED, R_P_FIXED, PRIME_NUMBER)
+    # print_round_constants(round_constants, FIELD_SIZE, FIELD)
+
+    # Matrix
+    # MDS = generate_matrix(FIELD, FIELD_SIZE, NUM_CELLS)
+    MATRIX_FULL = generate_matrix_full(NUM_CELLS)
+    MATRIX_PARTIAL = generate_matrix_partial(FIELD, FIELD_SIZE, NUM_CELLS)
+    MATRIX_PARTIAL_DIAGONAL_M_1 = [matrix_partial_m_1(MATRIX_PARTIAL, NUM_CELLS)[i,i] for i in range(0, NUM_CELLS)]
+
+    print()
+    print(f"  namespace t{t} {{")
+    print(f"    int internal_rounds = {R_P_FIXED};")
+    print()
+    print(f"    int alpha = {alpha};")
+    print()
+
+    # # MDS
+    # print("pub static ref MDS{}: Vec<Vec<Scalar>> = vec![".format(t))
+    # for vec in MDS:
+    #     print("vec![", end="")
+    #     for val in vec:
+    #         to_hex(val)
+    #     print("],")
+    # print("];")
+    # print()
+
+    # Efficient partial matrix (diagonal - 1)
+    print("    unsigned char mat_diag_m_1[] = {")
+    for val in MATRIX_PARTIAL_DIAGONAL_M_1:
+        to_bytes(val, 6)
+    print("    };")
+    print()
+
+    # # Efficient partial matrix (full)
+    # print("    unsigned char mat_internal[] = {")
+    # for vec in MATRIX_PARTIAL:
+    #     for val in vec:
+    #         to_bytes(val, 6)
+    #     print()
+    # print("    };")
+    # print()
+
+    # Round constants
+    print("    unsigned char round_constants[] = {")
+    for (i,val) in enumerate(round_constants):
+        if (i % t == 0 or (i < (R_F_FIXED / 2) * t) or (i > (R_F_FIXED / 2 + R_P_FIXED) * t)):
+            to_bytes(val, 6)
+    print("    };")
+    print("  }")
+    print()
+
+    # state_in  = vector([F(i) for i in range(t)])
+    # # state_out = poseidon(state_in, MDS, round_constants)
+    # state_out = poseidon2(state_in, MATRIX_FULL, MATRIX_PARTIAL, round_constants)
+
+    # for (i,val) in enumerate(state_in):
+    #     if i % t == 0:
+    #         print("vec![", end="")
+    #     to_bytes(val)
+    #     if i % t == t - 1:
+    #         print("],")
+    # print("];")
+
+    # for (i,val) in enumerate(state_out):
+    #     if i % t == 0:
+    #         print("vec![", end="")
+    #     to_bytes(val)
+    #     if i % t == t - 1:
+    #         print("],")
+    # print("];")
+
+print("}")
+print("#endif")
--- a/icicle/include/poseidon2/poseidon2.cuh
+++ b/icicle/include/poseidon2/poseidon2.cuh
@@ -0,0 +1,130 @@
+#pragma once
+#ifndef POSEIDON2_H
+#define POSEIDON2_H
+
+#include <cstdint>
+#include <stdexcept>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "utils/utils.h"
+
+/**
+ * @namespace poseidon2
+ * Implementation of the [Poseidon2 hash function](https://eprint.iacr.org/2019/458.pdf)
+ * Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
+ */
+namespace poseidon2 {
+  /**
+   * For most of the Poseidon2 configurations this is the case
+   */
+  const int EXTERNAL_ROUNDS_DEFAULT = 8;
+
+  enum DiffusionStrategy {
+    DEFAULT_DIFFUSION,
+    MONTGOMERY,
+  };
+
+  enum MdsType { DEFAULT_MDS, PLONKY };
+
+  enum PoseidonMode {
+    COMPRESSION,
+    PERMUTATION,
+  };
+
+  /**
+   * @struct Poseidon2Constants
+   * This constants are enough to define a Poseidon2 instantce
+   * @param round_constants A pointer to round constants allocated on the device
+   * @param mds_matrix A pointer to an mds matrix allocated on the device
+   * @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
+   * @param sparse_matrices A pointer to sparse matrices allocated on the device
+   */
+  template <typename S>
+  struct Poseidon2Constants {
+    int width;
+    int alpha;
+    int internal_rounds;
+    int external_rounds;
+    S* round_constants = nullptr;
+    S* internal_matrix_diag = nullptr;
+    MdsType mds_type;
+    DiffusionStrategy diffusion;
+  };
+
+  /**
+   * @struct Poseidon2Config
+   * Struct that encodes various Poseidon2 parameters.
+   */
+  struct Poseidon2Config {
+    device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
+    bool are_states_on_device;  /**< True if inputs are on device and false if they're on host. Default value: false. */
+    bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
+    PoseidonMode mode;
+    int output_index;
+    bool loop_state; /**< If true, hash results will also be copied in the input pointer in aligned format */
+    bool
+      is_async; /**< Whether to run the Poseidon2 asynchronously. If set to `true`, the poseidon_hash function will be
+                 *   non-blocking and you'd need to synchronize it explicitly by running
+                 *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the poseidon_hash
+                 *   function will block the current CPU thread. */
+  };
+
+  static Poseidon2Config default_poseidon2_config(
+    int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
+  {
+    Poseidon2Config config = {
+      ctx,   // ctx
+      false, // are_states_on_device
+      false, // are_outputs_on_device
+      PoseidonMode::COMPRESSION,
+      1,     // output_index
+      false, // loop_state
+      false, // is_async
+    };
+    return config;
+  }
+
+  template <typename S>
+  cudaError_t create_optimized_poseidon2_constants(
+    int width,
+    int alpha,
+    int internal_rounds,
+    int external_rounds,
+    const S* round_constants,
+    const S* internal_matrix_diag,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* poseidon_constants);
+
+  /**
+   * Loads pre-calculated optimized constants, moves them to the device
+   */
+  template <typename S>
+  cudaError_t init_optimized_poseidon2_constants(
+    int width,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* constants);
+
+  /**
+   * Compute the poseidon hash over a sequence of preimages.
+   * Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
+   * @param T size of the poseidon state, should be equal to {arity + 1}
+   * @param states a pointer to the input data. May be allocated on device or on host, regulated
+   * by the config. May point to a string of preimages or a string of states filled with preimages.
+   * @param output a pointer to the output data. May be allocated on device or on host, regulated
+   * by the config. Must be at least of size [number_of_states](@ref number_of_states)
+   * @param number_of_states number of input blocks of size T-1 (arity)
+   */
+  template <typename S, int T>
+  cudaError_t poseidon2_hash(
+    S* states,
+    S* output,
+    size_t number_of_states,
+    const Poseidon2Constants<S>& constants,
+    const Poseidon2Config& config);
+} // namespace poseidon2
+
+#endif
--- a/icicle/src/fields/CMakeLists.txt
+++ b/icicle/src/fields/CMakeLists.txt
@@ -3,6 +3,7 @@ if (EXT_FIELD)
 endif ()

 SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin)
+SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252)

 set(TARGET icicle_field)

@@ -27,6 +28,10 @@ if (DEFINED CURVE)
  list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
 endif()

+if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_POSEIDON2)
+  list(APPEND FIELD_SOURCE ${SRC}/poseidon2/extern.cu)
+endif()
+
 if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
  list(APPEND FIELD_SOURCE ${SRC}/ntt/extern.cu)
  list(APPEND FIELD_SOURCE ${SRC}/ntt/kernel_ntt.cu)
--- a/icicle/src/poseidon2/.gitignore
+++ b/icicle/src/poseidon2/.gitignore
@@ -0,0 +1,2 @@
+test_poseidon
+test_poseidon_release
--- a/icicle/src/poseidon2/Makefile
+++ b/icicle/src/poseidon2/Makefile
@@ -0,0 +1,7 @@
+test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
+	nvcc -o test_poseidon -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 -DDEVMODE -DDEBUG extern.cu test.cu
+	./test_poseidon
+
+test_poseidon_release: test.cu poseidon.cu kernels.cu constants.cu
+	nvcc -o test_poseidon_release -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 extern.cu test.cu
+	./test_poseidon_release
--- a/icicle/src/poseidon2/constants.cu
+++ b/icicle/src/poseidon2/constants.cu
@@ -0,0 +1,120 @@
+#include "poseidon2/poseidon2.cuh"
+
+/// These are pre-calculated constants for different curves
+#include "fields/id.h"
+#if FIELD_ID == BN254
+#include "poseidon2/constants/bn254_poseidon2.h"
+using namespace poseidon2_constants_bn254;
+#elif FIELD_ID == BLS12_381
+#include "poseidon2/constants/bls12_381_poseidon2.h"
+using namespace poseidon2_constants_bls12_381;
+#elif FIELD_ID == BLS12_377
+#include "poseidon2/constants/bls12_377_poseidon2.h"
+using namespace poseidon2_constants_bls12_377;
+#elif FIELD_ID == BW6_761
+#include "poseidon2/constants/bw6_761_poseidon2.h"
+using namespace poseidon2_constants_bw6_761;
+#elif FIELD_ID == GRUMPKIN
+#include "poseidon2/constants/grumpkin_poseidon2.h"
+using namespace poseidon2_constants_grumpkin;
+#elif FIELD_ID == BABY_BEAR
+#include "poseidon2/constants/babybear_poseidon2.h"
+using namespace poseidon2_constants_babybear;
+#endif
+
+namespace poseidon2 {
+  template <typename S>
+  cudaError_t create_optimized_poseidon2_constants(
+    int width,
+    int alpha,
+    int internal_rounds,
+    int external_rounds,
+    const S* round_constants,
+    const S* internal_matrix_diag,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* poseidon_constants)
+  {
+    cudaFree(nullptr); // Temporary solution
+    if (!(alpha == 3 || alpha == 5 || alpha == 7 || alpha == 11)) {
+      THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid alpha value");
+    }
+    if (external_rounds % 2) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid external rounds"); }
+
+    CHK_INIT_IF_RETURN();
+    cudaStream_t& stream = ctx.stream;
+
+    int round_constants_len = width * (external_rounds) + internal_rounds;
+    int internal_matrix_len = width;
+
+    // Malloc memory for copying round constants and internal matrix
+    S* d_constants;
+    CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * (internal_matrix_len + round_constants_len), stream));
+
+    S* d_internal_matrix = d_constants;
+    S* d_round_constants = d_constants + internal_matrix_len;
+
+    // Copy internal matrix
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      d_internal_matrix, internal_matrix_diag, sizeof(S) * internal_matrix_len, cudaMemcpyHostToDevice, stream));
+    // Copy round constants
+    CHK_IF_RETURN(cudaMemcpyAsync(
+      d_round_constants, round_constants, sizeof(S) * round_constants_len, cudaMemcpyHostToDevice, stream));
+
+    // Make sure all the constants have been copied
+    CHK_IF_RETURN(cudaStreamSynchronize(stream));
+    *poseidon_constants = {width,    alpha,    internal_rounds, external_rounds, d_round_constants, d_internal_matrix,
+                           mds_type, diffusion};
+
+    return CHK_LAST();
+  }
+
+  template <typename S>
+  cudaError_t init_optimized_poseidon2_constants(
+    int width,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<S>* poseidon2_constants)
+  {
+    cudaFree(nullptr); // Temporary solution
+    CHK_INIT_IF_RETURN();
+
+#define P2_CONSTANTS_DEF(width)                                                                                        \
+  case width:                                                                                                          \
+    internal_rounds = t##width::internal_rounds;                                                                       \
+    round_constants = t##width::round_constants;                                                                       \
+    internal_matrix = t##width::mat_diag_m_1;                                                                          \
+    alpha = t##width::alpha;                                                                                           \
+    break;
+
+    int alpha;
+    int external_rounds = EXTERNAL_ROUNDS_DEFAULT;
+    int internal_rounds;
+    unsigned char* round_constants;
+    unsigned char* internal_matrix;
+    switch (width) {
+      P2_CONSTANTS_DEF(2)
+      P2_CONSTANTS_DEF(3)
+      P2_CONSTANTS_DEF(4)
+      P2_CONSTANTS_DEF(8)
+      P2_CONSTANTS_DEF(12)
+      P2_CONSTANTS_DEF(16)
+      P2_CONSTANTS_DEF(20)
+      P2_CONSTANTS_DEF(24)
+    default:
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument,
+        "init_optimized_poseidon2_constants: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+    }
+    S* h_round_constants = reinterpret_cast<S*>(round_constants);
+    S* h_internal_matrix = reinterpret_cast<S*>(internal_matrix);
+
+    create_optimized_poseidon2_constants(
+      width, alpha, internal_rounds, external_rounds, h_round_constants, h_internal_matrix, mds_type, diffusion, ctx,
+      poseidon2_constants);
+
+    return CHK_LAST();
+  }
+} // namespace poseidon2
--- a/icicle/src/poseidon2/extern.cu
+++ b/icicle/src/poseidon2/extern.cu
@@ -0,0 +1,63 @@
+#include "utils/utils.h"
+
+#include "fields/field_config.cuh"
+using namespace field_config;
+
+#include "poseidon.cu"
+
+namespace poseidon2 {
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon2_constants_cuda)(
+    int width,
+    int alpha,
+    int internal_rounds,
+    int external_rounds,
+    const scalar_t* round_constants,
+    const scalar_t* internal_matrix_diag,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<scalar_t>* poseidon_constants)
+  {
+    return create_optimized_poseidon2_constants<scalar_t>(
+      width, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type, diffusion, ctx,
+      poseidon_constants);
+  }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon2_constants_cuda)(
+    int width,
+    MdsType mds_type,
+    DiffusionStrategy diffusion,
+    device_context::DeviceContext& ctx,
+    Poseidon2Constants<scalar_t>* constants)
+  {
+    return init_optimized_poseidon2_constants<scalar_t>(width, mds_type, diffusion, ctx, constants);
+  }
+
+  extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_hash_cuda)(
+    scalar_t* input,
+    scalar_t* output,
+    int number_of_states,
+    int width,
+    const Poseidon2Constants<scalar_t>* constants,
+    Poseidon2Config* config)
+  {
+#define P2_HASH_T(width)                                                                                               \
+  case width:                                                                                                          \
+    return poseidon2_hash<scalar_t, width>(input, output, number_of_states, *constants, *config);
+
+    switch (width) {
+      P2_HASH_T(2)
+      P2_HASH_T(3)
+      P2_HASH_T(4)
+      P2_HASH_T(8)
+      P2_HASH_T(12)
+      P2_HASH_T(16)
+      P2_HASH_T(20)
+      P2_HASH_T(24)
+    default:
+      THROW_ICICLE_ERR(
+        IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
+    }
+    return CHK_LAST();
+  }
+} // namespace poseidon2
--- a/icicle/src/poseidon2/kernels.cu
+++ b/icicle/src/poseidon2/kernels.cu
@@ -0,0 +1,233 @@
+#include "poseidon/poseidon.cuh"
+#include "gpu-utils/modifiers.cuh"
+
+namespace poseidon2 {
+  template <typename S>
+  DEVICE_INLINE S sbox_el(S element, const int alpha)
+  {
+    S result2 = S::sqr(element);
+    switch (alpha) {
+    case 3:
+      return result2 * element;
+    case 5:
+      return S::sqr(result2) * element;
+    case 7:
+      return S::sqr(result2) * result2 * element;
+    case 11:
+      S result8 = S::sqr(S::sqr(result2));
+      return result8 * result2 * element;
+    }
+  }
+
+  template <typename S, int T>
+  DEVICE_INLINE S sbox(S state[T], const int alpha)
+  {
+    for (int i = 0; i < T; i++) {
+      state[i] = sbox_el(state[i], alpha);
+    }
+  }
+
+  template <typename S, int T>
+  DEVICE_INLINE S add_rc(S state[T], size_t rc_offset, const S* rc)
+  {
+    for (int i = 0; i < T; i++) {
+      state[i] = state[i] + rc[rc_offset + i];
+    }
+  }
+
+  template <typename S>
+  __device__ S mds_light_4x4(S s[4])
+  {
+    S t0 = s[0] + s[1];
+    S t1 = s[2] + s[3];
+    S t2 = s[1] + s[1] + t1;
+    S t3 = s[3] + s[3] + t0;
+    S t4 = S::template mul_unsigned<4>(t1) + t3;
+    S t5 = S::template mul_unsigned<4>(t0) + t2;
+    s[0] = t3 + t5;
+    s[1] = t5;
+    s[2] = t2 + t4;
+    s[3] = t4;
+  }
+
+  // Multiply a 4-element vector x by:
+  // [ 2 3 1 1 ]
+  // [ 1 2 3 1 ]
+  // [ 1 1 2 3 ]
+  // [ 3 1 1 2 ].
+  // https://github.com/Plonky3/Plonky3/blob/main/poseidon2/src/matrix.rs#L36
+  template <typename S>
+  __device__ S mds_light_plonky_4x4(S s[4])
+  {
+    S t01 = s[0] + s[1];
+    S t23 = s[2] + s[3];
+    S t0123 = t01 + t23;
+    S t01123 = t0123 + s[1];
+    S t01233 = t0123 + s[3];
+    s[3] = t01233 + S::template mul_unsigned<2>(s[0]);
+    s[1] = t01123 + S::template mul_unsigned<2>(s[2]);
+    s[0] = t01123 + t01;
+    s[2] = t01233 + t23;
+  }
+
+  template <typename S, int T>
+  __device__ S mds_light(S state[T], MdsType mds)
+  {
+    S sum;
+    switch (T) {
+    case 2:
+      // Matrix circ(2, 1)
+      // [2, 1]
+      // [1, 2]
+      sum = state[0] + state[1];
+      state[0] = state[0] + sum;
+      state[1] = state[1] + sum;
+      break;
+    case 3:
+      // Matrix circ(2, 1, 1)
+      // [2, 1, 1]
+      // [1, 2, 1]
+      // [1, 1, 2]
+      sum = state[0] + state[1] + state[2];
+      state[0] = state[0] + sum;
+      state[1] = state[1] + sum;
+      state[2] = state[2] + sum;
+      break;
+    case 4:
+    case 8:
+    case 12:
+    case 16:
+    case 20:
+    case 24:
+      for (int i = 0; i < T; i += 4) {
+        switch (mds) {
+        case MdsType::DEFAULT_MDS:
+          mds_light_4x4(&state[i]);
+          break;
+        case MdsType::PLONKY:
+          mds_light_plonky_4x4(&state[i]);
+        }
+      }
+
+      S sums[4] = {state[0], state[1], state[2], state[3]};
+      for (int i = 4; i < T; i += 4) {
+        sums[0] = sums[0] + state[i];
+        sums[1] = sums[1] + state[i + 1];
+        sums[2] = sums[2] + state[i + 2];
+        sums[3] = sums[3] + state[i + 3];
+      }
+      for (int i = 0; i < T; i++) {
+        state[i] = state[i] + sums[i % 4];
+      }
+      break;
+    }
+  }
+
+  template <typename S, int T>
+  __device__ S internal_round(S state[T], size_t rc_offset, const Poseidon2Constants<S>& constants)
+  {
+    S element = state[0];
+    element = element + constants.round_constants[rc_offset];
+    element = sbox_el<S>(element, constants.alpha);
+
+    S sum;
+    switch (T) {
+    case 2:
+      // [2, 1]
+      // [1, 3]
+      sum = element + state[1];
+      state[0] = element + sum;
+      state[1] = S::template mul_unsigned<2>(state[1]) + sum;
+      break;
+    case 3:
+      // [2, 1, 1]
+      // [1, 2, 1]
+      // [1, 1, 3]
+      sum = element + state[1] + state[2];
+      state[0] = element + sum;
+      state[1] = state[1] + sum;
+      state[2] = S::template mul_unsigned<2>(state[2]) + sum;
+      break;
+    case 4:
+    case 8:
+    case 12:
+    case 16:
+    case 20:
+    case 24:
+      typename S::Wide wide_sum = S::Wide::from_field(element);
+      for (int i = 1; i < T; i++) {
+        wide_sum = wide_sum + S::Wide::from_field(state[i]);
+      }
+      sum = S::reduce(wide_sum);
+      switch (constants.diffusion) {
+      case DiffusionStrategy::DEFAULT_DIFFUSION:
+        state[0] = element * constants.internal_matrix_diag[0] + sum;
+        for (int i = 1; i < T; i++) {
+          state[i] = state[i] * constants.internal_matrix_diag[i] + sum;
+        }
+        break;
+      case DiffusionStrategy::MONTGOMERY:
+        state[0] = S::from_montgomery(element * constants.internal_matrix_diag[0] + sum);
+        for (int i = 1; i < T; i++) {
+          state[i] = S::from_montgomery(state[i] * constants.internal_matrix_diag[i] + sum);
+        }
+        break;
+      }
+    }
+  }
+
+  template <typename S, int T>
+  __global__ void
+  poseidon2_permutation_kernel(S* states, S* states_out, size_t number_of_states, const Poseidon2Constants<S> constants)
+  {
+    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (idx >= number_of_states) { return; }
+
+    S state[T];
+    UNROLL
+    for (int i = 0; i < T; i++) {
+      state[i] = states[idx * T + i];
+    }
+    unsigned int rn;
+
+    mds_light<S, T>(state, constants.mds_type);
+
+    size_t rc_offset = 0;
+    // External rounds
+    for (rn = 0; rn < constants.external_rounds / 2; rn++) {
+      add_rc<S, T>(state, rc_offset, constants.round_constants);
+      sbox<S, T>(state, constants.alpha);
+      mds_light<S, T>(state, constants.mds_type);
+      rc_offset += T;
+    }
+
+    // Internal rounds
+    for (; rn < constants.external_rounds / 2 + constants.internal_rounds; rn++) {
+      internal_round<S, T>(state, rc_offset, constants);
+      rc_offset++;
+    }
+
+    // External rounds
+    for (; rn < constants.external_rounds + constants.internal_rounds; rn++) {
+      add_rc<S, T>(state, rc_offset, constants.round_constants);
+      sbox<S, T>(state, constants.alpha);
+      mds_light<S, T>(state, constants.mds_type);
+      rc_offset += T;
+    }
+
+    UNROLL
+    for (int i = 0; i < T; i++) {
+      states_out[idx * T + i] = state[i];
+    }
+  }
+
+  // These function is just doing copy from the states to the output
+  template <typename S, int T>
+  __global__ void get_hash_results(S* states, size_t number_of_states, int index, S* out)
+  {
+    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (idx >= number_of_states) { return; }
+
+    out[idx] = states[idx * T + index];
+  }
+} // namespace poseidon2
--- a/icicle/src/poseidon2/poseidon.cu
+++ b/icicle/src/poseidon2/poseidon.cu
@@ -0,0 +1,76 @@
+#include "poseidon2/poseidon2.cuh"
+#include "constants.cu"
+#include "kernels.cu"
+
+namespace poseidon2 {
+  static int poseidon_block_size = 128;
+
+  template <typename S, int T>
+  int poseidon_number_of_blocks(size_t number_of_states)
+  {
+    return number_of_states / poseidon_block_size + static_cast<bool>(number_of_states % poseidon_block_size);
+  }
+
+  template <typename S, int T>
+  cudaError_t permute_many(
+    S* states, S* states_out, size_t number_of_states, const Poseidon2Constants<S>& constants, cudaStream_t& stream)
+  {
+    poseidon2_permutation_kernel<S, T>
+      <<<poseidon_number_of_blocks<S, T>(number_of_states), poseidon_block_size, 0, stream>>>(
+        states, states_out, number_of_states, constants);
+    CHK_IF_RETURN(cudaPeekAtLastError());
+    return CHK_LAST();
+  }
+
+  template <typename S, int T>
+  cudaError_t poseidon2_hash(
+    S* states,
+    S* output,
+    size_t number_of_states,
+    const Poseidon2Constants<S>& constants,
+    const Poseidon2Config& config)
+  {
+    CHK_INIT_IF_RETURN();
+    cudaStream_t& stream = config.ctx.stream;
+    S* d_states;
+    if (config.are_states_on_device) {
+      d_states = states;
+    } else {
+      // allocate memory for {number_of_states} states of {t} scalars each
+      CHK_IF_RETURN(cudaMallocAsync(&d_states, number_of_states * T * sizeof(S), stream))
+      CHK_IF_RETURN(cudaMemcpyAsync(d_states, states, number_of_states * T * sizeof(S), cudaMemcpyHostToDevice, stream))
+    }
+
+    cudaError_t hash_error = permute_many<S, T>(d_states, d_states, number_of_states, constants, stream);
+    CHK_IF_RETURN(hash_error);
+
+    if (config.mode == PoseidonMode::COMPRESSION) {
+      S* output_device;
+      if (config.are_outputs_on_device) {
+        output_device = output;
+      } else {
+        CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_states * sizeof(S), stream))
+      }
+
+      get_hash_results<S, T><<<poseidon_number_of_blocks<S, T>(number_of_states), poseidon_block_size, 0, stream>>>(
+        d_states, number_of_states, config.output_index, output_device);
+      CHK_IF_RETURN(cudaPeekAtLastError());
+
+      if (!config.are_outputs_on_device) {
+        CHK_IF_RETURN(
+          cudaMemcpyAsync(output, output_device, number_of_states * sizeof(S), cudaMemcpyDeviceToHost, stream));
+        CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
+      }
+    } else {
+      if (!config.are_states_on_device || !config.are_outputs_on_device) {
+        CHK_IF_RETURN(
+          cudaMemcpyAsync(output, d_states, number_of_states * T * sizeof(S), cudaMemcpyDeviceToHost, stream));
+      }
+    }
+
+    if (!config.are_states_on_device) CHK_IF_RETURN(cudaFreeAsync(d_states, stream));
+
+    if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
+    return CHK_LAST();
+  }
+} // namespace poseidon2
--- a/icicle/src/poseidon2/test.cu
+++ b/icicle/src/poseidon2/test.cu
--- a/wrappers/rust/icicle-core/Cargo.toml
+++ b/wrappers/rust/icicle-core/Cargo.toml
@@ -11,14 +11,13 @@ repository.workspace = true
 [dependencies]

 icicle-cuda-runtime = { workspace = true }
-
 ark-ff = { version = "0.4.0", optional = true }
 ark-ec = { version = "0.4.0", optional = true, features = ["parallel"] }
 ark-poly = { version = "0.4.0", optional = true }
 ark-std = { version = "0.4.0", optional = true }

 rayon = "1.8.1"
-
+hex = "0.4"
 criterion = "0.3"

 [dev-dependencies]
--- a/wrappers/rust/icicle-core/src/field.rs
+++ b/wrappers/rust/icicle-core/src/field.rs
@@ -3,6 +3,7 @@ use crate::traits::ArkConvertible;
 use crate::traits::{FieldConfig, FieldImpl, MontgomeryConvertible};
 #[cfg(feature = "arkworks")]
 use ark_ff::{BigInteger, Field as ArkField, PrimeField};
+use hex::FromHex;
 use icicle_cuda_runtime::device_context::DeviceContext;
 use icicle_cuda_runtime::error::CudaError;
 use icicle_cuda_runtime::memory::DeviceSlice;
@@ -82,6 +83,12 @@ impl<const NUM_LIMBS: usize, F: FieldConfig> FieldImpl for Field<NUM_LIMBS, F> {
        Self::from(limbs)
    }

+    fn from_hex(s: &str) -> Self {
+        let mut bytes = Vec::from_hex(if s.starts_with("0x") { &s[2..] } else { s }).expect("Invalid hex string");
+        bytes.reverse();
+        Self::from_bytes_le(&bytes)
+    }
+
    fn zero() -> Self {
        FieldImpl::from_u32(0)
    }
--- a/wrappers/rust/icicle-core/src/lib.rs
+++ b/wrappers/rust/icicle-core/src/lib.rs
@@ -6,6 +6,7 @@ pub mod msm;
 pub mod ntt;
 pub mod polynomials;
 pub mod poseidon;
+pub mod poseidon2;
 #[doc(hidden)]
 pub mod tests;
 pub mod traits;
--- a/wrappers/rust/icicle-core/src/poseidon/tests.rs
+++ b/wrappers/rust/icicle-core/src/poseidon/tests.rs
@@ -56,10 +56,11 @@ pub fn check_poseidon_hash_many<F: FieldImpl>()
 where
    <F as FieldImpl>::Config: Poseidon<F>,
 {
-    let arity = 2u32;
-    let constants = init_poseidon::<F>(arity as u32);
+    for arity in [2, 4] {
+        let constants = init_poseidon::<F>(arity as u32);

-    _check_poseidon_hash_many(constants);
+        _check_poseidon_hash_many(constants);
+    }
 }

 pub fn check_poseidon_custom_config<F: FieldImpl>(field_bytes: usize, field_prefix: &str, partial_rounds: u32)
--- a/wrappers/rust/icicle-core/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-core/src/poseidon2/mod.rs
@@ -0,0 +1,384 @@
+#[doc(hidden)]
+pub mod tests;
+
+use icicle_cuda_runtime::{
+    device::check_device,
+    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
+    memory::{DeviceSlice, HostOrDeviceSlice},
+};
+
+use crate::{error::IcicleResult, traits::FieldImpl};
+
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub enum DiffusionStrategy {
+    Default,
+    Montgomery,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub enum MdsType {
+    Default,
+    Plonky,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub enum PoseidonMode {
+    Compression,
+    Permutation,
+}
+
+#[repr(C)]
+pub struct Poseidon2Constants<'a, F: FieldImpl> {
+    width: u32,
+
+    alpha: u32,
+
+    internal_rounds: u32,
+
+    external_rounds: u32,
+
+    round_constants: &'a DeviceSlice<F>,
+
+    inernal_matrix_diag: &'a DeviceSlice<F>,
+
+    pub mds_type: MdsType,
+
+    pub diffusion: DiffusionStrategy,
+}
+
+impl<F: FieldImpl> std::fmt::Debug for Poseidon2Constants<'_, F> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}, {}, {}, {}",
+            self.width, self.alpha, self.internal_rounds, self.external_rounds
+        )
+    }
+}
+
+/// Struct that encodes Poseidon parameters to be passed into the [poseidon_hash_many](poseidon_hash_many) function.
+#[repr(C)]
+#[derive(Debug, Clone)]
+pub struct Poseidon2Config<'a> {
+    /// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
+    pub ctx: DeviceContext<'a>,
+
+    are_states_on_device: bool,
+
+    are_outputs_on_device: bool,
+
+    pub mode: PoseidonMode,
+
+    pub output_index: u32,
+
+    /// If true, hash results will also be copied in the input pointer in aligned format
+    pub loop_state: bool,
+
+    /// Whether to run Poseidon asynchronously. If set to `true`, Poseidon will be non-blocking
+    /// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
+    /// If set to `false`, Poseidon will block the current CPU thread.
+    pub is_async: bool,
+}
+
+impl<'a> Default for Poseidon2Config<'a> {
+    fn default() -> Self {
+        Self::default_for_device(DEFAULT_DEVICE_ID)
+    }
+}
+
+impl<'a> Poseidon2Config<'a> {
+    pub fn default_for_device(device_id: usize) -> Self {
+        Self {
+            ctx: DeviceContext::default_for_device(device_id),
+            are_states_on_device: false,
+            are_outputs_on_device: false,
+            mode: PoseidonMode::Compression,
+            output_index: 1,
+            loop_state: false,
+            is_async: false,
+        }
+    }
+}
+
+pub trait Poseidon2<F: FieldImpl> {
+    fn create_optimized_constants<'a>(
+        width: u32,
+        alpha: u32,
+        internal_rounds: u32,
+        external_rounds: u32,
+        round_constants: &mut [F],
+        internal_matrix_diag: &mut [F],
+        mds_type: MdsType,
+        diffusion: DiffusionStrategy,
+        ctx: &DeviceContext,
+    ) -> IcicleResult<Poseidon2Constants<'a, F>>;
+    fn load_optimized_constants<'a>(
+        width: u32,
+        mds_type: MdsType,
+        diffusion: DiffusionStrategy,
+        ctx: &DeviceContext,
+    ) -> IcicleResult<Poseidon2Constants<'a, F>>;
+    fn poseidon_unchecked(
+        states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+        number_of_states: u32,
+        width: u32,
+        constants: &Poseidon2Constants<F>,
+        config: &Poseidon2Config,
+    ) -> IcicleResult<()>;
+}
+
+/// Loads pre-calculated poseidon constants on the GPU.
+pub fn load_optimized_poseidon2_constants<'a, F>(
+    width: u32,
+    mds_type: MdsType,
+    diffusion: DiffusionStrategy,
+    ctx: &DeviceContext,
+) -> IcicleResult<Poseidon2Constants<'a, F>>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    <<F as FieldImpl>::Config as Poseidon2<F>>::load_optimized_constants(width, mds_type, diffusion, ctx)
+}
+
+/// Creates new instance of poseidon constants on the GPU.
+pub fn create_optimized_poseidon2_constants<'a, F>(
+    width: u32,
+    alpha: u32,
+    ctx: &DeviceContext,
+    internal_rounds: u32,
+    external_rounds: u32,
+    round_constants: &mut [F],
+    internal_matrix_diag: &mut [F],
+    mds_type: MdsType,
+    diffusion: DiffusionStrategy,
+) -> IcicleResult<Poseidon2Constants<'a, F>>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    <<F as FieldImpl>::Config as Poseidon2<F>>::create_optimized_constants(
+        width,
+        alpha,
+        internal_rounds,
+        external_rounds,
+        round_constants,
+        internal_matrix_diag,
+        mds_type,
+        diffusion,
+        ctx,
+    )
+}
+
+/// Computes the poseidon hashes for multiple preimages.
+///
+/// # Arguments
+///
+/// * `input` - a pointer to the input data. May point to a vector of preimages or a vector of states filled with preimages.
+///
+/// * `output` - a pointer to the output data. Must be at least of size [number_of_states](number_of_states)
+///
+/// * `number_of_states` - number of input blocks of size `arity`
+///
+/// * `arity` - the arity of the hash function (the size of 1 preimage)
+///
+/// * `constants` - Poseidon constants.
+///
+/// * `config` - config used to specify extra arguments of the Poseidon.
+pub fn poseidon_hash_many<F>(
+    states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
+    number_of_states: u32,
+    width: u32,
+    constants: &Poseidon2Constants<F>,
+    config: &Poseidon2Config,
+) -> IcicleResult<()>
+where
+    F: FieldImpl,
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    if states.len() < (number_of_states * width) as usize {
+        panic!(
+            "input len is {}; but needs to be at least {}",
+            states.len(),
+            number_of_states * width
+        );
+    }
+
+    if output.len() < number_of_states as usize {
+        panic!(
+            "output len is {}; but needs to be at least {}",
+            output.len(),
+            number_of_states
+        );
+    }
+
+    let ctx_device_id = config
+        .ctx
+        .device_id;
+    if let Some(device_id) = states.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in input and context are different"
+        );
+    }
+    if let Some(device_id) = output.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in output and context are different"
+        );
+    }
+    check_device(ctx_device_id);
+    let mut local_cfg = config.clone();
+    local_cfg.are_states_on_device = states.is_on_device();
+    local_cfg.are_outputs_on_device = output.is_on_device();
+
+    <<F as FieldImpl>::Config as Poseidon2<F>>::poseidon_unchecked(
+        states,
+        output,
+        number_of_states,
+        width,
+        constants,
+        &local_cfg,
+    )
+}
+
+#[macro_export]
+macro_rules! impl_poseidon2 {
+    (
+      $field_prefix:literal,
+      $field_prefix_ident:ident,
+      $field:ident,
+      $field_config:ident
+    ) => {
+        mod $field_prefix_ident {
+            use crate::poseidon2::{
+                $field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Config,
+                Poseidon2Constants,
+            };
+            extern "C" {
+                #[link_name = concat!($field_prefix, "_create_optimized_poseidon2_constants_cuda")]
+                pub(crate) fn _create_optimized_constants(
+                    width: u32,
+                    alpha: u32,
+                    internal_rounds: u32,
+                    external_rounds: u32,
+                    constants: *mut $field,
+                    internal_matrix_diag: *mut $field,
+                    mds_type: MdsType,
+                    diffusion: DiffusionStrategy,
+                    ctx: &DeviceContext,
+                    poseidon_constants: *mut Poseidon2Constants<$field>,
+                ) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "_init_optimized_poseidon2_constants_cuda")]
+                pub(crate) fn _load_optimized_constants(
+                    width: u32,
+                    mds_type: MdsType,
+                    diffusion: DiffusionStrategy,
+                    ctx: &DeviceContext,
+                    constants: *mut Poseidon2Constants<$field>,
+                ) -> CudaError;
+
+                #[link_name = concat!($field_prefix, "_poseidon2_hash_cuda")]
+                pub(crate) fn hash_many(
+                    states: *mut $field,
+                    output: *mut $field,
+                    number_of_states: u32,
+                    width: u32,
+                    constants: &Poseidon2Constants<$field>,
+                    config: &Poseidon2Config,
+                ) -> CudaError;
+            }
+        }
+
+        impl Poseidon2<$field> for $field_config {
+            fn create_optimized_constants<'a>(
+                width: u32,
+                alpha: u32,
+                internal_rounds: u32,
+                external_rounds: u32,
+                round_constants: &mut [$field],
+                internal_matrix_diag: &mut [$field],
+                mds_type: MdsType,
+                diffusion: DiffusionStrategy,
+                ctx: &DeviceContext,
+            ) -> IcicleResult<Poseidon2Constants<'a, $field>> {
+                unsafe {
+                    let mut poseidon_constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
+                    let err = $field_prefix_ident::_create_optimized_constants(
+                        width,
+                        alpha,
+                        internal_rounds,
+                        external_rounds,
+                        round_constants as *mut _ as *mut $field,
+                        internal_matrix_diag as *mut _ as *mut $field,
+                        mds_type,
+                        diffusion,
+                        ctx,
+                        poseidon_constants.as_mut_ptr(),
+                    )
+                    .wrap();
+                    err.and(Ok(poseidon_constants.assume_init()))
+                }
+            }
+
+            fn load_optimized_constants<'a>(
+                width: u32,
+                mds_type: MdsType,
+                diffusion: DiffusionStrategy,
+                ctx: &DeviceContext,
+            ) -> IcicleResult<Poseidon2Constants<'a, $field>> {
+                unsafe {
+                    let mut constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
+                    let err = $field_prefix_ident::_load_optimized_constants(
+                        width,
+                        mds_type,
+                        diffusion,
+                        ctx,
+                        constants.as_mut_ptr(),
+                    )
+                    .wrap();
+                    err.and(Ok(constants.assume_init()))
+                }
+            }
+
+            fn poseidon_unchecked(
+                states: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+                output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
+                number_of_states: u32,
+                width: u32,
+                constants: &Poseidon2Constants<$field>,
+                config: &Poseidon2Config,
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::hash_many(
+                        states.as_mut_ptr(),
+                        output.as_mut_ptr(),
+                        number_of_states,
+                        width,
+                        constants,
+                        config,
+                    )
+                    .wrap()
+                }
+            }
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! impl_poseidon2_tests {
+    (
+      $field:ident
+    ) => {
+        #[test]
+        fn test_poseidon2_hash_many() {
+            check_poseidon_hash_many::<$field>()
+        }
+    };
+}
--- a/wrappers/rust/icicle-core/src/poseidon2/tests.rs
+++ b/wrappers/rust/icicle-core/src/poseidon2/tests.rs
@@ -0,0 +1,105 @@
+use crate::poseidon2::{MdsType, PoseidonMode};
+use crate::traits::FieldImpl;
+use icicle_cuda_runtime::device_context::DeviceContext;
+use icicle_cuda_runtime::memory::{HostOrDeviceSlice, HostSlice};
+
+use super::{
+    load_optimized_poseidon2_constants, poseidon_hash_many, DiffusionStrategy, Poseidon2, Poseidon2Config,
+    Poseidon2Constants,
+};
+
+pub fn init_poseidon<'a, F: FieldImpl>(
+    width: u32,
+    mds_type: MdsType,
+    diffusion: DiffusionStrategy,
+) -> Poseidon2Constants<'a, F>
+where
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    let ctx = DeviceContext::default();
+    load_optimized_poseidon2_constants::<F>(width, mds_type, diffusion, &ctx).unwrap()
+}
+
+fn _check_poseidon_hash_many<F: FieldImpl>(width: u32, constants: Poseidon2Constants<F>) -> (F, F)
+where
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    let test_size = 1 << 10;
+    let mut inputs = vec![F::one(); test_size * width as usize];
+    let mut outputs = vec![F::zero(); test_size];
+
+    let input_slice = HostSlice::from_mut_slice(&mut inputs);
+    let output_slice = HostSlice::from_mut_slice(&mut outputs);
+
+    let config = Poseidon2Config::default();
+    poseidon_hash_many::<F>(
+        input_slice,
+        output_slice,
+        test_size as u32,
+        width as u32,
+        &constants,
+        &config,
+    )
+    .unwrap();
+
+    let a1 = output_slice[0];
+    let a2 = output_slice[output_slice.len() - 2];
+
+    assert_eq!(a1, a2);
+
+    (a1, a2)
+}
+
+pub fn check_poseidon_hash_many<'a, F: FieldImpl + 'a>()
+where
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    let widths = [2, 3, 4, 8, 12, 16, 20, 24];
+    for width in widths {
+        let constants = init_poseidon::<'a, F>(width as u32, MdsType::Default, DiffusionStrategy::Default);
+
+        _check_poseidon_hash_many(width, constants);
+    }
+}
+
+pub fn check_poseidon_kats<'a, F: FieldImpl>(width: usize, kats: &[F], constants: &Poseidon2Constants<'a, F>)
+where
+    <F as FieldImpl>::Config: Poseidon2<F>,
+{
+    assert_eq!(width, kats.len());
+
+    let batch_size = 1024;
+    let mut input = vec![F::one(); width];
+    let mut outputs = vec![F::zero(); width * batch_size];
+
+    for i in 0..width {
+        input[i] = F::from_u32(i as u32);
+    }
+
+    let mut inputs: Vec<F> = std::iter::repeat(input.clone())
+        .take(batch_size)
+        .flatten()
+        .collect();
+
+    let input_slice = HostSlice::from_mut_slice(&mut inputs);
+    let output_slice = HostSlice::from_mut_slice(&mut outputs);
+
+    let mut config = Poseidon2Config::default();
+    config.mode = PoseidonMode::Permutation;
+    poseidon_hash_many::<F>(
+        input_slice,
+        output_slice,
+        batch_size as u32,
+        width as u32,
+        &constants,
+        &config,
+    )
+    .unwrap();
+
+    for (i, val) in output_slice
+        .iter()
+        .enumerate()
+    {
+        assert_eq!(*val, kats[i % width]);
+    }
+}
--- a/wrappers/rust/icicle-core/src/traits.rs
+++ b/wrappers/rust/icicle-core/src/traits.rs
@@ -27,6 +27,7 @@ pub trait FieldImpl:

    fn to_bytes_le(&self) -> Vec<u8>;
    fn from_bytes_le(bytes: &[u8]) -> Self;
+    fn from_hex(s: &str) -> Self;
    fn zero() -> Self;
    fn one() -> Self;
    fn from_u32(val: u32) -> Self;
--- a/wrappers/rust/icicle-curves/icicle-bn254/src/lib.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/lib.rs
@@ -1,9 +1,11 @@
 pub mod curve;
 pub mod ecntt;
+
 pub mod msm;
 pub mod ntt;
 pub mod polynomials;
 pub mod poseidon;
+pub mod poseidon2;
 pub mod tree;
 pub mod vec_ops;

--- a/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-curves/icicle-bn254/src/poseidon2/mod.rs
@@ -0,0 +1,35 @@
+use crate::curve::{ScalarCfg, ScalarField};
+
+use icicle_core::error::IcicleResult;
+use icicle_core::impl_poseidon2;
+use icicle_core::poseidon2::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Config, Poseidon2Constants};
+use icicle_core::traits::IcicleResultWrap;
+use icicle_cuda_runtime::device_context::DeviceContext;
+use icicle_cuda_runtime::error::CudaError;
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+use core::mem::MaybeUninit;
+
+impl_poseidon2!("bn254", bn254, ScalarField, ScalarCfg);
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use crate::curve::ScalarField;
+    use icicle_core::impl_poseidon2_tests;
+    use icicle_core::ntt::FieldImpl;
+    use icicle_core::poseidon2::{tests::*, DiffusionStrategy, MdsType};
+
+    impl_poseidon2_tests!(ScalarField);
+
+    #[test]
+    fn test_poseidon2_kats() {
+        let kats = [
+            ScalarField::from_hex("0x0bb61d24daca55eebcb1929a82650f328134334da98ea4f847f760054f4a3033"),
+            ScalarField::from_hex("0x303b6f7c86d043bfcbcc80214f26a30277a15d3f74ca654992defe7ff8d03570"),
+            ScalarField::from_hex("0x1ed25194542b12eef8617361c3ba7c52e660b145994427cc86296242cf766ec8"),
+        ];
+
+        let constants = init_poseidon::<ScalarField>(3, MdsType::Default, DiffusionStrategy::Default);
+        check_poseidon_kats(3, &kats, &constants);
+    }
+}
--- a/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml
+++ b/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml
@@ -17,10 +17,13 @@ cmake = "0.1.50"
 [dev-dependencies]
 risc0-core = "0.21.0"
 risc0-zkp = "0.21.0"
-p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3", rev = "83590121c8c28011cffa7e73cb71cf9bf94b8477" }
-p3-field = { git = "https://github.com/Plonky3/Plonky3", rev = "83590121c8c28011cffa7e73cb71cf9bf94b8477" }
-p3-dft = { git = "https://github.com/Plonky3/Plonky3", rev = "83590121c8c28011cffa7e73cb71cf9bf94b8477" }
-p3-matrix = { git = "https://github.com/Plonky3/Plonky3", rev = "83590121c8c28011cffa7e73cb71cf9bf94b8477" }
+p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-symmetric = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-mds = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-poseidon2 = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-field = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-dft = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
+p3-matrix = { git = "https://github.com/Plonky3/Plonky3", rev = "1e87146ebfaedc2150b635b10a096b733795fdce" }
 serial_test = "3.0.0"

 [features]
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/lib.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod field;
 pub mod ntt;
 pub mod polynomials;
+pub mod poseidon2;
 pub mod vec_ops;
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/ntt/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/ntt/mod.rs
@@ -120,7 +120,7 @@ pub(crate) mod tests {
            ntt_cfg.columns_batch = true;
            ntt_inplace(HostSlice::from_mut_slice(&mut scalars[..]), NTTDir::kForward, &ntt_cfg).unwrap();

-            let result_p3 = Radix2Dit.dft_batch(matrix_p3);
+            let result_p3 = Radix2Dit::default().dft_batch(matrix_p3);

            for i in 0..nrows {
                for j in 0..ntt_size {
@@ -156,7 +156,7 @@ pub(crate) mod tests {
            )
            .unwrap();

-            let ext_result_p3 = Radix2Dit.dft_batch(ext_matrix_p3);
+            let ext_result_p3 = Radix2Dit::default().dft_batch(ext_matrix_p3);

            for i in 0..nrows {
                for j in 0..ntt_size {
--- a/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs
+++ b/wrappers/rust/icicle-fields/icicle-babybear/src/poseidon2/mod.rs
@@ -0,0 +1,633 @@
+use crate::field::{ScalarCfg, ScalarField};
+
+use icicle_core::error::IcicleResult;
+use icicle_core::impl_poseidon2;
+use icicle_core::poseidon2::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Config, Poseidon2Constants};
+use icicle_core::traits::IcicleResultWrap;
+use icicle_cuda_runtime::device_context::DeviceContext;
+use icicle_cuda_runtime::error::CudaError;
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+use core::mem::MaybeUninit;
+
+impl_poseidon2!("babybear", babybear, ScalarField, ScalarCfg);
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use crate::field::ScalarField;
+    use icicle_core::impl_poseidon2_tests;
+    use icicle_core::poseidon2::{create_optimized_poseidon2_constants, tests::*, DiffusionStrategy, MdsType};
+    use icicle_core::traits::FieldImpl;
+    use icicle_cuda_runtime::device_context::DeviceContext;
+
+    use p3_baby_bear::BabyBear;
+    use p3_baby_bear::DiffusionMatrixBabyBear;
+    use p3_field::{AbstractField, PrimeField32};
+    use p3_poseidon2::{Poseidon2, Poseidon2ExternalMatrixGeneral};
+    use p3_symmetric::Permutation;
+
+    impl_poseidon2_tests!(ScalarField);
+
+    #[test]
+    fn test_poseidon2_kats() {
+        let kats = [
+            ScalarField::from_hex("0x2ed3e23d"),
+            ScalarField::from_hex("0x12921fb0"),
+            ScalarField::from_hex("0x0e659e79"),
+            ScalarField::from_hex("0x61d81dc9"),
+            ScalarField::from_hex("0x32bae33b"),
+            ScalarField::from_hex("0x62486ae3"),
+            ScalarField::from_hex("0x1e681b60"),
+            ScalarField::from_hex("0x24b91325"),
+            ScalarField::from_hex("0x2a2ef5b9"),
+            ScalarField::from_hex("0x50e8593e"),
+            ScalarField::from_hex("0x5bc818ec"),
+            ScalarField::from_hex("0x10691997"),
+            ScalarField::from_hex("0x35a14520"),
+            ScalarField::from_hex("0x2ba6a3c5"),
+            ScalarField::from_hex("0x279d47ec"),
+            ScalarField::from_hex("0x55014e81"),
+            ScalarField::from_hex("0x5953a67f"),
+            ScalarField::from_hex("0x2f403111"),
+            ScalarField::from_hex("0x6b8828ff"),
+            ScalarField::from_hex("0x1801301f"),
+            ScalarField::from_hex("0x2749207a"),
+            ScalarField::from_hex("0x3dc9cf21"),
+            ScalarField::from_hex("0x3c985ba2"),
+            ScalarField::from_hex("0x57a99864"),
+        ];
+
+        let constants = init_poseidon::<ScalarField>(24, MdsType::Default, DiffusionStrategy::Default);
+        check_poseidon_kats(24, &kats, &constants);
+    }
+
+    #[test]
+    fn test_poseidon2_plonky3_t16() {
+        let rounds_p = 13;
+        let rounds_f = 8;
+        const ALPHA: u64 = 7;
+        const WIDTH: usize = 16;
+
+        let cnv = BabyBear::from_canonical_u32;
+        let external_constants: Vec<[BabyBear; 16]> = vec![
+            [
+                cnv(1774958255),
+                cnv(1185780729),
+                cnv(1621102414),
+                cnv(1796380621),
+                cnv(588815102),
+                cnv(1932426223),
+                cnv(1925334750),
+                cnv(747903232),
+                cnv(89648862),
+                cnv(360728943),
+                cnv(977184635),
+                cnv(1425273457),
+                cnv(256487465),
+                cnv(1200041953),
+                cnv(572403254),
+                cnv(448208942),
+            ],
+            [
+                cnv(1215789478),
+                cnv(944884184),
+                cnv(953948096),
+                cnv(547326025),
+                cnv(646827752),
+                cnv(889997530),
+                cnv(1536873262),
+                cnv(86189867),
+                cnv(1065944411),
+                cnv(32019634),
+                cnv(333311454),
+                cnv(456061748),
+                cnv(1963448500),
+                cnv(1827584334),
+                cnv(1391160226),
+                cnv(1348741381),
+            ],
+            [
+                cnv(88424255),
+                cnv(104111868),
+                cnv(1763866748),
+                cnv(79691676),
+                cnv(1988915530),
+                cnv(1050669594),
+                cnv(359890076),
+                cnv(573163527),
+                cnv(222820492),
+                cnv(159256268),
+                cnv(669703072),
+                cnv(763177444),
+                cnv(889367200),
+                cnv(256335831),
+                cnv(704371273),
+                cnv(25886717),
+            ],
+            [
+                cnv(51754520),
+                cnv(1833211857),
+                cnv(454499742),
+                cnv(1384520381),
+                cnv(777848065),
+                cnv(1053320300),
+                cnv(1851729162),
+                cnv(344647910),
+                cnv(401996362),
+                cnv(1046925956),
+                cnv(5351995),
+                cnv(1212119315),
+                cnv(754867989),
+                cnv(36972490),
+                cnv(751272725),
+                cnv(506915399),
+            ],
+            [
+                cnv(1922082829),
+                cnv(1870549801),
+                cnv(1502529704),
+                cnv(1990744480),
+                cnv(1700391016),
+                cnv(1702593455),
+                cnv(321330495),
+                cnv(528965731),
+                cnv(183414327),
+                cnv(1886297254),
+                cnv(1178602734),
+                cnv(1923111974),
+                cnv(744004766),
+                cnv(549271463),
+                cnv(1781349648),
+                cnv(542259047),
+            ],
+            [
+                cnv(1536158148),
+                cnv(715456982),
+                cnv(503426110),
+                cnv(340311124),
+                cnv(1558555932),
+                cnv(1226350925),
+                cnv(742828095),
+                cnv(1338992758),
+                cnv(1641600456),
+                cnv(1843351545),
+                cnv(301835475),
+                cnv(43203215),
+                cnv(386838401),
+                cnv(1520185679),
+                cnv(1235297680),
+                cnv(904680097),
+            ],
+            [
+                cnv(1491801617),
+                cnv(1581784677),
+                cnv(913384905),
+                cnv(247083962),
+                cnv(532844013),
+                cnv(107190701),
+                cnv(213827818),
+                cnv(1979521776),
+                cnv(1358282574),
+                cnv(1681743681),
+                cnv(1867507480),
+                cnv(1530706910),
+                cnv(507181886),
+                cnv(695185447),
+                cnv(1172395131),
+                cnv(1250800299),
+            ],
+            [
+                cnv(1503161625),
+                cnv(817684387),
+                cnv(498481458),
+                cnv(494676004),
+                cnv(1404253825),
+                cnv(108246855),
+                cnv(59414691),
+                cnv(744214112),
+                cnv(890862029),
+                cnv(1342765939),
+                cnv(1417398904),
+                cnv(1897591937),
+                cnv(1066647396),
+                cnv(1682806907),
+                cnv(1015795079),
+                cnv(1619482808),
+            ],
+        ];
+
+        let internal_constants: Vec<BabyBear> = vec![
+            cnv(1518359488),
+            cnv(1765533241),
+            cnv(945325693),
+            cnv(422793067),
+            cnv(311365592),
+            cnv(1311448267),
+            cnv(1629555936),
+            cnv(1009879353),
+            cnv(190525218),
+            cnv(786108885),
+            cnv(557776863),
+            cnv(212616710),
+            cnv(605745517),
+        ];
+
+        let poseidon2: Poseidon2<BabyBear, Poseidon2ExternalMatrixGeneral, DiffusionMatrixBabyBear, WIDTH, ALPHA> =
+            Poseidon2::new(
+                rounds_f,
+                external_constants.clone(),
+                Poseidon2ExternalMatrixGeneral::default(),
+                rounds_p,
+                internal_constants.clone(),
+                DiffusionMatrixBabyBear::default(),
+            );
+
+        let mut input: [BabyBear; WIDTH] = [BabyBear::zero(); WIDTH];
+        for i in 0..WIDTH {
+            input[i] = BabyBear::from_canonical_u32(i as u32);
+        }
+
+        let output = poseidon2.permute(input);
+
+        let mut kats: [ScalarField; WIDTH] = [ScalarField::zero(); WIDTH];
+        for i in 0..WIDTH {
+            kats[i] = ScalarField::from_u32(output[i].as_canonical_u32());
+        }
+
+        let ctx = DeviceContext::default();
+        let mut round_constants = vec![ScalarField::zero(); rounds_f * WIDTH + rounds_p];
+        let external_constants_flattened: Vec<ScalarField> = external_constants
+            .into_iter()
+            .flatten()
+            .map(|c| ScalarField::from_u32(c.as_canonical_u32()))
+            .collect();
+        let internal_constants_icicle: Vec<ScalarField> = internal_constants
+            .iter()
+            .map(|&c| ScalarField::from_u32(c.as_canonical_u32()))
+            .collect();
+
+        (&mut round_constants[..rounds_f / 2 * WIDTH])
+            .copy_from_slice(&external_constants_flattened[..rounds_f / 2 * WIDTH]);
+        (&mut round_constants[rounds_f / 2 * WIDTH..rounds_f / 2 * WIDTH + rounds_p])
+            .copy_from_slice(&internal_constants_icicle[..]);
+        (&mut round_constants[rounds_p + rounds_f / 2 * WIDTH..])
+            .copy_from_slice(&external_constants_flattened[rounds_f / 2 * WIDTH..]);
+
+        let mut internal_matrix_diag = vec![
+            ScalarField::from_u32(0x78000001 - 2),
+            ScalarField::from_u32(1),
+            ScalarField::from_u32(1 << 1),
+            ScalarField::from_u32(1 << 2),
+            ScalarField::from_u32(1 << 3),
+            ScalarField::from_u32(1 << 4),
+            ScalarField::from_u32(1 << 5),
+            ScalarField::from_u32(1 << 6),
+            ScalarField::from_u32(1 << 7),
+            ScalarField::from_u32(1 << 8),
+            ScalarField::from_u32(1 << 9),
+            ScalarField::from_u32(1 << 10),
+            ScalarField::from_u32(1 << 11),
+            ScalarField::from_u32(1 << 12),
+            ScalarField::from_u32(1 << 13),
+            ScalarField::from_u32(1 << 15),
+        ];
+        let mut constants = create_optimized_poseidon2_constants(
+            WIDTH as u32,
+            ALPHA as u32,
+            &ctx,
+            rounds_p as u32,
+            rounds_f as u32,
+            &mut round_constants,
+            &mut internal_matrix_diag,
+            MdsType::Plonky,
+            DiffusionStrategy::Montgomery,
+        )
+        .unwrap();
+        check_poseidon_kats(WIDTH, &kats, &constants);
+    }
+
+    #[test]
+    fn test_poseidon2_plonky3_t24() {
+        let rounds_p = 21;
+        let rounds_f = 8;
+        const ALPHA: u64 = 7;
+        const WIDTH: usize = 24;
+
+        let cnv = BabyBear::from_canonical_u32;
+        let external_constants: Vec<[BabyBear; 24]> = vec![
+            [
+                cnv(262278199),
+                cnv(127253399),
+                cnv(314968988),
+                cnv(246143118),
+                cnv(157582794),
+                cnv(118043943),
+                cnv(454905424),
+                cnv(815798990),
+                cnv(1004040026),
+                cnv(1773108264),
+                cnv(1066694495),
+                cnv(1930780904),
+                cnv(1180307149),
+                cnv(1464793095),
+                cnv(1660766320),
+                cnv(1389166148),
+                cnv(343354132),
+                cnv(1307439985),
+                cnv(638242172),
+                cnv(525458520),
+                cnv(1964135730),
+                cnv(1751797115),
+                cnv(1421525369),
+                cnv(831813382),
+            ],
+            [
+                cnv(695835963),
+                cnv(1845603984),
+                cnv(540703332),
+                cnv(1333667262),
+                cnv(1917861751),
+                cnv(1170029417),
+                cnv(1989924532),
+                cnv(1518763784),
+                cnv(1339793538),
+                cnv(622609176),
+                cnv(686842369),
+                cnv(1737016378),
+                cnv(1282239129),
+                cnv(897025192),
+                cnv(716894289),
+                cnv(1997503974),
+                cnv(395622276),
+                cnv(1201063290),
+                cnv(1917549072),
+                cnv(1150912935),
+                cnv(1687379185),
+                cnv(1507936940),
+                cnv(241306552),
+                cnv(989176635),
+            ],
+            [
+                cnv(1147522062),
+                cnv(27129487),
+                cnv(1257820264),
+                cnv(142102402),
+                cnv(217046702),
+                cnv(1664590951),
+                cnv(855276054),
+                cnv(1215259350),
+                cnv(946500736),
+                cnv(552696906),
+                cnv(1424297384),
+                cnv(538103555),
+                cnv(1608853840),
+                cnv(162510541),
+                cnv(623051854),
+                cnv(1549062383),
+                cnv(1908416316),
+                cnv(1622328571),
+                cnv(1079030649),
+                cnv(1584033957),
+                cnv(1099252725),
+                cnv(1910423126),
+                cnv(447555988),
+                cnv(862495875),
+            ],
+            [
+                cnv(128479034),
+                cnv(1587822577),
+                cnv(608401422),
+                cnv(1290028279),
+                cnv(342857858),
+                cnv(825405577),
+                cnv(427731030),
+                cnv(1718628547),
+                cnv(588764636),
+                cnv(204228775),
+                cnv(1454563174),
+                cnv(1740472809),
+                cnv(1338899225),
+                cnv(1269493554),
+                cnv(53007114),
+                cnv(1647670797),
+                cnv(306391314),
+                cnv(172614232),
+                cnv(51256176),
+                cnv(1221257987),
+                cnv(1239734761),
+                cnv(273790406),
+                cnv(1781980094),
+                cnv(1291790245),
+            ],
+            [
+                cnv(53041581),
+                cnv(723038058),
+                cnv(1439947916),
+                cnv(1136469704),
+                cnv(205609311),
+                cnv(1883820770),
+                cnv(14387587),
+                cnv(720724951),
+                cnv(1854174607),
+                cnv(1629316321),
+                cnv(530151394),
+                cnv(1679178250),
+                cnv(1549779579),
+                cnv(48375137),
+                cnv(976057819),
+                cnv(463976218),
+                cnv(875839332),
+                cnv(1946596189),
+                cnv(434078361),
+                cnv(1878280202),
+                cnv(1363837384),
+                cnv(1470845646),
+                cnv(1792450386),
+                cnv(1040977421),
+            ],
+            [
+                cnv(1209164052),
+                cnv(714957516),
+                cnv(390340387),
+                cnv(1213686459),
+                cnv(790726260),
+                cnv(117294666),
+                cnv(140621810),
+                cnv(993455846),
+                cnv(1889603648),
+                cnv(78845751),
+                cnv(925018226),
+                cnv(708123747),
+                cnv(1647665372),
+                cnv(1649953458),
+                cnv(942439428),
+                cnv(1006235079),
+                cnv(238616145),
+                cnv(930036496),
+                cnv(1401020792),
+                cnv(989618631),
+                cnv(1545325389),
+                cnv(1715719711),
+                cnv(755691969),
+                cnv(150307788),
+            ],
+            [
+                cnv(1567618575),
+                cnv(1663353317),
+                cnv(1950429111),
+                cnv(1891637550),
+                cnv(192082241),
+                cnv(1080533265),
+                cnv(1463323727),
+                cnv(890243564),
+                cnv(158646617),
+                cnv(1402624179),
+                cnv(59510015),
+                cnv(1198261138),
+                cnv(1065075039),
+                cnv(1150410028),
+                cnv(1293938517),
+                cnv(76770019),
+                cnv(1478577620),
+                cnv(1748789933),
+                cnv(457372011),
+                cnv(1841795381),
+                cnv(760115692),
+                cnv(1042892522),
+                cnv(1507649755),
+                cnv(1827572010),
+            ],
+            [
+                cnv(1206940496),
+                cnv(1896271507),
+                cnv(1003792297),
+                cnv(738091882),
+                cnv(1124078057),
+                cnv(1889898),
+                cnv(813674331),
+                cnv(228520958),
+                cnv(1832911930),
+                cnv(781141772),
+                cnv(459826664),
+                cnv(202271745),
+                cnv(1296144415),
+                cnv(1111203133),
+                cnv(1090783436),
+                cnv(641665156),
+                cnv(1393671120),
+                cnv(1303271640),
+                cnv(809508074),
+                cnv(162506101),
+                cnv(1262312258),
+                cnv(1672219447),
+                cnv(1608891156),
+                cnv(1380248020),
+            ],
+        ];
+
+        let internal_constants: Vec<BabyBear> = vec![
+            cnv(497520322),
+            cnv(1930103076),
+            cnv(1052077299),
+            cnv(1540960371),
+            cnv(924863639),
+            cnv(1365519753),
+            cnv(1726563304),
+            cnv(440300254),
+            cnv(1891545577),
+            cnv(822033215),
+            cnv(1111544260),
+            cnv(308575117),
+            cnv(1708681573),
+            cnv(1240419708),
+            cnv(1199068823),
+            cnv(1186174623),
+            cnv(1551596046),
+            cnv(1886977120),
+            cnv(1327682690),
+            cnv(1210751726),
+            cnv(1810596765),
+        ];
+
+        let poseidon2: Poseidon2<BabyBear, Poseidon2ExternalMatrixGeneral, DiffusionMatrixBabyBear, WIDTH, ALPHA> =
+            Poseidon2::new(
+                rounds_f,
+                external_constants.clone(),
+                Poseidon2ExternalMatrixGeneral::default(),
+                rounds_p,
+                internal_constants.clone(),
+                DiffusionMatrixBabyBear::default(),
+            );
+
+        let mut input: [BabyBear; WIDTH] = [BabyBear::zero(); WIDTH];
+        for i in 0..WIDTH {
+            input[i] = BabyBear::from_canonical_u32(i as u32);
+        }
+
+        let output = poseidon2.permute(input);
+
+        let mut kats: [ScalarField; WIDTH] = [ScalarField::zero(); WIDTH];
+        for i in 0..WIDTH {
+            kats[i] = ScalarField::from_u32(output[i].as_canonical_u32());
+        }
+
+        let ctx = DeviceContext::default();
+        let mut round_constants = vec![ScalarField::zero(); rounds_f * WIDTH + rounds_p];
+        let external_constants_flattened: Vec<ScalarField> = external_constants
+            .into_iter()
+            .flatten()
+            .map(|c| ScalarField::from_u32(c.as_canonical_u32()))
+            .collect();
+        let internal_constants_icicle: Vec<ScalarField> = internal_constants
+            .iter()
+            .map(|&c| ScalarField::from_u32(c.as_canonical_u32()))
+            .collect();
+
+        (&mut round_constants[..rounds_f / 2 * WIDTH])
+            .copy_from_slice(&external_constants_flattened[..rounds_f / 2 * WIDTH]);
+        (&mut round_constants[rounds_f / 2 * WIDTH..rounds_f / 2 * WIDTH + rounds_p])
+            .copy_from_slice(&internal_constants_icicle[..]);
+        (&mut round_constants[rounds_p + rounds_f / 2 * WIDTH..])
+            .copy_from_slice(&external_constants_flattened[rounds_f / 2 * WIDTH..]);
+
+        let mut internal_matrix_diag = vec![
+            ScalarField::from_u32(0x78000001 - 2),
+            ScalarField::from_u32(1),
+            ScalarField::from_u32(1 << 1),
+            ScalarField::from_u32(1 << 2),
+            ScalarField::from_u32(1 << 3),
+            ScalarField::from_u32(1 << 4),
+            ScalarField::from_u32(1 << 5),
+            ScalarField::from_u32(1 << 6),
+            ScalarField::from_u32(1 << 7),
+            ScalarField::from_u32(1 << 8),
+            ScalarField::from_u32(1 << 9),
+            ScalarField::from_u32(1 << 10),
+            ScalarField::from_u32(1 << 11),
+            ScalarField::from_u32(1 << 12),
+            ScalarField::from_u32(1 << 13),
+            ScalarField::from_u32(1 << 14),
+            ScalarField::from_u32(1 << 15),
+            ScalarField::from_u32(1 << 16),
+            ScalarField::from_u32(1 << 18),
+            ScalarField::from_u32(1 << 19),
+            ScalarField::from_u32(1 << 20),
+            ScalarField::from_u32(1 << 21),
+            ScalarField::from_u32(1 << 22),
+            ScalarField::from_u32(1 << 23),
+        ];
+        let mut constants = create_optimized_poseidon2_constants(
+            WIDTH as u32,
+            ALPHA as u32,
+            &ctx,
+            rounds_p as u32,
+            rounds_f as u32,
+            &mut round_constants,
+            &mut internal_matrix_diag,
+            MdsType::Plonky,
+            DiffusionStrategy::Montgomery,
+        )
+        .unwrap();
+        check_poseidon_kats(WIDTH, &kats, &constants);
+    }
+}