Feat/roman/tree builder (#525)

# Updates:

## Hashing

 - Added SpongeHasher class
 - Can be used to accept any hash function as an argument
 - Absorb and squeeze are now separated
 - Memory management is now mostly handled by the SpongeHasher class; each hash
   function only describes its permutation kernels

## Tree builder

 - Tree builder is now hash-agnostic. 
 - Tree builder now supports 2D input (matrices)
 - Tree builder can now use two different hash functions: one for layer 0 and
   one for the compression layers

## Poseidon1

 - Interface changed to classes
 - Now allows for any alpha
 - Now allows passing constants as separate arrays instead of a single vector
 - Now allows for any domain tag
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon struct
 
 ## Poseidon2
 
 - Interface changed to classes
 - Constants are now released upon going out of scope
 - Rust wrappers changed to Poseidon2 struct
 
## Keccak

 - Added Keccak class which inherits SpongeHasher
 - No longer uses GPU registers to store states
 
 To do:
- [x] Update poseidon1 golang bindings
- [x] Update poseidon1 examples
- [x] Fix poseidon2 cuda test
- [x] Fix poseidon2 merkle tree builder test
- [x] Update keccak class with new design
- [x] Update keccak test
- [x] Check keccak correctness
- [x] Update tree builder rust wrappers
- [x] Leave doc comments

Future work:  
- [ ] Add keccak merkle tree builder externs
- [ ] Add keccak rust tree builder wrappers
- [ ] Write docs
- [ ] Add example
- [ ] Fix device output for tree builder

---------

Co-authored-by: Jeremy Felder <jeremy.felder1@gmail.com>
Co-authored-by: nonam3e <71525212+nonam3e@users.noreply.github.com>
This commit is contained in:
ChickenLover
2024-07-11 13:46:25 +07:00
committed by GitHub
parent 2d4059c61f
commit 7fd9ed1b49
125 changed files with 8002 additions and 4097 deletions

View File

@@ -6,6 +6,9 @@
#include "api/bn254.h"
#include "gpu-utils/error_handler.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
@@ -20,31 +23,20 @@ void checkCudaError(cudaError_t error)
// these global constants go into template calls
const int size_col = 11;
// this function executes the Poseidon thread
void threadPoseidon(
device_context::DeviceContext ctx,
unsigned size_partition,
scalar_t* layers,
scalar_t* column_hashes,
PoseidonConstants<scalar_t>* constants)
Poseidon<scalar_t> * poseidon)
{
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err =
bn254_poseidon_hash_cuda(layers, column_hashes, (size_t)size_partition, size_col, *constants, column_config);
SpongeConfig column_config = default_sponge_config(ctx);
cudaError_t err = poseidon->hash_many(layers, column_hashes, (size_t) size_partition, size_col, 1, column_config);
checkCudaError(err);
}
@@ -59,6 +51,11 @@ using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::p
exit(EXIT_FAILURE); \
}
// Terminate the program if `ptr` is nullptr: prints a diagnostic to std::cerr and
// calls exit(EXIT_FAILURE). The argument is stringized (#ptr) so the message names
// the pointer expression that was checked.
#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}
int main()
{
const unsigned size_row = (1 << 30);
@@ -116,19 +113,18 @@ int main()
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);
PoseidonConstants<scalar_t> column_constants0, column_constants1;
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
bn254_init_optimized_poseidon_constants_cuda(size_col, ctx1, &column_constants1);
Poseidon<scalar_t> column_poseidon0(size_col, ctx0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
Poseidon<scalar_t> column_poseidon1(size_col, ctx1);
std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_poseidon1);
// Wait for the threads to finish
thread0.join();
@@ -141,9 +137,9 @@ int main()
std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_poseidon0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_poseidon0);
thread3.join();
END_TIMER(sequential, "1 GPU");
std::cout << "Output Data from Thread 2: ";

View File

@@ -3,13 +3,11 @@
#include "polynomials/polynomials.h"
#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
#include "ntt/ntt.cuh"
#include "poseidon/tree/merkle.cuh"
#include "api/bn254.h"
#include <chrono>
// using namespace field_config;
using namespace polynomials;
using namespace merkle;
using namespace bn254;
// define the polynomial type

View File

@@ -4,6 +4,8 @@
#include "api/bn254.h"
#include "curves/params/bn254.cuh"
#include "poseidon/poseidon.cuh"
#include "hash/hash.cuh"
using namespace poseidon;
using namespace bn254;
@@ -14,13 +16,12 @@ inline uint32_t tree_index(uint32_t level, uint32_t offset) { return (1 << level
// We assume the tree has leaves already set, compute all other levels
void build_tree(
const uint32_t tree_height, scalar_t* tree, PoseidonConstants<scalar_t>* constants, PoseidonConfig config)
const uint32_t tree_height, scalar_t* tree, Poseidon<scalar_t> &poseidon, SpongeConfig &config)
{
for (uint32_t level = tree_height - 1; level > 0; level--) {
const uint32_t next_level = level - 1;
const uint32_t next_level_width = 1 << next_level;
bn254_poseidon_hash_cuda(
&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, *constants, config);
poseidon.hash_many(&tree[tree_index(level, 0)], &tree[tree_index(next_level, 0)], next_level_width, 2, 1, config);
}
}
@@ -65,8 +66,8 @@ uint32_t validate_proof(
const uint32_t tree_height,
const uint32_t* proof_lr,
const scalar_t* proof_hash,
PoseidonConstants<scalar_t>* constants,
PoseidonConfig config)
Poseidon<scalar_t> &poseidon,
SpongeConfig &config)
{
scalar_t hashes_in[2], hash_out[1], level_hash;
level_hash = hash;
@@ -79,7 +80,7 @@ uint32_t validate_proof(
hashes_in[1] = level_hash;
}
// next level hash
bn254_poseidon_hash_cuda(hashes_in, hash_out, 1, 2, *constants, config);
poseidon.hash_many(hashes_in, hash_out, 1, 2, 1, config);
level_hash = hash_out[0];
}
return proof_hash[0] == level_hash;
@@ -109,16 +110,15 @@ int main(int argc, char* argv[])
d = d + scalar_t::one();
}
std::cout << "Hashing blocks into tree leaves..." << std::endl;
PoseidonConstants<scalar_t> constants;
bn254_init_optimized_poseidon_constants_cuda(data_arity, ctx, &constants);
PoseidonConfig config = default_poseidon_config(data_arity + 1);
bn254_poseidon_hash_cuda(data, &tree[tree_index(leaf_level, 0)], tree_width, 4, constants, config);
Poseidon<scalar_t> poseidon(data_arity, ctx);
SpongeConfig config = default_sponge_config(ctx);
poseidon.hash_many(data, &tree[tree_index(leaf_level, 0)], tree_width, data_arity, 1, config);
std::cout << "3. Building Merkle tree" << std::endl;
PoseidonConstants<scalar_t> tree_constants;
bn254_init_optimized_poseidon_constants_cuda(tree_arity, ctx, &tree_constants);
PoseidonConfig tree_config = default_poseidon_config(tree_arity + 1);
build_tree(tree_height, tree, &tree_constants, tree_config);
Poseidon<scalar_t> tree_poseidon(tree_arity, ctx);
SpongeConfig tree_config = default_sponge_config(ctx);
build_tree(tree_height, tree, tree_poseidon, tree_config);
std::cout << "4. Generate membership proof" << std::endl;
uint32_t position = tree_width - 1;
@@ -133,13 +133,13 @@ int main(int argc, char* argv[])
std::cout << "5. Validate the hash membership" << std::endl;
uint32_t validated;
const scalar_t hash = tree[tree_index(leaf_level, query_position)];
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "Validated: " << validated << std::endl;
std::cout << "6. Tamper the hash" << std::endl;
const scalar_t tampered_hash = hash + scalar_t::one();
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, &tree_constants, tree_config);
validated = validate_proof(tampered_hash, tree_height, proof_lr, proof_hash, tree_poseidon, tree_config);
std::cout << "7. Invalidate tamper hash membership" << std::endl;
std::cout << "Validated: " << validated << std::endl;
return 0;

View File

@@ -2,7 +2,8 @@ use icicle_bls12_381::curve::ScalarField as F;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_core::poseidon::{load_optimized_poseidon_constants, poseidon_hash_many, PoseidonConfig};
use icicle_core::hash::{SpongeHash, SpongeConfig};
use icicle_core::poseidon::Poseidon;
use icicle_core::traits::FieldImpl;
use icicle_cuda_runtime::memory::HostSlice;
@@ -24,14 +25,14 @@ fn main() {
let test_size = 1 << size;
println!("Running Icicle Examples: Rust Poseidon Hash");
let arity = 2u32;
let arity = 2;
println!(
"---------------------- Loading optimized Poseidon constants for arity={} ------------------------",
arity
);
let ctx = DeviceContext::default();
let constants = load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap();
let config = PoseidonConfig::default();
let poseidon = Poseidon::load(arity, &ctx).unwrap();
let config = SpongeConfig::default();
println!(
"---------------------- Input size 2^{}={} ------------------------",
@@ -45,12 +46,12 @@ fn main() {
println!("Executing BLS12-381 Poseidon Hash on device...");
#[cfg(feature = "profile")]
let start = Instant::now();
poseidon_hash_many::<F>(
poseidon.hash_many(
input_slice,
output_slice,
test_size as u32,
arity as u32,
&constants,
test_size,
arity,
1,
&config,
)
.unwrap();

View File

@@ -9,58 +9,67 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/babybear.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" cudaError_t babybear_extension_ntt_cuda(
const babybear::extension_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::extension_t* output);
extern "C" cudaError_t babybear_initialize_domain(
babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t babybear_poseidon2_create_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const babybear::scalar_t* round_constants,
const babybear::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_ntt_cuda(
const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::scalar_t* output);
extern "C" cudaError_t babybear_poseidon2_load_cuda(
poseidon2::Poseidon2<babybear::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<babybear::scalar_t>* poseidon,
const babybear::scalar_t* inputs,
babybear::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
extern "C" cudaError_t
babybear_poseidon2_delete_cuda(poseidon2::Poseidon2<babybear::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_scalar_convert_montgomery(
babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_build_merkle_tree(
const babybear::scalar_t* leaves,
babybear::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_extension_mul_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_add_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_accumulate_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_extension_sub_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
const babybear::extension_t* input,
uint32_t row_size,
uint32_t column_size,
babybear::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);
extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size);
extern "C" cudaError_t babybear_extension_scalar_convert_montgomery(
babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_mmcs_commit_cuda(
const matrix::Matrix<babybear::scalar_t>* leaves,
unsigned int number_of_inputs,
babybear::scalar_t* digests,
const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* hasher,
const hash::SpongeHasher<babybear::scalar_t, babybear::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t babybear_mul_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);
@@ -87,35 +96,47 @@ extern "C" cudaError_t babybear_bit_reverse_cuda(
const babybear::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::scalar_t* output);
extern "C" cudaError_t babybear_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const babybear::scalar_t* round_constants,
const babybear::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size);
extern "C" cudaError_t babybear_scalar_convert_montgomery(
babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_initialize_domain(
babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t babybear_ntt_cuda(
const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<babybear::scalar_t>& config, babybear::scalar_t* output);
extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx);
extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size);
extern "C" cudaError_t babybear_extension_scalar_convert_montgomery(
babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t babybear_extension_mul_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_add_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_accumulate_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t babybear_extension_sub_cuda(
babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result);
extern "C" cudaError_t babybear_extension_transpose_matrix_cuda(
const babybear::extension_t* input,
uint32_t row_size,
uint32_t column_size,
babybear::extension_t* output,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
bool on_device,
bool is_async);
extern "C" cudaError_t babybear_init_poseidon2_constants_cuda(
int width,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<babybear::scalar_t>* poseidon_constants);
extern "C" cudaError_t babybear_extension_bit_reverse_cuda(
const babybear::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, babybear::extension_t* output);
extern "C" cudaError_t babybear_poseidon2_hash_cuda(
const babybear::scalar_t* input,
babybear::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<babybear::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
extern "C" cudaError_t babybear_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<babybear::scalar_t>* constants,
device_context::DeviceContext& ctx);
#endif

View File

@@ -9,26 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_377.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2);
extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out);
extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size);
extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size);
extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery(
bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery(
bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_g2_precompute_msm_bases_cuda(
bls12_377::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bls12_377_precompute_msm_bases_cuda(
extern "C" cudaError_t bls12_377_msm_cuda(
const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_377::projective_t* out);
extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2);
extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out);
extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size);
extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size);
extern "C" cudaError_t bls12_377_g2_affine_convert_montgomery(
bls12_377::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_g2_projective_convert_montgomery(
bls12_377::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_ecntt_cuda(
const bls12_377::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::projective_t* output);
@@ -65,18 +66,52 @@ extern "C" cudaError_t bls12_377_affine_convert_montgomery(
extern "C" cudaError_t bls12_377_projective_convert_montgomery(
bls12_377::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_initialize_domain(
bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bls12_377_build_merkle_tree(
const bls12_377::scalar_t* leaves,
bls12_377::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_ntt_cuda(
const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::scalar_t* output);
extern "C" cudaError_t bls12_377_mmcs_commit_cuda(
const matrix::Matrix<bls12_377::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_377::scalar_t* digests,
const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* hasher,
const hash::SpongeHasher<bls12_377::scalar_t, bls12_377::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_create_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_377::scalar_t* round_constants,
const bls12_377::scalar_t* mds_matrix,
const bls12_377::scalar_t* non_sparse_matrix,
const bls12_377::scalar_t* sparse_matrices,
const bls12_377::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_377_poseidon_load_cuda(
poseidon::Poseidon<bls12_377::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_377::scalar_t>* poseidon,
const bls12_377::scalar_t* inputs,
bls12_377::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t
bls12_377_poseidon_delete_cuda(poseidon::Poseidon<bls12_377::scalar_t>* poseidon);
extern "C" cudaError_t bls12_377_mul_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bls12_377_bit_reverse_cuda(
const bls12_377::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_377::scalar_t* output);
extern "C" cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_377::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_377::scalar_t>* poseidon_constants);
extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_377::scalar_t>* constants);
extern "C" cudaError_t bls12_377_scalar_convert_montgomery(
bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_377_poseidon_hash_cuda(
bls12_377::scalar_t* input,
bls12_377::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_377_initialize_domain(
bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree(
const bls12_377::scalar_t* leaves,
bls12_377::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_377::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t bls12_377_ntt_cuda(
const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_377::scalar_t>& config, bls12_377::scalar_t* output);
extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx);
#endif

View File

@@ -9,26 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bls12_381.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2);
extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out);
extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size);
extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size);
extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery(
bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery(
bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_g2_precompute_msm_bases_cuda(
bls12_381::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bls12_381_precompute_msm_bases_cuda(
extern "C" cudaError_t bls12_381_msm_cuda(
const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, msm::MSMConfig& config, bls12_381::projective_t* out);
extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2);
extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out);
extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size);
extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size);
extern "C" cudaError_t bls12_381_g2_affine_convert_montgomery(
bls12_381::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_g2_projective_convert_montgomery(
bls12_381::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_ecntt_cuda(
const bls12_381::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::projective_t* output);
@@ -65,18 +66,52 @@ extern "C" cudaError_t bls12_381_affine_convert_montgomery(
extern "C" cudaError_t bls12_381_projective_convert_montgomery(
bls12_381::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_initialize_domain(
bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bls12_381_build_merkle_tree(
const bls12_381::scalar_t* leaves,
bls12_381::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_ntt_cuda(
const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::scalar_t* output);
extern "C" cudaError_t bls12_381_mmcs_commit_cuda(
const matrix::Matrix<bls12_381::scalar_t>* leaves,
unsigned int number_of_inputs,
bls12_381::scalar_t* digests,
const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* hasher,
const hash::SpongeHasher<bls12_381::scalar_t, bls12_381::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_create_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bls12_381::scalar_t* round_constants,
const bls12_381::scalar_t* mds_matrix,
const bls12_381::scalar_t* non_sparse_matrix,
const bls12_381::scalar_t* sparse_matrices,
const bls12_381::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_381_poseidon_load_cuda(
poseidon::Poseidon<bls12_381::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_hash_many_cuda(
const poseidon::Poseidon<bls12_381::scalar_t>* poseidon,
const bls12_381::scalar_t* inputs,
bls12_381::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t
bls12_381_poseidon_delete_cuda(poseidon::Poseidon<bls12_381::scalar_t>* poseidon);
extern "C" cudaError_t bls12_381_mul_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bls12_381_bit_reverse_cuda(
const bls12_381::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bls12_381::scalar_t* output);
extern "C" cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bls12_381::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bls12_381::scalar_t>* poseidon_constants);
extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size);
extern "C" cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bls12_381::scalar_t>* constants);
extern "C" cudaError_t bls12_381_scalar_convert_montgomery(
bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bls12_381_poseidon_hash_cuda(
bls12_381::scalar_t* input,
bls12_381::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bls12_381_initialize_domain(
bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree(
const bls12_381::scalar_t* leaves,
bls12_381::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bls12_381::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t bls12_381_ntt_cuda(
const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bls12_381::scalar_t>& config, bls12_381::scalar_t* output);
extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx);
#endif

View File

@@ -9,28 +9,15 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bn254.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2);
extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out);
extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size);
extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size);
extern "C" cudaError_t bn254_g2_affine_convert_montgomery(
bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_g2_projective_convert_montgomery(
bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_g2_precompute_msm_bases_cuda(
bn254::g2_affine_t* bases,
int msm_size,
@@ -49,6 +36,20 @@ extern "C" cudaError_t bn254_precompute_msm_bases_cuda(
extern "C" cudaError_t bn254_msm_cuda(
const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, msm::MSMConfig& config, bn254::projective_t* out);
extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2);
extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out);
extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size);
extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size);
extern "C" cudaError_t bn254_g2_affine_convert_montgomery(
bn254::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_g2_projective_convert_montgomery(
bn254::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_ecntt_cuda(
const bn254::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::projective_t* output);
@@ -66,18 +67,87 @@ extern "C" cudaError_t bn254_affine_convert_montgomery(
extern "C" cudaError_t bn254_projective_convert_montgomery(
bn254::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_initialize_domain(
bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bn254_poseidon2_create_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_ntt_cuda(
const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::scalar_t* output);
extern "C" cudaError_t bn254_poseidon2_load_cuda(
poseidon2::Poseidon2<bn254::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx
);
extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
extern "C" cudaError_t
bn254_poseidon2_delete_cuda(poseidon2::Poseidon2<bn254::scalar_t>* poseidon, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_scalar_convert_montgomery(
bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_build_merkle_tree(
const bn254::scalar_t* leaves,
bn254::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_mmcs_commit_cuda(
const matrix::Matrix<bn254::scalar_t>* leaves,
unsigned int number_of_inputs,
bn254::scalar_t* digests,
const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* hasher,
const hash::SpongeHasher<bn254::scalar_t, bn254::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bn254_poseidon_create_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* mds_matrix,
const bn254::scalar_t* non_sparse_matrix,
const bn254::scalar_t* sparse_matrices,
const bn254::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_load_cuda(
poseidon::Poseidon<bn254::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon_hash_many_cuda(
const poseidon::Poseidon<bn254::scalar_t>* poseidon,
const bn254::scalar_t* inputs,
bn254::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t
bn254_poseidon_delete_cuda(poseidon::Poseidon<bn254::scalar_t>* poseidon);
extern "C" cudaError_t bn254_mul_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);
@@ -104,62 +174,17 @@ extern "C" cudaError_t bn254_bit_reverse_cuda(
const bn254::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bn254::scalar_t* output);
extern "C" cudaError_t bn254_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const bn254::scalar_t* round_constants,
const bn254::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size);
extern "C" cudaError_t bn254_init_poseidon2_constants_cuda(
int width,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_scalar_convert_montgomery(
bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_poseidon2_hash_cuda(
const bn254::scalar_t* input,
bn254::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<bn254::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
extern "C" cudaError_t bn254_initialize_domain(
bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bn254_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<bn254::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bn254_ntt_cuda(
const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bn254::scalar_t>& config, bn254::scalar_t* output);
extern "C" cudaError_t bn254_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bn254::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bn254::scalar_t>* poseidon_constants);
extern "C" cudaError_t bn254_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bn254::scalar_t>* constants);
extern "C" cudaError_t bn254_poseidon_hash_cuda(
bn254::scalar_t* input,
bn254::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bn254::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bn254_build_poseidon_merkle_tree(
const bn254::scalar_t* leaves,
bn254::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bn254::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx);
#endif

View File

@@ -9,26 +9,13 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/bw6_761.cuh"
#include "ntt/ntt.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2);
extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out);
extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size);
extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size);
extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery(
bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery(
bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_g2_precompute_msm_bases_cuda(
bw6_761::g2_affine_t* bases,
@@ -48,6 +35,20 @@ extern "C" cudaError_t bw6_761_precompute_msm_bases_cuda(
extern "C" cudaError_t bw6_761_msm_cuda(
const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, msm::MSMConfig& config, bw6_761::projective_t* out);
extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2);
extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out);
extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size);
extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size);
extern "C" cudaError_t bw6_761_g2_affine_convert_montgomery(
bw6_761::g2_affine_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_g2_projective_convert_montgomery(
bw6_761::g2_projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_ecntt_cuda(
const bw6_761::projective_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::projective_t* output);
@@ -65,18 +66,52 @@ extern "C" cudaError_t bw6_761_affine_convert_montgomery(
extern "C" cudaError_t bw6_761_projective_convert_montgomery(
bw6_761::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_initialize_domain(
bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bw6_761_build_merkle_tree(
const bw6_761::scalar_t* leaves,
bw6_761::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_ntt_cuda(
const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::scalar_t* output);
extern "C" cudaError_t bw6_761_mmcs_commit_cuda(
const matrix::Matrix<bw6_761::scalar_t>* leaves,
unsigned int number_of_inputs,
bw6_761::scalar_t* digests,
const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* hasher,
const hash::SpongeHasher<bw6_761::scalar_t, bw6_761::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_create_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const bw6_761::scalar_t* round_constants,
const bw6_761::scalar_t* mds_matrix,
const bw6_761::scalar_t* non_sparse_matrix,
const bw6_761::scalar_t* sparse_matrices,
const bw6_761::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
extern "C" cudaError_t bw6_761_poseidon_load_cuda(
poseidon::Poseidon<bw6_761::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_hash_many_cuda(
const poseidon::Poseidon<bw6_761::scalar_t>* poseidon,
const bw6_761::scalar_t* inputs,
bw6_761::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t
bw6_761_poseidon_delete_cuda(poseidon::Poseidon<bw6_761::scalar_t>* poseidon);
extern "C" cudaError_t bw6_761_mul_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);
@@ -103,31 +138,17 @@ extern "C" cudaError_t bw6_761_bit_reverse_cuda(
const bw6_761::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, bw6_761::scalar_t* output);
extern "C" cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const bw6_761::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<bw6_761::scalar_t>* poseidon_constants);
extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size);
extern "C" cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<bw6_761::scalar_t>* constants);
extern "C" cudaError_t bw6_761_scalar_convert_montgomery(
bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t bw6_761_poseidon_hash_cuda(
bw6_761::scalar_t* input,
bw6_761::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t bw6_761_initialize_domain(
bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree(
const bw6_761::scalar_t* leaves,
bw6_761::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<bw6_761::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t bw6_761_ntt_cuda(
const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<bw6_761::scalar_t>& config, bw6_761::scalar_t* output);
extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx);
#endif

View File

@@ -9,11 +9,12 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "curves/params/grumpkin.cuh"
#include "msm/msm.cuh"
#include "vec_ops/vec_ops.cuh"
#include "poseidon/poseidon.cuh"
#include "poseidon/tree/merkle.cuh"
extern "C" cudaError_t grumpkin_precompute_msm_bases_cuda(
grumpkin::affine_t* bases,
@@ -38,10 +39,52 @@ extern "C" cudaError_t grumpkin_affine_convert_montgomery(
extern "C" cudaError_t grumpkin_projective_convert_montgomery(
grumpkin::projective_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
extern "C" cudaError_t grumpkin_build_merkle_tree(
const grumpkin::scalar_t* leaves,
grumpkin::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_mmcs_commit_cuda(
const matrix::Matrix<grumpkin::scalar_t>* leaves,
unsigned int number_of_inputs,
grumpkin::scalar_t* digests,
const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* hasher,
const hash::SpongeHasher<grumpkin::scalar_t, grumpkin::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t grumpkin_poseidon_create_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const grumpkin::scalar_t* round_constants,
const grumpkin::scalar_t* mds_matrix,
const grumpkin::scalar_t* non_sparse_matrix,
const grumpkin::scalar_t* sparse_matrices,
const grumpkin::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_load_cuda(
poseidon::Poseidon<grumpkin::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t grumpkin_poseidon_hash_many_cuda(
const poseidon::Poseidon<grumpkin::scalar_t>* poseidon,
const grumpkin::scalar_t* inputs,
grumpkin::scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t
grumpkin_poseidon_delete_cuda(poseidon::Poseidon<grumpkin::scalar_t>* poseidon);
extern "C" cudaError_t grumpkin_mul_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);
@@ -68,31 +111,9 @@ extern "C" cudaError_t grumpkin_bit_reverse_cuda(
const grumpkin::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, grumpkin::scalar_t* output);
extern "C" cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const grumpkin::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<grumpkin::scalar_t>* poseidon_constants);
extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size);
extern "C" cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<grumpkin::scalar_t>* constants);
extern "C" cudaError_t grumpkin_poseidon_hash_cuda(
grumpkin::scalar_t* input,
grumpkin::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
poseidon::PoseidonConfig& config);
extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree(
const grumpkin::scalar_t* leaves,
grumpkin::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<grumpkin::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t grumpkin_scalar_convert_montgomery(
grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
#endif

View File

@@ -9,43 +9,27 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/m31.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
extern "C" cudaError_t m31_build_merkle_tree(
const m31::scalar_t* leaves,
m31::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_scalar_convert_montgomery(
m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t m31_extension_mul_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_add_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_accumulate_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_extension_sub_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
const m31::extension_t* input,
uint32_t row_size,
uint32_t column_size,
m31::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_extension_bit_reverse_cuda(
const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t m31_mmcs_commit_cuda(
const matrix::Matrix<m31::scalar_t>* leaves,
unsigned int number_of_inputs,
m31::scalar_t* digests,
const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* hasher,
const hash::SpongeHasher<m31::scalar_t, m31::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t m31_mul_cuda(
m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result);
@@ -72,4 +56,39 @@ extern "C" cudaError_t m31_bit_reverse_cuda(
const m31::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::scalar_t* output);
extern "C" void m31_generate_scalars(m31::scalar_t* scalars, int size);
extern "C" cudaError_t m31_scalar_convert_montgomery(
m31::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" void m31_extension_generate_scalars(m31::extension_t* scalars, int size);
extern "C" cudaError_t m31_extension_scalar_convert_montgomery(
m31::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t m31_extension_mul_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_add_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_accumulate_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);
extern "C" cudaError_t m31_extension_sub_cuda(
m31::extension_t* vec_a, m31::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::extension_t* result);
extern "C" cudaError_t m31_extension_transpose_matrix_cuda(
const m31::extension_t* input,
uint32_t row_size,
uint32_t column_size,
m31::extension_t* output,
device_context::DeviceContext& ctx,
bool on_device,
bool is_async);
extern "C" cudaError_t m31_extension_bit_reverse_cuda(
const m31::extension_t* input, uint64_t n, vec_ops::BitReverseConfig& config, m31::extension_t* output);
#endif

View File

@@ -9,22 +9,28 @@
#include <cuda_runtime.h>
#include "gpu-utils/device_context.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "fields/stark_fields/stark252.cuh"
#include "ntt/ntt.cuh"
#include "vec_ops/vec_ops.cuh"
extern "C" cudaError_t stark252_initialize_domain(
stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t stark252_build_merkle_tree(
const stark252::scalar_t* leaves,
stark252::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_ntt_cuda(
const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
extern "C" cudaError_t stark252_scalar_convert_montgomery(
stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t stark252_mmcs_commit_cuda(
const matrix::Matrix<stark252::scalar_t>* leaves,
unsigned int number_of_inputs,
stark252::scalar_t* digests,
const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* hasher,
const hash::SpongeHasher<stark252::scalar_t, stark252::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t stark252_mul_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);
@@ -51,4 +57,17 @@ extern "C" cudaError_t stark252_bit_reverse_cuda(
const stark252::scalar_t* input, uint64_t n, vec_ops::BitReverseConfig& config, stark252::scalar_t* output);
extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size);
extern "C" cudaError_t stark252_scalar_convert_montgomery(
stark252::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx);
extern "C" cudaError_t stark252_initialize_domain(
stark252::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode);
extern "C" cudaError_t stark252_ntt_cuda(
const stark252::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig<stark252::scalar_t>& config, stark252::scalar_t* output);
extern "C" cudaError_t stark252_release_domain(device_context::DeviceContext& ctx);
#endif

View File

@@ -1,26 +1,29 @@
extern "C" cudaError_t ${FIELD}_create_optimized_poseidon_constants_cuda(
int arity,
int full_rounds_half,
int partial_rounds,
const ${FIELD}::scalar_t* constants,
device_context::DeviceContext& ctx,
poseidon::PoseidonConstants<${FIELD}::scalar_t>* poseidon_constants);
extern "C" cudaError_t ${FIELD}_poseidon_create_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* mds_matrix,
const ${FIELD}::scalar_t* non_sparse_matrix,
const ${FIELD}::scalar_t* sparse_matrices,
const ${FIELD}::scalar_t domain_tag,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_init_optimized_poseidon_constants_cuda(
int arity, device_context::DeviceContext& ctx, poseidon::PoseidonConstants<${FIELD}::scalar_t>* constants);
extern "C" cudaError_t ${FIELD}_poseidon_load_cuda(
poseidon::Poseidon<${FIELD}::scalar_t>** poseidon,
unsigned int arity,
device_context::DeviceContext& ctx);
extern "C" cudaError_t ${FIELD}_poseidon_hash_cuda(
${FIELD}::scalar_t* input,
extern "C" cudaError_t ${FIELD}_poseidon_hash_many_cuda(
const poseidon::Poseidon<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
int number_of_states,
int arity,
const poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
poseidon::PoseidonConfig& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t ${FIELD}_build_poseidon_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
uint32_t height,
int arity,
poseidon::PoseidonConstants<${FIELD}::scalar_t>& constants,
merkle::TreeBuilderConfig& config);
extern "C" cudaError_t
${FIELD}_poseidon_delete_cuda(poseidon::Poseidon<${FIELD}::scalar_t>* poseidon);

View File

@@ -1,30 +1,34 @@
extern "C" cudaError_t ${FIELD}_create_poseidon2_constants_cuda(
int width,
int alpha,
int internal_rounds,
int external_rounds,
extern "C" cudaError_t ${FIELD}_poseidon2_create_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const ${FIELD}::scalar_t* round_constants,
const ${FIELD}::scalar_t* internal_matrix_diag,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_init_poseidon2_constants_cuda(
int width,
extern "C" cudaError_t ${FIELD}_poseidon2_load_cuda(
poseidon2::Poseidon2<${FIELD}::scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
poseidon2::MdsType mds_type,
poseidon2::DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* poseidon_constants);
device_context::DeviceContext& ctx
);
extern "C" cudaError_t ${FIELD}_poseidon2_hash_cuda(
const ${FIELD}::scalar_t* input,
extern "C" cudaError_t ${FIELD}_poseidon2_hash_many_cuda(
const poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon,
const ${FIELD}::scalar_t* inputs,
${FIELD}::scalar_t* output,
int number_of_states,
int width,
const poseidon2::Poseidon2Constants<${FIELD}::scalar_t>& constants,
poseidon2::Poseidon2Config& config);
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg);
extern "C" cudaError_t ${FIELD}_release_poseidon2_constants_cuda(
poseidon2::Poseidon2Constants<${FIELD}::scalar_t>* constants,
device_context::DeviceContext& ctx);
extern "C" cudaError_t
${FIELD}_poseidon2_delete_cuda(poseidon2::Poseidon2<${FIELD}::scalar_t>* poseidon, device_context::DeviceContext& ctx);

View File

@@ -0,0 +1,16 @@
extern "C" cudaError_t ${FIELD}_build_merkle_tree(
const ${FIELD}::scalar_t* leaves,
${FIELD}::scalar_t* digests,
unsigned int height,
unsigned int input_block_len,
const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* bottom_layer,
const merkle_tree::TreeBuilderConfig& tree_config);
extern "C" cudaError_t ${FIELD}_mmcs_commit_cuda(
const matrix::Matrix<${FIELD}::scalar_t>* leaves,
unsigned int number_of_inputs,
${FIELD}::scalar_t* digests,
const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* hasher,
const hash::SpongeHasher<${FIELD}::scalar_t, ${FIELD}::scalar_t>* compression,
const merkle_tree::TreeBuilderConfig& tree_config);

View File

@@ -796,6 +796,14 @@ public:
return r;
}
/**
 * Copy-assignment: overwrites this element's limbs with those of `other`.
 * The copy is done one limb at a time over all TLC limbs.
 * @param other Field element to copy from.
 * @return Reference to this element, to allow chained assignment.
 */
HOST_DEVICE_INLINE Field& operator=(Field const& other)
{
  auto& dst_limbs = this->limbs_storage.limbs;
  const auto& src_limbs = other.limbs_storage.limbs;
  for (int limb = 0; limb < TLC; ++limb) {
    dst_limbs[limb] = src_limbs[limb];
  }
  return *this;
}
friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys)
{
Wide xy = mul_wide(xs, ys); // full mult

View File

@@ -14,7 +14,7 @@ namespace m31 {
// Construct from raw limb storage; forwards to the Field<CONFIG> base.
HOST_DEVICE_INLINE MersenneField(storage<CONFIG::limbs_count> x) : Field<CONFIG>{x} {}
// Construct from a generic Field<CONFIG> value; forwards to the base copy constructor.
HOST_DEVICE_INLINE MersenneField(const Field<CONFIG>& other) : Field<CONFIG>(other) {}
// NOTE(review): two definitions of zero() appear back to back below — this looks like the old and the
// new line of a diff hunk; confirm that only the variant taking the whole CONFIG::zero storage is kept.
static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero.limbs[0]); }
static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero); }
// Builds the value from the first limb of CONFIG::one.
static constexpr HOST_DEVICE_INLINE MersenneField one() { return MersenneField(CONFIG::one.limbs[0]); }

View File

@@ -3,6 +3,7 @@
#define DEVICE_CONTEXT_H
#include <cuda_runtime.h>
#include "gpu-utils/error_handler.cuh"
namespace device_context {
@@ -30,6 +31,28 @@ namespace device_context {
};
}
} // namespace device_context
// checking whether a pointer is on host or device and asserts device matches provided device
static bool is_host_ptr(const void* p, int device_id = 0)
{
cudaPointerAttributes attributes;
CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
const bool is_on_host = attributes.type == cudaMemoryTypeHost ||
attributes.type == cudaMemoryTypeUnregistered; // unregistered is host memory
const bool is_on_cur_device = !is_on_host && attributes.device == device_id;
const bool is_valid_ptr = is_on_host || is_on_cur_device;
if (!is_valid_ptr) { THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid ptr"); }
return is_on_host;
}
static int get_cuda_device(const void* p)
{
cudaPointerAttributes attributes;
CHK_STICKY(cudaPointerGetAttributes(&attributes, p));
const bool is_on_host = attributes.type == cudaMemoryTypeHost ||
attributes.type == cudaMemoryTypeUnregistered; // unregistered is host memory
return is_on_host ? -1 : attributes.device;
}
} // namespace device_context
#endif

View File

@@ -0,0 +1,176 @@
#pragma once
#ifndef HASH_H
#define HASH_H
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "matrix/matrix.cuh"
#include <cassert>
using matrix::Matrix;
/**
* @namespace hash
* Includes classes and methods for describing hash functions.
*/
namespace hash {
/**
 * @struct SpongeConfig
 * Parameters shared by the sponge hash operations.
 */
struct SpongeConfig {
  /// Details related to the device such as its id and stream id.
  device_context::DeviceContext ctx;

  /// True if inputs are on device and false if they're on host. Default value: false.
  bool are_inputs_on_device;

  /// True if outputs are on device and false if they're on host. Default value: false.
  bool are_outputs_on_device;

  /// Whether to run the hash operations asynchronously. If set to `true`, the functions will be
  /// non-blocking and you'd need to synchronize explicitly by running `cudaStreamSynchronize` or
  /// `cudaDeviceSynchronize`. If set to `false`, functions will block the current CPU thread.
  bool is_async;
};
/**
 * Produces the default [SpongeConfig](@ref SpongeConfig) used with the [SpongeHasher](@ref SpongeHasher) class:
 * host inputs, host outputs, blocking execution.
 * @param ctx device context to store in the config; the default device context when omitted
 * @return Default value of [SpongeConfig](@ref SpongeConfig).
 */
static SpongeConfig
default_sponge_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
  return SpongeConfig{
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // is_async
  };
}
/**
* @class SpongeHasher
*
* Can be inherited by a cryptographic permutation function to create a
* [sponge](https://en.wikipedia.org/wiki/Sponge_function) construction out of it.
*
* @tparam PreImage type of inputs elements
* @tparam Image type of state elements. Also used to describe the type of hash output
*/
template <typename PreImage, typename Image>
class SpongeHasher
{
public:
  /// @brief the width of permutation state
  const unsigned int width;

  /// @brief how many elements a state can fit per 1 permutation. Used with domain separation.
  const unsigned int preimage_max_length;

  /// @brief portion of the state to absorb input into, or squeeze output from
  const unsigned int rate;

  /// @brief start squeezing from this offset. Used with domain separation.
  const unsigned int offset;

  /// @brief Stores the sponge geometry. Asserts that `rate` input elements fit (in bytes) into
  /// `preimage_max_length` state elements.
  SpongeHasher(unsigned int width, unsigned int preimage_max_length, unsigned int rate, unsigned int offset)
      : width(width), preimage_max_length(preimage_max_length), rate(rate), offset(offset)
  {
    assert(
      rate * sizeof(PreImage) <= preimage_max_length * sizeof(Image) &&
      "Input rate can not be bigger than preimage max length");
  }

  /// @brief 2D (matrix) hashing hook. The base implementation only reports that the operation
  /// is not implemented; hashes that support matrix inputs override it.
  virtual cudaError_t hash_2d(
    const Matrix<PreImage>* inputs,
    Image* states,
    unsigned int number_of_inputs,
    unsigned int output_len,
    uint64_t number_of_rows,
    const device_context::DeviceContext& ctx) const
  {
    THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Absorb 2d is not implemented for this hash");
    return cudaError_t::cudaSuccess;
  }

  /// @brief Compress-and-inject hook. The base implementation only reports that the operation
  /// is not implemented; hashes that support it override it.
  virtual cudaError_t compress_and_inject(
    const Matrix<PreImage>* matrices_to_inject,
    unsigned int number_of_inputs,
    uint64_t number_of_rows,
    const Image* prev_layer,
    Image* next_layer,
    unsigned int digest_elements,
    const device_context::DeviceContext& ctx) const
  {
    THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Compress and inject is not implemented for this hash");
    return cudaError_t::cudaSuccess;
  }

  /// @brief Permute aligned input and do squeeze
  /// Forwards to hash_many with `width` elements of input per state; the input pointer is
  /// reinterpreted as PreImage.
  /// @param input pointer to input allocated on-device
  /// @param out pointer to output allocated on-device
  cudaError_t compress_many(
    const Image* input,
    Image* out,
    unsigned int number_of_states,
    unsigned int output_len,
    const SpongeConfig& cfg) const
  {
    return hash_many((const PreImage*)input, out, number_of_states, width, output_len, cfg);
  }

  /// @brief Kernel launch hook invoked by hash_many with device pointers. The base implementation
  /// only reports that the operation is not implemented; every concrete hash overrides it.
  virtual cudaError_t run_hash_many_kernel(
    const PreImage* input,
    Image* output,
    unsigned int number_of_states,
    unsigned int input_len,
    unsigned int output_len,
    const device_context::DeviceContext& ctx) const
  {
    THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Hash many kernel is not implemented for this hash");
    return cudaError_t::cudaSuccess;
  }

  /// @brief Hashes `number_of_states` independent inputs.
  /// Host inputs are staged into a temporary device buffer, host outputs are produced in a
  /// temporary device buffer and copied back; all transfers and frees are enqueued on `cfg.ctx.stream`.
  /// @param input `number_of_states * input_len` elements, on host or device according to `cfg.are_inputs_on_device`
  /// @param output `number_of_states * output_len` elements, on host or device according to `cfg.are_outputs_on_device`
  /// @param number_of_states number of inputs to hash
  /// @param input_len number of input elements per state
  /// @param output_len number of output elements per state
  /// @param cfg placement of the buffers, the stream to use and whether to synchronize before returning
  /// @return the first CUDA error encountered, or the last error state of the runtime
  cudaError_t hash_many(
    const PreImage* input,
    Image* output,
    unsigned int number_of_states,
    unsigned int input_len,
    unsigned int output_len,
    const SpongeConfig& cfg) const
  {
    // Widen to size_t BEFORE multiplying: the product of two `unsigned int`s is computed in 32 bits
    // and silently wraps for batches of 2^32 elements or more, which would under-allocate the buffers.
    const size_t input_bytes = static_cast<size_t>(number_of_states) * input_len * sizeof(PreImage);
    const size_t output_bytes = static_cast<size_t>(number_of_states) * output_len * sizeof(Image);

    const PreImage* d_input = input;
    PreImage* d_alloc_input = nullptr;
    Image* d_output = output;

    if (!cfg.are_inputs_on_device) {
      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_input, input_bytes, cfg.ctx.stream));
      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_input, input, input_bytes, cudaMemcpyHostToDevice, cfg.ctx.stream));
      d_input = d_alloc_input;
    }

    if (!cfg.are_outputs_on_device) { CHK_IF_RETURN(cudaMallocAsync(&d_output, output_bytes, cfg.ctx.stream)); }

    CHK_IF_RETURN(run_hash_many_kernel(d_input, d_output, number_of_states, input_len, output_len, cfg.ctx));

    if (!cfg.are_inputs_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_input, cfg.ctx.stream)); }

    if (!cfg.are_outputs_on_device) {
      CHK_IF_RETURN(cudaMemcpyAsync(output, d_output, output_bytes, cudaMemcpyDeviceToHost, cfg.ctx.stream));
      CHK_IF_RETURN(cudaFreeAsync(d_output, cfg.ctx.stream));
    }

    if (!cfg.is_async) { CHK_IF_RETURN(cudaStreamSynchronize(cfg.ctx.stream)); }
    return CHK_LAST();
  }
};
} // namespace hash
#endif

View File

@@ -6,6 +6,10 @@
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "hash/hash.cuh"
using namespace hash;
namespace keccak {
/**
* @struct KeccakConfig
@@ -32,25 +36,6 @@ namespace keccak {
};
return config;
}
/**
 * Compute the keccak hash over a sequence of preimages.
 * Takes `number_of_blocks` input blocks and computes `number_of_blocks` outputs, each of size `D / 64` u64.
 * NOTE(review): the original text counted the input in u64s while `input_block_size` is documented in
 * bytes and both buffers are `uint8_t*` — confirm the intended unit.
 * @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
 * @tparam D - number of bits of output. Only multiples of 64 are supported.
 * @param input a pointer to the input data. May be allocated on device or on host, regulated
 * by the config. Must be of size [input_block_size](@ref input_block_size) * [number_of_blocks](@ref
 * number_of_blocks).
 * @param input_block_size - size of each input block in bytes. Should be divisible by 8.
 * @param number_of_blocks number of input and output blocks. One GPU thread processes one block
 * @param output a pointer to the output data. May be allocated on device or on host, regulated
 * by the config. Must be of size [output_block_size](@ref output_block_size) * [number_of_blocks](@ref
 * number_of_blocks).
 * @param config the keccak configuration
 */
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config);
} // namespace keccak
#endif

View File

@@ -0,0 +1,14 @@
#pragma once
#ifndef MATRIX_H
#define MATRIX_H
namespace matrix {
/**
 * @struct Matrix
 * Non-owning description of a 2D input: a pointer to the elements plus the two dimensions.
 * NOTE(review): element layout (row-major vs column-major) is not visible here — confirm with the kernels.
 * @tparam T element type
 */
template <typename T>
struct Matrix {
  /// Pointer to the matrix elements.
  T* values;

  /// First dimension of the matrix (presumably the number of columns — TODO confirm).
  size_t width;

  /// Second dimension of the matrix (presumably the number of rows — TODO confirm).
  size_t height;
};
} // namespace matrix
#endif

View File

@@ -0,0 +1,128 @@
#pragma once
#ifndef MERKLE_H
#define MERKLE_H
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "hash/hash.cuh"
#include "matrix/matrix.cuh"
#include <vector>
#include <numeric>
#include <iostream>
#include <math.h>
using namespace hash;
using matrix::Matrix;
/**
* @namespace merkle_tree
* Implementation of the [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree) builder,
* parallelized for the use on GPU
*/
namespace merkle_tree {
/// 2^30, i.e. one gibi-unit
static constexpr size_t GIGA = size_t{1} << 30;

/// Bytes per stream
static constexpr uint64_t STREAM_CHUNK_SIZE = GIGA;
/// Computes how many digest elements the whole flattened tree occupies:
/// `digest_elements * arity^i` summed over the rows `i = 0..height` (both ends included).
/// @param height the height of the tree
/// @param arity branching factor; every next row is `arity` times longer
/// @param digest_elements number of elements in the first (shortest) row
/// @return total number of elements over all `height + 1` rows
static size_t get_digests_len(uint32_t height, uint32_t arity, uint32_t digest_elements)
{
  size_t total = 0;
  size_t current_row = digest_elements;
  uint32_t row = 0;
  while (row <= height) {
    total += current_row;
    current_row *= arity;
    ++row;
  }
  return total;
}
/// Exchanges the two pointers that `r` and `s` refer to.
/// @param r address of the first pointer
/// @param s address of the second pointer
template <typename T>
void swap(T** r, T** s)
{
  T* const held = *s;
  *s = *r;
  *r = held;
}
/// Returns floor(log2(number_of_elements)): how many times the value can be halved before it
/// reaches zero, not counting the last halving. Yields 0 for both 0 and 1.
/// @param number_of_elements the value to measure
static unsigned int get_height(uint64_t number_of_elements)
{
  unsigned int height = 0;
  for (uint64_t rest = number_of_elements >> 1; rest != 0; rest >>= 1) {
    ++height;
  }
  return height;
}
/**
 * @struct TreeBuilderConfig
 * Parameters that control the Tree builder.
 */
struct TreeBuilderConfig {
  /// Details related to the device such as its id and stream id.
  device_context::DeviceContext ctx;

  /// Arity of the tree.
  unsigned int arity;

  /// How many rows of the Merkle tree should be written to output. '0' means all of them.
  unsigned int keep_rows;

  /// The size of output for each bottom layer hash and compression.
  /// Will also be equal to the size of the root of the tree. Default value 1.
  unsigned int digest_elements;

  /// True if inputs are on device and false if they're on host. Default value: false.
  bool are_inputs_on_device;

  /// True if outputs are on device and false if they're on host. Default value: false.
  bool are_outputs_on_device;

  /// Whether to run the tree builder asynchronously. If set to `true`, the build_merkle_tree
  /// function will be non-blocking and you'd need to synchronize it explicitly by running
  /// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to `false`, the function will
  /// block the current CPU thread.
  bool is_async;
};
/// Produces the default TreeBuilderConfig: binary tree, all rows kept, one digest element,
/// host inputs, host outputs, blocking execution.
/// @param ctx device context to store in the config; the default device context when omitted
static TreeBuilderConfig
default_merkle_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
  return TreeBuilderConfig{
    ctx,   // ctx
    2,     // arity
    0,     // keep_rows
    1,     // digest_elements
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // is_async
  };
}
/**
 * Builds the Merkle tree
 *
 * @tparam Leaf type of the input (leaf) elements
 * @tparam Digest type of the digest elements
 * @param inputs a pointer to the leaves layer. May be allocated on device or on host, regulated by the config
 * Expected to have arity ^ (height) * input_block_len elements
 * @param digests a pointer to the digests storage. May only be allocated on the host
 * Expected to have `sum(digests_len * (arity ^ (i))) for i in [0..keep_rows]`
 * @param compression the hasher used for the compression layers
 * @param bottom_layer the hasher used for layer 0
 * @param config the tree builder configuration
 *
 * NOTE(review): the size formulas above refer to `height` and `input_block_len`, which are not
 * parameters of this declaration — confirm where these values come from.
 *
 * # Algorithm
 * The function will split a large tree into many subtrees of a size that will fit `STREAM_CHUNK_SIZE`.
 * Each subtree is built in its own stream (there is a maximum number of streams).
 * After all subtrees are constructed, the function will combine the resulting sub-digests into the final top-tree
 */
template <typename Leaf, typename Digest>
cudaError_t build_merkle_tree(
const Leaf* inputs,
Digest* digests,
const SpongeHasher<Leaf, Digest>& compression,
const SpongeHasher<Leaf, Digest>& bottom_layer,
const TreeBuilderConfig& config);
/**
 * Tree builder entry point for 2D inputs (matrices).
 *
 * @tparam Leaf type of the matrix elements
 * @tparam Digest type of the digest elements
 * @param inputs array of input matrices
 * @param number_of_inputs presumably the number of matrices in `inputs` — TODO confirm
 * @param digests a pointer to the digests storage
 * @param hasher sponge hasher applied to the inputs
 * @param compression sponge hasher used for compression
 * @param tree_config the tree builder configuration
 */
template <typename Leaf, typename Digest>
cudaError_t mmcs_commit(
const Matrix<Leaf>* inputs,
const unsigned int number_of_inputs,
Digest* digests,
const SpongeHasher<Leaf, Digest>& hasher,
const SpongeHasher<Leaf, Digest>& compression,
const TreeBuilderConfig& tree_config);
} // namespace merkle_tree
#endif

View File

@@ -0,0 +1,114 @@
#pragma once
#ifndef POSEIDON_CONSTANTS_H
#define POSEIDON_CONSTANTS_H
#include <cstdint>
namespace poseidon {
// Boolean selectors for the two groups of full rounds.
// NOTE(review): presumably passed to the full-round kernels to tell the first half from the second — confirm at the call sites.
#define FIRST_FULL_ROUNDS true
#define SECOND_FULL_ROUNDS false
/**
* For most of the Poseidon configurations this is the case
* TODO: Add support for different full rounds numbers
*/
const int FULL_ROUNDS_DEFAULT = 4;
/**
 * @struct PoseidonConstants
 * These constants are enough to define a Poseidon instance
 * @param round_constants A pointer to round constants allocated on the device
 * @param mds_matrix A pointer to an mds matrix allocated on the device
 * @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
 * @param sparse_matrices A pointer to sparse matrices allocated on the device
 */
template <typename S>
struct PoseidonConstants {
  unsigned int arity;
  unsigned int alpha;
  unsigned int partial_rounds;
  unsigned int full_rounds_half;
  S* round_constants = nullptr;
  S* mds_matrix = nullptr;
  S* non_sparse_matrix = nullptr;
  S* sparse_matrices = nullptr;
  S domain_tag = S::zero();

  PoseidonConstants() = default;
  PoseidonConstants(const PoseidonConstants& other) = default;

  /// Member-wise copy assignment. The pointers are copied as they are, so both objects
  /// refer to the same buffers afterwards.
  PoseidonConstants& operator=(const PoseidonConstants& other)
  {
    // scalar parameters
    arity = other.arity;
    alpha = other.alpha;
    partial_rounds = other.partial_rounds;
    full_rounds_half = other.full_rounds_half;
    // constant tables (shallow)
    round_constants = other.round_constants;
    mds_matrix = other.mds_matrix;
    non_sparse_matrix = other.non_sparse_matrix;
    sparse_matrices = other.sparse_matrices;
    // domain separation tag
    domain_tag = other.domain_tag;
    return *this;
  }
};
/**
 * @class PoseidonKernelsConfiguration
 * Describes the logic of deriving CUDA kernels parameters
 * such as the number of threads and the number of blocks
 *
 * All helpers taking `width` require 1 <= width <= 256: `number_of_threads` divides by `width`
 * and yields 0 for larger values, which the other helpers divide by.
 */
class PoseidonKernelsConfiguration
{
public:
  // The logic behind this is that 1 thread only works on 1 element
  // We have {width} elements in each state, and {number_of_states} states total

  /// Threads per block: the largest multiple of `width` that does not exceed 256.
  static int number_of_threads(unsigned int width) { return 256 / width * width; }

  // The partial rounds operates on the whole state, so we define
  // the parallelism params for processing a single hash preimage per thread
  static const int singlehash_block_size = 128;

  /// How many whole states fit into one block of `number_of_threads(width)` threads.
  static int hashes_per_block(unsigned int width) { return number_of_threads(width) / width; }

  /// Number of blocks needed to give every element of every state its own thread (rounded up).
  static int number_of_full_blocks(unsigned int width, size_t number_of_states)
  {
    // Keep the thread count in size_t: storing it in an int truncates for large batches
    // (e.g. width 4 and 2^30 states give 2^32 threads, which wraps to 0 in 32 bits).
    const size_t total_number_of_threads = number_of_states * width;
    const size_t threads_per_block = static_cast<size_t>(number_of_threads(width));
    const size_t blocks =
      total_number_of_threads / threads_per_block + (total_number_of_threads % threads_per_block != 0 ? 1 : 0);
    return static_cast<int>(blocks);
  }

  /// Number of blocks needed when one thread processes one whole state (rounded up).
  static int number_of_singlehash_blocks(size_t number_of_states)
  {
    return number_of_states / singlehash_block_size + static_cast<bool>(number_of_states % singlehash_block_size);
  }
};
using PKC = PoseidonKernelsConfiguration;
/**
 * Declares the routine that fills `poseidon_constants` from caller-provided parameters and tables.
 * NOTE(review): presumably copies the four tables to the device referenced by `ctx` — confirm against the definition.
 * @param arity arity of the hash
 * @param alpha the alpha parameter
 * @param partial_rounds number of partial rounds
 * @param full_rounds_half half of the number of full rounds
 * @param round_constants pointer to the round constants
 * @param mds_matrix pointer to the mds matrix
 * @param non_sparse_matrix pointer to the non sparse matrix
 * @param sparse_matrices pointer to the sparse matrices
 * @param domain_tag the domain tag
 * @param poseidon_constants output structure
 * @param ctx device context
 */
template <typename S>
cudaError_t create_optimized_poseidon_constants(
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const S* round_constants,
const S* mds_matrix,
const S* non_sparse_matrix,
const S* sparse_matrices,
const S domain_tag,
PoseidonConstants<S>* poseidon_constants,
device_context::DeviceContext& ctx);
/**
 * Loads pre-calculated optimized constants, moves them to the device
 * @param arity arity of the hash whose constants are requested
 * @param ctx device context
 * @param constants output structure
 */
template <typename S>
cudaError_t
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
/**
 * Declares the release routine for a PoseidonConstants instance.
 * NOTE(review): presumably frees the device buffers referenced by `constants` — confirm against the definition.
 * @param constants the constants to release
 * @param ctx device context
 */
template <typename S>
cudaError_t release_optimized_poseidon_constants(PoseidonConstants<S>* constants, device_context::DeviceContext& ctx);
} // namespace poseidon
#endif

View File

@@ -8,17 +8,18 @@ import numpy as np
from poseidon import round_constants as rc, round_numbers as rn
# Modify these
arity = 11
p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47 # grumpkin
# NOTE(review): arity and p are assigned a second time below, so the later values are the ones in effect;
# the first pair looks like a leftover of the previous configuration — confirm.
arity = 2
p = 2 ** 31 - 1 # m31 (Mersenne prime 2^31 - 1)
# p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47 # grumpkin
# p = 0x73EDA753299D7D483339D80809A1D80553BDA402FFFE5BFEFFFFFFFF00000001 # bls12-381
# p = 0x12ab655e9a2ca55660b44d1e5c37b00159aa76fed00000010a11800000000001 # bls12-377
# p = 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001 # bn254
# p = 0x1ae3a4617c510eac63b05c06ca1493b1a22d9f300f5138f1ef3622fba094800170b5d44300000008508c00000000001 # bw6-761
prime_bit_len = 255
field_bytes = 32
prime_bit_len = 31
field_bytes = 4
# leave set to -1 if not sure
full_round = -1
full_round = 8
half_full_round = full_round // 2
# leave set to -1 if not sure
partial_round = -1
@@ -31,12 +32,12 @@ security_level = 128
# F = GF(p)
# F.primitive_element()
#
# primitive_element = None
primitive_element = None
# primitive_element = 7 # bls12-381
# primitive_element = 22 # bls12-377
# primitive_element = 5 # bn254
# primitive_element = 15 # bw6-761
primitive_element = 3 # grumpkin
# primitive_element = 3 # grumpkin
# currently we only support alpha 5; if you need an alpha other than 5, feel free to reach out
alpha = 5

View File

@@ -0,0 +1,508 @@
#pragma once
#ifndef M31_POSEIDON_H
#define M31_POSEIDON_H
namespace poseidon_constants_m31 {
/**
* This inner namespace contains optimized constants for running Poseidon.
* These constants were generated using an algorithm defined at
* https://spec.filecoin.io/algorithms/crypto/poseidon/
* The number in the name corresponds to the arity of hash function
* Each array contains:
* RoundConstants | MDSMatrix | Non-sparse matrix | Sparse matrices
*/
// Number of partial rounds per configuration; the suffix is the arity of the hash function.
int partial_rounds_2 = 7;
int partial_rounds_4 = 11;
int partial_rounds_8 = 12;
int partial_rounds_11 = 12;
unsigned char poseidon_constants_2[] = {
0x33, 0x8b, 0x6d, 0x47, 0xbb, 0x97, 0x11, 0x67, 0x92, 0x9d, 0x55, 0x2d,
0xee, 0x1e, 0x2e, 0x45, 0xfe, 0x35, 0x0e, 0x25, 0x7e, 0xc3, 0x4f, 0x70,
0x4d, 0x0a, 0x8c, 0x18, 0xd9, 0x43, 0xa4, 0x61, 0xfb, 0x14, 0xd9, 0x14,
0x99, 0x13, 0xb9, 0x30, 0xec, 0x3b, 0x8c, 0x16, 0xcc, 0xb2, 0x0b, 0x2e,
0x9e, 0x18, 0xbf, 0x26, 0xb6, 0xb7, 0x2a, 0x44, 0x61, 0x29, 0xdb, 0x21,
0x18, 0x84, 0x03, 0x4e, 0xef, 0x95, 0xf9, 0x45, 0xe3, 0xd8, 0xf2, 0x46,
0x82, 0xb4, 0xc9, 0x5e, 0x5f, 0xf3, 0xb2, 0x4f, 0x61, 0x80, 0x50, 0x0f,
0x0d, 0x7f, 0xe3, 0x1b, 0x23, 0xbd, 0x05, 0x2f, 0x0f, 0xb1, 0x60, 0x67,
0xd8, 0x85, 0xdf, 0x57, 0x0c, 0x8c, 0xdf, 0x50, 0x9e, 0x65, 0x3c, 0x58,
0x07, 0xbd, 0x29, 0x7e, 0xc5, 0xe5, 0xa7, 0x5a, 0x5a, 0x4b, 0x0c, 0x29,
0x89, 0x9d, 0x14, 0x11, 0x8c, 0x20, 0xcb, 0x76, 0x4d, 0x56, 0x2d, 0x4a,
0x10, 0xda, 0xaf, 0x0a, 0x65, 0x9d, 0x98, 0x3e, 0xa1, 0xac, 0x57, 0x46,
0xcb, 0xe8, 0xfc, 0x5b, 0xd4, 0x43, 0x4b, 0x63, 0x1b, 0x13, 0x4b, 0x1f,
0xed, 0xac, 0xbf, 0x30, 0x27, 0x15, 0xac, 0x53, 0x4b, 0x27, 0x61, 0x3e,
0x37, 0xc3, 0x65, 0x74, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x20,
0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, 0x20, 0x33, 0x33, 0x33, 0x33,
0xaa, 0xaa, 0xaa, 0x6a, 0x33, 0x33, 0x33, 0x33, 0xaa, 0xaa, 0xaa, 0x6a,
0x6d, 0xdb, 0xb6, 0x6d, 0x55, 0x55, 0x55, 0x55, 0xc0, 0x72, 0x8d, 0x36,
0x2c, 0xe5, 0xc0, 0x51, 0x00, 0x00, 0x00, 0x20, 0x0b, 0xd5, 0x67, 0x6c,
0x6c, 0x67, 0x2c, 0x13, 0x33, 0x33, 0x33, 0x33, 0x6c, 0x67, 0x2c, 0x13,
0xe6, 0xb8, 0x2c, 0x62, 0x55, 0x55, 0x55, 0x55, 0x15, 0x1f, 0xaf, 0x6a,
0xd9, 0xa8, 0x14, 0x44, 0xae, 0xb0, 0x38, 0x4b, 0x17, 0x76, 0xd9, 0x39,
0x55, 0x55, 0x55, 0x55, 0x28, 0xef, 0x9d, 0x4f, 0xc7, 0x3b, 0xa6, 0x24,
0x84, 0x5b, 0x79, 0x6f, 0xde, 0x4f, 0x8f, 0x3d, 0x55, 0x55, 0x55, 0x55,
0x54, 0xc2, 0xb2, 0x00, 0x5a, 0xed, 0x68, 0x0c, 0xeb, 0xd4, 0xc4, 0x61,
0x02, 0x8c, 0x85, 0x27, 0x55, 0x55, 0x55, 0x55, 0xe4, 0xc5, 0xbd, 0x0a,
0xf6, 0xec, 0x75, 0x26, 0xe0, 0xdb, 0xd8, 0x52, 0xdf, 0x28, 0xff, 0x33,
0x55, 0x55, 0x55, 0x55, 0xac, 0x68, 0x06, 0x00, 0xc9, 0xff, 0x91, 0x19,
0xb1, 0x12, 0x2b, 0x19, 0xa2, 0xdd, 0x47, 0x39, 0x55, 0x55, 0x55, 0x55,
0xd5, 0x03, 0x00, 0x00, 0x45, 0xc8, 0xcc, 0x4c, 0x55, 0x55, 0x55, 0x35,
0x8d, 0xd6, 0x68, 0x3d, 0x55, 0x55, 0x55, 0x55, 0x03, 0x00, 0x00, 0x00,
0x64, 0x66, 0x66, 0x26, 0x00, 0x00, 0x00, 0x20, 0x33, 0x33, 0x33, 0x33
};
unsigned char poseidon_constants_4[] = {
0xdb, 0x64, 0xa5, 0x32, 0xd6, 0x3d, 0x12, 0x6e, 0x65, 0x66, 0x46, 0x59,
0x2a, 0x64, 0x51, 0x3b, 0xaf, 0xbe, 0x72, 0x0b, 0x66, 0x5f, 0x5c, 0x6c,
0x66, 0x11, 0x8c, 0x61, 0x99, 0x24, 0x99, 0x14, 0x1d, 0x5f, 0x67, 0x0a,
0x4d, 0xab, 0xc4, 0x1e, 0x43, 0xb2, 0x09, 0x58, 0xc0, 0x27, 0x4c, 0x5b,
0xf0, 0x0c, 0xf5, 0x12, 0xc9, 0x2f, 0x88, 0x4f, 0x59, 0x52, 0x5b, 0x6a,
0x73, 0x90, 0x55, 0x5b, 0xaf, 0x47, 0x55, 0x0d, 0xa7, 0xc2, 0x0c, 0x6e,
0xe6, 0xd6, 0x4e, 0x30, 0x9e, 0x75, 0x47, 0x12, 0xca, 0x93, 0xd1, 0x5b,
0x64, 0x27, 0xfc, 0x60, 0x6c, 0x16, 0x52, 0x20, 0xf5, 0xe0, 0x01, 0x15,
0x27, 0xf9, 0x96, 0x7f, 0xa0, 0x38, 0xad, 0x3c, 0x95, 0xd3, 0xe4, 0x32,
0x57, 0x95, 0x5a, 0x6b, 0x12, 0xcc, 0xdc, 0x18, 0x2b, 0xdd, 0xa4, 0x66,
0xbf, 0xe7, 0x96, 0x15, 0x85, 0x87, 0x6a, 0x1f, 0x15, 0x19, 0x9c, 0x65,
0xef, 0x24, 0xaa, 0x2c, 0x3f, 0x6b, 0xbc, 0x6b, 0x54, 0x24, 0x2c, 0x17,
0xf1, 0x7a, 0x8d, 0x57, 0x90, 0xa4, 0xd4, 0x4a, 0x12, 0x06, 0x77, 0x6a,
0xe8, 0x6b, 0xd9, 0x51, 0x80, 0x72, 0xa1, 0x31, 0xce, 0xa8, 0x59, 0x10,
0x0c, 0x90, 0xd4, 0x10, 0x8e, 0x60, 0x54, 0x1c, 0xe7, 0xfd, 0x42, 0x3a,
0x73, 0xc1, 0xcc, 0x4f, 0x58, 0xbb, 0x99, 0x7c, 0xd2, 0x51, 0xda, 0x43,
0xea, 0x6e, 0xe8, 0x16, 0xb2, 0x51, 0x53, 0x61, 0x7e, 0x68, 0x44, 0x3c,
0x33, 0x33, 0x33, 0x33, 0xaa, 0xaa, 0xaa, 0x6a, 0x6d, 0xdb, 0xb6, 0x6d,
0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71, 0xaa, 0xaa, 0xaa, 0x6a,
0x6d, 0xdb, 0xb6, 0x6d, 0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71,
0x99, 0x99, 0x99, 0x59, 0x6d, 0xdb, 0xb6, 0x6d, 0x00, 0x00, 0x00, 0x10,
0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59,
0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35, 0x71, 0x1c, 0xc7, 0x71,
0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35,
0xd8, 0x89, 0x9d, 0x58, 0x33, 0x33, 0x33, 0x33, 0xae, 0x9d, 0xba, 0x61,
0x09, 0xf2, 0xee, 0x53, 0x5e, 0x5c, 0xe8, 0x61, 0x8e, 0x1a, 0x60, 0x6c,
0xaa, 0xaa, 0xaa, 0x6a, 0xff, 0x1a, 0xb7, 0x09, 0x1d, 0x84, 0x75, 0x5e,
0x88, 0x5e, 0x36, 0x25, 0x6b, 0xd4, 0xdd, 0x65, 0x6d, 0xdb, 0xb6, 0x6d,
0x1d, 0x84, 0x75, 0x5e, 0x10, 0x9d, 0x2d, 0x63, 0xa7, 0x62, 0xfc, 0x1f,
0xe2, 0x43, 0x63, 0x14, 0x00, 0x00, 0x00, 0x10, 0x88, 0x5e, 0x36, 0x25,
0xa7, 0x62, 0xfc, 0x1f, 0x47, 0xa0, 0x19, 0x6f, 0x48, 0x1f, 0x4e, 0x22,
0x71, 0x1c, 0xc7, 0x71, 0x6b, 0xd4, 0xdd, 0x65, 0xe2, 0x43, 0x63, 0x14,
0x48, 0x1f, 0x4e, 0x22, 0xb7, 0x4e, 0x73, 0x01, 0x33, 0x33, 0x33, 0x33,
0x84, 0xdd, 0xf7, 0x08, 0x6f, 0xc5, 0x14, 0x63, 0xb6, 0x22, 0x01, 0x3d,
0xcd, 0xab, 0x7d, 0x62, 0xac, 0x7e, 0x61, 0x57, 0x40, 0x6b, 0xc5, 0x45,
0x77, 0xbc, 0x02, 0x18, 0x8c, 0x66, 0xda, 0x74, 0x33, 0x33, 0x33, 0x33,
0x01, 0x9d, 0x33, 0x55, 0xed, 0x7d, 0x75, 0x63, 0x41, 0x92, 0x33, 0x76,
0x6b, 0xd5, 0x10, 0x23, 0x1a, 0xc4, 0x49, 0x5b, 0x0c, 0x86, 0x5a, 0x60,
0x23, 0xe5, 0xd8, 0x1c, 0x43, 0xe9, 0xe2, 0x0d, 0x33, 0x33, 0x33, 0x33,
0x1b, 0x68, 0xec, 0x17, 0x0e, 0x3f, 0x34, 0x1a, 0xb0, 0x28, 0xe9, 0x6c,
0xc0, 0xf7, 0x3e, 0x79, 0xdc, 0x08, 0x9e, 0x32, 0x45, 0xde, 0xea, 0x73,
0x7a, 0xc4, 0xb4, 0x0d, 0x65, 0xb6, 0x61, 0x04, 0x33, 0x33, 0x33, 0x33,
0x41, 0x01, 0x02, 0x6b, 0xd8, 0x62, 0x6b, 0x47, 0x47, 0xd9, 0x7e, 0x72,
0x4f, 0x80, 0x31, 0x54, 0x8b, 0x5e, 0x3e, 0x26, 0x64, 0x16, 0xe2, 0x51,
0xf4, 0xa6, 0xed, 0x35, 0xc3, 0xe9, 0xc5, 0x41, 0x33, 0x33, 0x33, 0x33,
0xd5, 0x3f, 0xed, 0x11, 0xf5, 0x0f, 0x56, 0x41, 0xf6, 0x0d, 0xf3, 0x78,
0xb0, 0x78, 0xa1, 0x7d, 0x5d, 0x33, 0xc4, 0x5e, 0xa6, 0xd9, 0x47, 0x4c,
0x07, 0xc3, 0x30, 0x5a, 0x91, 0x10, 0x31, 0x20, 0x33, 0x33, 0x33, 0x33,
0xa5, 0xec, 0xe5, 0x25, 0xe6, 0xa7, 0x4e, 0x01, 0xee, 0x3a, 0xe7, 0x62,
0x02, 0xfd, 0xf9, 0x08, 0xdd, 0x91, 0x3f, 0x2d, 0xca, 0xbc, 0xb5, 0x2c,
0x54, 0x9e, 0xd4, 0x78, 0x6b, 0x18, 0x94, 0x21, 0x33, 0x33, 0x33, 0x33,
0xe6, 0xb3, 0xd2, 0x2e, 0x49, 0xdb, 0xa8, 0x52, 0x5f, 0x6a, 0x75, 0x59,
0xd5, 0x45, 0x5c, 0x73, 0x40, 0xe4, 0xd8, 0x2a, 0x8c, 0xe6, 0xda, 0x50,
0x5f, 0x4f, 0x18, 0x5d, 0xf4, 0xa4, 0xf4, 0x46, 0x33, 0x33, 0x33, 0x33,
0x3e, 0x90, 0x5b, 0x3a, 0x55, 0x96, 0x22, 0x7c, 0xd9, 0x64, 0x36, 0x4e,
0x0b, 0xec, 0x66, 0x65, 0xac, 0x55, 0xa9, 0x19, 0x50, 0x87, 0x49, 0x1a,
0x1f, 0x78, 0x89, 0x36, 0x25, 0x2a, 0x06, 0x55, 0x33, 0x33, 0x33, 0x33,
0x6b, 0xf1, 0x61, 0x67, 0x67, 0x00, 0xc5, 0x24, 0x9e, 0xd1, 0x94, 0x6f,
0xbf, 0x8b, 0xaf, 0x2d, 0x69, 0x9c, 0xb7, 0x62, 0xf8, 0x0a, 0x43, 0x13,
0x3c, 0xc0, 0x48, 0x3e, 0x9f, 0x3f, 0xa8, 0x2c, 0x33, 0x33, 0x33, 0x33,
0x9d, 0x5b, 0xb2, 0x2b, 0x62, 0x05, 0x39, 0x20, 0x52, 0x1f, 0xe8, 0x05,
0x1b, 0x24, 0xc0, 0x13, 0x11, 0x11, 0x11, 0x11, 0x9c, 0x6a, 0x35, 0x45,
0xf6, 0x7f, 0x5c, 0x4c, 0x9f, 0xc4, 0x8f, 0x1f, 0x33, 0x33, 0x33, 0x33,
0xb1, 0xaa, 0xaa, 0x2a, 0xcb, 0xb6, 0x6d, 0x5b, 0x34, 0x49, 0x92, 0x24,
0x90, 0x65, 0x59, 0x56, 0xaa, 0xaa, 0xaa, 0x6a, 0x6d, 0xdb, 0xb6, 0x6d,
0x00, 0x00, 0x00, 0x10, 0x71, 0x1c, 0xc7, 0x71
};
unsigned char poseidon_constants_8[] = {
0x90, 0xaf, 0x71, 0x3e, 0xa3, 0xbe, 0x5a, 0x30, 0xd4, 0x1b, 0x6f, 0x5d,
0xeb, 0x36, 0x6b, 0x53, 0x14, 0xc0, 0x30, 0x13, 0xd5, 0xf8, 0x0b, 0x1c,
0xa8, 0x66, 0xf1, 0x3c, 0xbd, 0x64, 0xa3, 0x6c, 0x06, 0x5e, 0x95, 0x7c,
0xee, 0xc4, 0x0a, 0x0f, 0x37, 0x03, 0xba, 0x6d, 0x20, 0x85, 0xf1, 0x2c,
0xee, 0x59, 0x21, 0x11, 0x42, 0xae, 0xb7, 0x3c, 0x73, 0xb4, 0xd6, 0x71,
0x6a, 0x29, 0x40, 0x03, 0x86, 0xd8, 0x32, 0x68, 0x61, 0x62, 0x62, 0x32,
0x44, 0x5d, 0xcc, 0x38, 0x76, 0x0f, 0xbc, 0x1f, 0xc9, 0x6e, 0x67, 0x1d,
0x95, 0x35, 0x10, 0x79, 0x45, 0xaa, 0x0f, 0x7c, 0x73, 0xfa, 0x5d, 0x3f,
0x53, 0xf2, 0xdc, 0x21, 0x37, 0xfa, 0x15, 0x04, 0xfd, 0x31, 0x3d, 0x5d,
0x5d, 0xe6, 0x1d, 0x4a, 0xb3, 0x2b, 0xa2, 0x07, 0x2d, 0x48, 0x07, 0x2b,
0x92, 0x1c, 0x31, 0x52, 0x6c, 0xd3, 0x32, 0x2f, 0x0f, 0xdd, 0x82, 0x7d,
0x41, 0x0e, 0x81, 0x7e, 0x60, 0xfb, 0x49, 0x7b, 0xe5, 0x39, 0x3d, 0x75,
0x6d, 0xcf, 0x02, 0x77, 0x0d, 0xf6, 0xf8, 0x0c, 0x43, 0xae, 0x62, 0x5e,
0x26, 0x36, 0x9e, 0x3a, 0x10, 0xe3, 0x59, 0x4b, 0x3a, 0x59, 0x49, 0x73,
0x31, 0x20, 0xb9, 0x40, 0x39, 0xed, 0xaf, 0x37, 0x6d, 0x5c, 0x4c, 0x6a,
0xce, 0xca, 0xc4, 0x33, 0x53, 0x96, 0x92, 0x1d, 0xb2, 0xa1, 0xac, 0x65,
0xbb, 0x43, 0xc4, 0x16, 0xf9, 0x38, 0x10, 0x67, 0x3d, 0xbb, 0x28, 0x7a,
0x2b, 0x1e, 0x65, 0x36, 0x07, 0x14, 0x36, 0x3c, 0xcb, 0xdf, 0x03, 0x6b,
0x03, 0x7b, 0xe6, 0x67, 0x79, 0x2a, 0x08, 0x47, 0xb7, 0x8f, 0x9c, 0x7e,
0x54, 0xde, 0x08, 0x0a, 0xf8, 0x99, 0x24, 0x6f, 0x64, 0x78, 0x80, 0x5f,
0x43, 0x76, 0x77, 0x40, 0x12, 0x62, 0x71, 0x10, 0x35, 0xf5, 0xdd, 0x0a,
0x06, 0xff, 0x9b, 0x7b, 0xd8, 0x1a, 0xf3, 0x50, 0x1d, 0xc3, 0x8c, 0x60,
0xe0, 0x61, 0xf5, 0x3d, 0xf9, 0xbf, 0xe4, 0x38, 0x78, 0xbf, 0x59, 0x0e,
0xed, 0xc9, 0x4d, 0x0b, 0xb1, 0x7a, 0x10, 0x2b, 0x84, 0x27, 0x07, 0x70,
0x5d, 0xc0, 0xa4, 0x7e, 0x9c, 0xf0, 0xf6, 0x69, 0x89, 0x6c, 0xc5, 0x39,
0x4a, 0x7d, 0x5e, 0x26, 0x2f, 0x08, 0x9d, 0x05, 0xdc, 0x71, 0xec, 0x08,
0x2b, 0xca, 0x68, 0x14, 0x42, 0xf6, 0xe6, 0x0a, 0x2f, 0xa5, 0x34, 0x6d,
0x95, 0xaa, 0x80, 0x55, 0x23, 0x0f, 0x5f, 0x20, 0xbe, 0x4d, 0x0b, 0x20,
0x71, 0x1c, 0xc7, 0x71, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35,
0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
0x45, 0x17, 0x5d, 0x74, 0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58,
0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
0x71, 0x1c, 0xc7, 0x71, 0x50, 0x05, 0xd7, 0x30, 0x09, 0x94, 0x4f, 0x13,
0x11, 0x86, 0x4b, 0x61, 0x74, 0x8b, 0x94, 0x0e, 0x7e, 0x5d, 0x93, 0x27,
0xeb, 0xb6, 0x4b, 0x61, 0x90, 0x3f, 0x9b, 0x7d, 0x10, 0xe9, 0x16, 0x06,
0x99, 0x99, 0x99, 0x59, 0x4f, 0xf6, 0x15, 0x6b, 0x84, 0x8c, 0xe0, 0x5f,
0x88, 0x9e, 0xb2, 0x08, 0x32, 0x36, 0xe3, 0x25, 0x64, 0x0a, 0xf5, 0x6f,
0x80, 0xff, 0x8e, 0x6f, 0xcd, 0xb5, 0x72, 0x12, 0x90, 0xa2, 0x7a, 0x09,
0x45, 0x17, 0x5d, 0x74, 0x84, 0x8c, 0xe0, 0x5f, 0xf5, 0x67, 0x02, 0x2d,
0x71, 0x83, 0xf0, 0x55, 0x81, 0xa2, 0x81, 0x4b, 0xec, 0xff, 0xb0, 0x6b,
0x17, 0x41, 0xd6, 0x36, 0xf3, 0x16, 0x58, 0x23, 0x49, 0x90, 0xa2, 0x17,
0x55, 0x55, 0x55, 0x35, 0x88, 0x9e, 0xb2, 0x08, 0x71, 0x83, 0xf0, 0x55,
0x27, 0x2a, 0xb0, 0x29, 0x0b, 0xe4, 0x53, 0x70, 0x7f, 0xeb, 0x60, 0x74,
0xb9, 0x92, 0xa9, 0x4b, 0x51, 0x41, 0x0e, 0x56, 0x1b, 0xe4, 0x67, 0x43,
0xd8, 0x89, 0x9d, 0x58, 0x32, 0x36, 0xe3, 0x25, 0x81, 0xa2, 0x81, 0x4b,
0x0b, 0xe4, 0x53, 0x70, 0x73, 0x99, 0xf0, 0x02, 0x1a, 0xf7, 0xe1, 0x40,
0x18, 0xc4, 0x58, 0x3a, 0xcc, 0xf5, 0x0b, 0x18, 0xf0, 0x39, 0xab, 0x7a,
0xb6, 0x6d, 0xdb, 0x76, 0x64, 0x0a, 0xf5, 0x6f, 0xec, 0xff, 0xb0, 0x6b,
0x7f, 0xeb, 0x60, 0x74, 0x1a, 0xf7, 0xe1, 0x40, 0xf7, 0xfc, 0xbe, 0x7f,
0xbf, 0x63, 0xc5, 0x05, 0x15, 0x3c, 0x9f, 0x2b, 0x9b, 0x77, 0xb0, 0x44,
0x11, 0x11, 0x11, 0x11, 0x80, 0xff, 0x8e, 0x6f, 0x17, 0x41, 0xd6, 0x36,
0xb9, 0x92, 0xa9, 0x4b, 0x18, 0xc4, 0x58, 0x3a, 0xbf, 0x63, 0xc5, 0x05,
0x2f, 0x5c, 0x3c, 0x09, 0x25, 0xaf, 0xdf, 0x11, 0x21, 0x7d, 0x95, 0x58,
0x00, 0x00, 0x00, 0x08, 0xcd, 0xb5, 0x72, 0x12, 0xf3, 0x16, 0x58, 0x23,
0x51, 0x41, 0x0e, 0x56, 0xcc, 0xf5, 0x0b, 0x18, 0x15, 0x3c, 0x9f, 0x2b,
0x25, 0xaf, 0xdf, 0x11, 0x38, 0x50, 0xe9, 0x16, 0x12, 0xb8, 0xc8, 0x17,
0x0f, 0x0f, 0x0f, 0x0f, 0x90, 0xa2, 0x7a, 0x09, 0x49, 0x90, 0xa2, 0x17,
0x1b, 0xe4, 0x67, 0x43, 0xf0, 0x39, 0xab, 0x7a, 0x9b, 0x77, 0xb0, 0x44,
0x21, 0x7d, 0x95, 0x58, 0x12, 0xb8, 0xc8, 0x17, 0x5a, 0xfc, 0xf7, 0x5c,
0x71, 0x1c, 0xc7, 0x71, 0xdb, 0x50, 0x89, 0x38, 0x5f, 0x88, 0xe3, 0x32,
0x8b, 0xb4, 0x3b, 0x6c, 0x95, 0x0a, 0xf1, 0x41, 0xe6, 0x0a, 0x52, 0x7d,
0xd1, 0x0d, 0xb1, 0x57, 0x9b, 0xd2, 0xf4, 0x1d, 0x80, 0x17, 0xb2, 0x42,
0x9c, 0x40, 0x6e, 0x2f, 0x63, 0xa7, 0x42, 0x77, 0xf9, 0x37, 0xd1, 0x43,
0x98, 0xd1, 0xec, 0x50, 0x91, 0x26, 0xfa, 0x4e, 0x0c, 0x9e, 0xcc, 0x31,
0x52, 0xf4, 0x20, 0x5d, 0x2a, 0x20, 0xeb, 0x1b, 0x71, 0x1c, 0xc7, 0x71,
0x54, 0x29, 0xf4, 0x4a, 0xde, 0x91, 0xf6, 0x54, 0x8b, 0xed, 0x18, 0x26,
0x71, 0x24, 0x22, 0x34, 0xb7, 0xaf, 0x61, 0x27, 0x7a, 0x0a, 0x21, 0x7f,
0x9f, 0xfe, 0xa1, 0x53, 0x26, 0x97, 0x6b, 0x5b, 0xf4, 0xea, 0xef, 0x4a,
0x4b, 0x03, 0xa0, 0x7c, 0xe6, 0x64, 0x69, 0x47, 0x76, 0xf7, 0x2d, 0x0b,
0x6f, 0xd5, 0x2c, 0x45, 0x52, 0xc1, 0x5c, 0x46, 0x25, 0x38, 0xab, 0x79,
0x64, 0xed, 0xe7, 0x57, 0x71, 0x1c, 0xc7, 0x71, 0x94, 0xc2, 0xb7, 0x7f,
0xaf, 0x0d, 0x61, 0x4c, 0xa3, 0x86, 0x8e, 0x45, 0xdc, 0x73, 0xe3, 0x77,
0x71, 0xed, 0x21, 0x7d, 0x4b, 0x8e, 0xc7, 0x52, 0x39, 0x5d, 0x49, 0x1d,
0x75, 0x35, 0xed, 0x09, 0xc6, 0x02, 0x3b, 0x22, 0xb8, 0x91, 0x07, 0x13,
0x7f, 0xbf, 0x15, 0x7f, 0xb5, 0xbe, 0x0a, 0x5c, 0xbc, 0x75, 0x54, 0x61,
0x6c, 0x2f, 0x28, 0x5f, 0xff, 0xf0, 0x7b, 0x67, 0x11, 0x8e, 0x70, 0x29,
0x71, 0x1c, 0xc7, 0x71, 0xe6, 0xfc, 0x29, 0x07, 0xbd, 0x0c, 0x4d, 0x5f,
0x57, 0xb7, 0x87, 0x41, 0xec, 0x48, 0xda, 0x18, 0x78, 0x41, 0xb8, 0x6d,
0xde, 0x7e, 0x47, 0x5a, 0x13, 0x03, 0xc5, 0x52, 0x2e, 0xee, 0xf3, 0x3f,
0x06, 0xd0, 0xcd, 0x48, 0x77, 0x2a, 0xcd, 0x7e, 0x35, 0xee, 0x74, 0x63,
0x3e, 0x26, 0x65, 0x64, 0x37, 0xa1, 0xfb, 0x7a, 0x03, 0x44, 0xa8, 0x70,
0x2f, 0x03, 0x27, 0x1e, 0xb3, 0x02, 0x3e, 0x4a, 0x71, 0x1c, 0xc7, 0x71,
0xfd, 0xe1, 0xfe, 0x3c, 0x88, 0x1c, 0x36, 0x53, 0x36, 0x31, 0x5a, 0x32,
0x88, 0x7b, 0xa6, 0x17, 0x40, 0x31, 0xe4, 0x0a, 0xb3, 0x70, 0x8f, 0x4f,
0xc3, 0xa2, 0xd7, 0x06, 0x34, 0x9d, 0x4a, 0x71, 0x5b, 0xfa, 0x79, 0x25,
0xe8, 0x6f, 0x05, 0x65, 0xc1, 0x4a, 0xee, 0x5c, 0x9a, 0xb2, 0x83, 0x05,
0xb0, 0x89, 0x77, 0x2e, 0xc1, 0x56, 0x34, 0x08, 0x50, 0xf5, 0xde, 0x12,
0xae, 0x68, 0xc2, 0x1b, 0x71, 0x1c, 0xc7, 0x71, 0xb3, 0x84, 0x6e, 0x4f,
0xae, 0x74, 0x57, 0x4f, 0x56, 0xf3, 0xfc, 0x48, 0xfa, 0x73, 0xd7, 0x0e,
0x8a, 0xc5, 0x35, 0x4d, 0xf6, 0x26, 0x15, 0x2a, 0xcf, 0xb5, 0x2d, 0x64,
0xd1, 0x2a, 0x84, 0x43, 0xab, 0xc0, 0xec, 0x60, 0xa9, 0xbc, 0x09, 0x11,
0xfd, 0x06, 0xea, 0x1e, 0xba, 0x29, 0x77, 0x6c, 0xb1, 0x37, 0xa5, 0x42,
0x1c, 0x9b, 0x58, 0x37, 0xa8, 0xb7, 0xae, 0x3e, 0x6a, 0xf8, 0x63, 0x25,
0x71, 0x1c, 0xc7, 0x71, 0x22, 0xa0, 0x75, 0x4e, 0x17, 0x33, 0x99, 0x7c,
0x97, 0x97, 0x30, 0x04, 0xbc, 0x22, 0x6d, 0x7c, 0xb3, 0xd7, 0xd9, 0x56,
0x4e, 0xef, 0x40, 0x5e, 0x02, 0x05, 0x51, 0x1e, 0x0c, 0x32, 0xb7, 0x06,
0x41, 0x16, 0x80, 0x33, 0xc2, 0xdd, 0x8f, 0x18, 0x65, 0xa3, 0xe1, 0x4a,
0xdb, 0xb4, 0x5d, 0x78, 0xf3, 0x99, 0x48, 0x3e, 0x04, 0x5b, 0xb9, 0x09,
0xd2, 0x3d, 0x14, 0x05, 0x69, 0x50, 0xe9, 0x57, 0x71, 0x1c, 0xc7, 0x71,
0x0d, 0x72, 0x37, 0x6c, 0xe3, 0xd1, 0x57, 0x2f, 0x9e, 0xb7, 0xe1, 0x30,
0x22, 0xce, 0xe5, 0x66, 0x45, 0x7b, 0x06, 0x0e, 0x06, 0x66, 0xdd, 0x11,
0xef, 0xdf, 0x61, 0x52, 0x7d, 0xb9, 0xcf, 0x1e, 0x97, 0xbe, 0x55, 0x00,
0x94, 0xcb, 0x50, 0x7c, 0xa0, 0x83, 0x1c, 0x57, 0xf3, 0x72, 0x8c, 0x40,
0x07, 0x32, 0x39, 0x54, 0xe8, 0x5a, 0x10, 0x7b, 0x09, 0xc2, 0x02, 0x58,
0xb0, 0xeb, 0x23, 0x51, 0x71, 0x1c, 0xc7, 0x71, 0xf0, 0xfd, 0x78, 0x2c,
0xe7, 0xa8, 0x53, 0x7c, 0xdd, 0xf6, 0xa3, 0x2b, 0xa9, 0x51, 0xf4, 0x33,
0x1d, 0x4d, 0x13, 0x0e, 0x53, 0x6b, 0xde, 0x6b, 0x48, 0x46, 0xa0, 0x01,
0xbf, 0x74, 0xf2, 0x14, 0xe5, 0x99, 0x3d, 0x72, 0x37, 0x8e, 0xa9, 0x44,
0x61, 0xed, 0xdd, 0x3b, 0x7c, 0x11, 0x28, 0x12, 0xd5, 0xd6, 0x27, 0x78,
0x4e, 0xf8, 0xe4, 0x3d, 0xdc, 0x5c, 0x92, 0x0c, 0xea, 0x5b, 0xe2, 0x44,
0x71, 0x1c, 0xc7, 0x71, 0x64, 0x55, 0xb2, 0x0d, 0x54, 0x7f, 0x64, 0x72,
0x8e, 0xe1, 0x7b, 0x52, 0xf5, 0xe4, 0x20, 0x13, 0xd1, 0xd4, 0x5d, 0x4c,
0x33, 0x3d, 0xb6, 0x55, 0x26, 0xed, 0xb0, 0x75, 0xa0, 0xf2, 0x72, 0x51,
0x6b, 0xc5, 0x37, 0x23, 0x0d, 0x1d, 0xf5, 0x6f, 0xa6, 0x83, 0x5f, 0x3e,
0x1e, 0xb5, 0x18, 0x23, 0xc8, 0x40, 0xae, 0x63, 0x68, 0x79, 0x8e, 0x56,
0xb0, 0x33, 0x43, 0x08, 0x5b, 0xac, 0x52, 0x39, 0x71, 0x1c, 0xc7, 0x71,
0x9d, 0xf2, 0x00, 0x73, 0xf8, 0x96, 0xbb, 0x43, 0x5b, 0x59, 0xce, 0x07,
0xbb, 0x11, 0xc8, 0x43, 0xde, 0xea, 0xb7, 0x34, 0x51, 0xbf, 0xa7, 0x2d,
0x33, 0x35, 0xc2, 0x40, 0x1c, 0x81, 0x60, 0x63, 0x60, 0x0b, 0xb6, 0x60,
0xbf, 0xb9, 0x38, 0x0c, 0x02, 0x54, 0x53, 0x20, 0xd9, 0xf9, 0xeb, 0x2f,
0x7e, 0x5b, 0xdf, 0x58, 0x4b, 0x99, 0x8e, 0x04, 0x27, 0xb4, 0x18, 0x78,
0xd6, 0x37, 0x16, 0x60, 0x71, 0x1c, 0xc7, 0x71, 0x74, 0x66, 0x66, 0x66,
0xb2, 0xf1, 0x94, 0x20, 0xad, 0x2f, 0xba, 0x68, 0x6a, 0x33, 0xfe, 0x6e,
0xa5, 0x51, 0xec, 0x44, 0xab, 0x05, 0x7e, 0x60, 0x48, 0x6b, 0xa5, 0x56,
0x38, 0x3d, 0xc7, 0x24, 0x99, 0x99, 0x99, 0x59, 0x45, 0x17, 0x5d, 0x74,
0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f
};
// Raw byte table; the enclosing namespace is closed as `poseidon_constants` right after it.
// NOTE(review): presumably the serialized, precomputed Poseidon constants for arity 11
// (state width 12). The element size, byte order and section layout are not visible
// here -- confirm against the code that loads this table.
unsigned char poseidon_constants_11[] = {
0xb0, 0xf1, 0x1f, 0x2e, 0xf8, 0x8b, 0xb5, 0x07, 0x8d, 0xc4, 0xe1, 0x46,
0x99, 0x23, 0x9f, 0x06, 0xcc, 0x64, 0x13, 0x45, 0x9e, 0xb1, 0xdf, 0x5f,
0xfa, 0x8e, 0x0f, 0x6f, 0x33, 0xd8, 0xfe, 0x19, 0x0a, 0x25, 0x8b, 0x20,
0xe1, 0x2c, 0xcc, 0x36, 0x17, 0x3f, 0x03, 0x05, 0xe1, 0x13, 0xce, 0x35,
0xd4, 0xc9, 0xe7, 0x65, 0x1f, 0x7f, 0x2c, 0x7a, 0x93, 0x9f, 0x34, 0x19,
0x4d, 0x22, 0xf2, 0x7f, 0x8e, 0xa8, 0xb0, 0x51, 0x22, 0x8c, 0x91, 0x30,
0xa5, 0x9c, 0xff, 0x31, 0x0e, 0x04, 0xc9, 0x19, 0x69, 0x60, 0xee, 0x0f,
0xc5, 0xa5, 0xeb, 0x6b, 0xb0, 0xa4, 0xaa, 0x5d, 0x1c, 0x4e, 0xeb, 0x73,
0xec, 0x94, 0xb7, 0x15, 0xce, 0x64, 0x1c, 0x60, 0x3e, 0xa3, 0x6b, 0x4a,
0x87, 0x7a, 0x25, 0x2f, 0xfc, 0xc3, 0x17, 0x20, 0x06, 0xb6, 0x22, 0x7d,
0xca, 0xea, 0x8b, 0x3b, 0xf9, 0xca, 0xa4, 0x32, 0xd2, 0xb7, 0x2e, 0x01,
0x4f, 0x31, 0xc9, 0x2f, 0x10, 0xbf, 0x41, 0x4c, 0xe6, 0xfe, 0xba, 0x49,
0xe5, 0x89, 0xbb, 0x77, 0x7e, 0xe8, 0x83, 0x1c, 0x72, 0xe7, 0x26, 0x58,
0x24, 0x90, 0x9d, 0x1e, 0xb3, 0x20, 0xc8, 0x64, 0x84, 0xa3, 0x21, 0x5d,
0x06, 0x64, 0x30, 0x4b, 0x19, 0x35, 0x96, 0x1e, 0xd1, 0x86, 0x57, 0x4a,
0xb3, 0x8e, 0xd6, 0x7d, 0xaf, 0xd1, 0xde, 0x3f, 0xa2, 0x2c, 0x32, 0x0a,
0xbb, 0xea, 0x4a, 0x46, 0x64, 0x1b, 0x72, 0x14, 0x75, 0x85, 0x1b, 0x4d,
0x11, 0x02, 0x5f, 0x6f, 0x06, 0xdd, 0xd3, 0x6f, 0xbc, 0xcc, 0x77, 0x2e,
0xb7, 0x43, 0xf4, 0x19, 0x9d, 0x2c, 0x4b, 0x2b, 0x0c, 0x41, 0xb9, 0x02,
0xdc, 0x14, 0x5a, 0x67, 0xd4, 0x56, 0xca, 0x45, 0x65, 0xd2, 0x7d, 0x17,
0xcd, 0x91, 0xdd, 0x45, 0xd8, 0xa8, 0xd8, 0x4b, 0xc9, 0x2b, 0xf2, 0x35,
0xc1, 0x81, 0x6c, 0x33, 0xbc, 0xf4, 0x4d, 0x04, 0xfd, 0xb0, 0x91, 0x2b,
0xcf, 0xad, 0x39, 0x45, 0x35, 0xb2, 0xac, 0x2e, 0x2f, 0x13, 0xe3, 0x0b,
0x40, 0x59, 0x33, 0x07, 0xe3, 0xa5, 0xa1, 0x4d, 0x0e, 0x79, 0x05, 0x4c,
0x36, 0x9b, 0xf1, 0x7f, 0x90, 0x50, 0x46, 0x25, 0x87, 0x10, 0x24, 0x3f,
0x52, 0x5d, 0xff, 0x18, 0xad, 0xed, 0x78, 0x52, 0x00, 0x9c, 0xfe, 0x66,
0x22, 0x24, 0xe0, 0x62, 0x13, 0xe2, 0x6f, 0x67, 0xd9, 0xe3, 0x6c, 0x64,
0x6b, 0xa6, 0xea, 0x53, 0x61, 0x56, 0x8a, 0x33, 0x81, 0x35, 0xe5, 0x0f,
0x35, 0xc9, 0xf3, 0x59, 0xc2, 0xa8, 0x92, 0x73, 0x69, 0x66, 0x05, 0x70,
0xa1, 0x5f, 0xec, 0x4e, 0x3d, 0x6b, 0xc0, 0x78, 0xa4, 0xcb, 0xfc, 0x7e,
0x44, 0x8c, 0xc4, 0x1b, 0x25, 0x70, 0x8f, 0x27, 0x87, 0x76, 0x2d, 0x4f,
0x70, 0xb0, 0xea, 0x7a, 0x92, 0x43, 0x8c, 0x00, 0xed, 0xfd, 0x3b, 0x23,
0x69, 0x71, 0x8e, 0x49, 0x83, 0xc3, 0x4e, 0x37, 0xab, 0x18, 0xd9, 0x30,
0x4d, 0x48, 0x5e, 0x7e, 0xbc, 0x5a, 0x1a, 0x24, 0x34, 0xed, 0x19, 0x57,
0xf4, 0xf4, 0x0d, 0x02, 0x0c, 0x57, 0xde, 0x6d, 0x40, 0x39, 0x1f, 0x71,
0x9c, 0xa1, 0xb0, 0x28, 0x2d, 0x05, 0xb9, 0x6b, 0x85, 0x7a, 0x4c, 0x47,
0x55, 0x55, 0x55, 0x35, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11,
0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
0xb6, 0x6d, 0xdb, 0x76, 0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08,
0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78,
0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
0x0f, 0x0f, 0x0f, 0x0f, 0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c,
0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11,
0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79,
0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48,
0xcc, 0xcc, 0xcc, 0x6c, 0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a,
0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32,
0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c,
0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11,
0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b, 0x00, 0x00, 0x00, 0x04,
0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a,
0x70, 0x3d, 0x0a, 0x57, 0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25,
0xdb, 0xb6, 0x6d, 0x3b, 0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48,
0xbd, 0xf7, 0xde, 0x7b, 0x00, 0x00, 0x00, 0x04, 0xc1, 0x07, 0x1f, 0x7c,
0xc8, 0x42, 0x16, 0x32, 0xaa, 0xaa, 0xaa, 0x5a, 0x70, 0x3d, 0x0a, 0x57,
0xec, 0xc4, 0x4e, 0x2c, 0x7b, 0x09, 0xed, 0x25, 0xdb, 0xb6, 0x6d, 0x3b,
0x61, 0xb9, 0xa7, 0x11, 0x88, 0x88, 0x88, 0x48, 0xbd, 0xf7, 0xde, 0x7b,
0x00, 0x00, 0x00, 0x04, 0xc1, 0x07, 0x1f, 0x7c, 0x87, 0x87, 0x87, 0x47,
0x55, 0x55, 0x55, 0x35, 0x7c, 0xec, 0xe8, 0x54, 0x5f, 0xc4, 0x1c, 0x7e,
0x02, 0x38, 0x4e, 0x55, 0x86, 0x80, 0x6d, 0x71, 0xc3, 0xa8, 0x98, 0x4a,
0x2b, 0xaa, 0x86, 0x63, 0x60, 0xd7, 0x4f, 0x2e, 0xb4, 0xac, 0xce, 0x78,
0xbd, 0x1c, 0x4f, 0x55, 0x6b, 0x2c, 0x33, 0x64, 0x8c, 0x56, 0x30, 0x43,
0xd8, 0x89, 0x9d, 0x58, 0xdd, 0x29, 0xc3, 0x15, 0x02, 0x15, 0x5b, 0x4f,
0xdc, 0xb9, 0x0c, 0x03, 0x9a, 0x8d, 0x4d, 0x53, 0x6e, 0xf2, 0x33, 0x15,
0xed, 0x3f, 0x16, 0x06, 0x43, 0xab, 0x59, 0x54, 0x1a, 0x62, 0xcd, 0x3a,
0xda, 0x77, 0xa8, 0x51, 0x42, 0x58, 0x05, 0x55, 0x39, 0xeb, 0xd1, 0x45,
0xb6, 0x6d, 0xdb, 0x76, 0x02, 0x15, 0x5b, 0x4f, 0xb9, 0x5a, 0x8c, 0x36,
0x9a, 0x63, 0x3e, 0x3c, 0xe6, 0x28, 0x72, 0x36, 0x51, 0x89, 0xdb, 0x3b,
0xfa, 0xe0, 0x07, 0x07, 0x30, 0xb3, 0x56, 0x39, 0x91, 0x42, 0x86, 0x38,
0xda, 0xd2, 0x8f, 0x67, 0x75, 0xca, 0x3e, 0x69, 0xe9, 0xd8, 0x07, 0x6f,
0x11, 0x11, 0x11, 0x11, 0xdc, 0xb9, 0x0c, 0x03, 0x9a, 0x63, 0x3e, 0x3c,
0x54, 0xdc, 0x52, 0x1f, 0xf3, 0xc8, 0xb6, 0x6b, 0x96, 0x31, 0xf8, 0x1b,
0x20, 0xee, 0x0b, 0x07, 0x4c, 0x37, 0x80, 0x4b, 0x31, 0x99, 0xd0, 0x09,
0xb8, 0xa5, 0x62, 0x5f, 0xa2, 0x72, 0xfb, 0x33, 0x11, 0xd8, 0x0e, 0x65,
0x00, 0x00, 0x00, 0x08, 0x9a, 0x8d, 0x4d, 0x53, 0xe6, 0x28, 0x72, 0x36,
0xf3, 0xc8, 0xb6, 0x6b, 0xef, 0x80, 0xab, 0x77, 0x4d, 0x49, 0x25, 0x2b,
0x7e, 0x10, 0x08, 0x1b, 0x70, 0x22, 0x72, 0x66, 0x8b, 0xe6, 0x06, 0x3a,
0x58, 0xb9, 0x7e, 0x02, 0x97, 0xf4, 0xc2, 0x4f, 0x6b, 0x9a, 0x68, 0x53,
0x0f, 0x0f, 0x0f, 0x0f, 0x6e, 0xf2, 0x33, 0x15, 0x51, 0x89, 0xdb, 0x3b,
0x96, 0x31, 0xf8, 0x1b, 0x4d, 0x49, 0x25, 0x2b, 0xe2, 0xe0, 0x5c, 0x64,
0xb6, 0x1d, 0x73, 0x13, 0x38, 0x1b, 0xfd, 0x49, 0xe1, 0x2c, 0xce, 0x5d,
0x2a, 0x6b, 0xb4, 0x17, 0x7e, 0xa9, 0x6e, 0x72, 0x2f, 0x77, 0x47, 0x79,
0x38, 0x8e, 0xe3, 0x78, 0xed, 0x3f, 0x16, 0x06, 0xfa, 0xe0, 0x07, 0x07,
0x20, 0xee, 0x0b, 0x07, 0x7e, 0x10, 0x08, 0x1b, 0xb6, 0x1d, 0x73, 0x13,
0xca, 0x4a, 0x44, 0x68, 0x1c, 0x93, 0xbc, 0x37, 0xfa, 0x14, 0x8b, 0x55,
0xae, 0xe0, 0xac, 0x31, 0xcb, 0x04, 0x09, 0x46, 0x27, 0x8f, 0x96, 0x07,
0x28, 0xaf, 0xa1, 0x3c, 0x43, 0xab, 0x59, 0x54, 0x30, 0xb3, 0x56, 0x39,
0x4c, 0x37, 0x80, 0x4b, 0x70, 0x22, 0x72, 0x66, 0x38, 0x1b, 0xfd, 0x49,
0x1c, 0x93, 0xbc, 0x37, 0xfb, 0xdd, 0xff, 0x41, 0x73, 0x22, 0xa8, 0x31,
0xd4, 0xc3, 0x26, 0x2b, 0xe7, 0x8c, 0xce, 0x35, 0x03, 0x29, 0x9c, 0x43,
0xcc, 0xcc, 0xcc, 0x6c, 0x1a, 0x62, 0xcd, 0x3a, 0x91, 0x42, 0x86, 0x38,
0x31, 0x99, 0xd0, 0x09, 0x8b, 0xe6, 0x06, 0x3a, 0xe1, 0x2c, 0xce, 0x5d,
0xfa, 0x14, 0x8b, 0x55, 0x73, 0x22, 0xa8, 0x31, 0xaf, 0x9f, 0x0d, 0x2d,
0xd8, 0xf1, 0xd2, 0x43, 0x41, 0x60, 0x7a, 0x48, 0xca, 0xa1, 0x4c, 0x7c,
0x79, 0x9e, 0xe7, 0x79, 0xda, 0x77, 0xa8, 0x51, 0xda, 0xd2, 0x8f, 0x67,
0xb8, 0xa5, 0x62, 0x5f, 0x58, 0xb9, 0x7e, 0x02, 0x2a, 0x6b, 0xb4, 0x17,
0xae, 0xe0, 0xac, 0x31, 0xd4, 0xc3, 0x26, 0x2b, 0xd8, 0xf1, 0xd2, 0x43,
0x38, 0xc4, 0xc5, 0x55, 0x39, 0x3d, 0x1f, 0x4c, 0x81, 0xa8, 0x99, 0x14,
0xa2, 0x8b, 0x2e, 0x7a, 0x42, 0x58, 0x05, 0x55, 0x75, 0xca, 0x3e, 0x69,
0xa2, 0x72, 0xfb, 0x33, 0x97, 0xf4, 0xc2, 0x4f, 0x7e, 0xa9, 0x6e, 0x72,
0xcb, 0x04, 0x09, 0x46, 0xe7, 0x8c, 0xce, 0x35, 0x41, 0x60, 0x7a, 0x48,
0x39, 0x3d, 0x1f, 0x4c, 0xc3, 0x27, 0xbb, 0x1a, 0x86, 0xb4, 0x97, 0x00,
0xc8, 0x42, 0x16, 0x32, 0x39, 0xeb, 0xd1, 0x45, 0xe9, 0xd8, 0x07, 0x6f,
0x11, 0xd8, 0x0e, 0x65, 0x6b, 0x9a, 0x68, 0x53, 0x2f, 0x77, 0x47, 0x79,
0x27, 0x8f, 0x96, 0x07, 0x03, 0x29, 0x9c, 0x43, 0xca, 0xa1, 0x4c, 0x7c,
0x81, 0xa8, 0x99, 0x14, 0x86, 0xb4, 0x97, 0x00, 0x0c, 0xd8, 0x29, 0x37,
0x55, 0x55, 0x55, 0x35, 0xcc, 0xab, 0xe7, 0x58, 0x82, 0xaa, 0xb7, 0x06,
0x3c, 0x2a, 0x3d, 0x61, 0x45, 0xbd, 0xcc, 0x4b, 0xa9, 0x83, 0x44, 0x56,
0x16, 0xe6, 0x58, 0x6e, 0x70, 0x4b, 0x3a, 0x44, 0xe2, 0x3b, 0x37, 0x60,
0xf0, 0x3b, 0x41, 0x1e, 0x44, 0x40, 0x84, 0x5a, 0x63, 0x5d, 0x4d, 0x78,
0x22, 0x80, 0xb3, 0x0f, 0xe0, 0x85, 0xec, 0x77, 0xe5, 0x3d, 0xda, 0x27,
0x55, 0xf9, 0xfd, 0x44, 0x38, 0xa7, 0x0f, 0x0a, 0x2f, 0xec, 0xda, 0x34,
0x24, 0xef, 0x00, 0x40, 0x54, 0x9a, 0x0b, 0x27, 0xf9, 0x85, 0xf4, 0x16,
0x14, 0x1f, 0x17, 0x30, 0x1d, 0xb0, 0xdf, 0x31, 0x55, 0x55, 0x55, 0x35,
0x98, 0x36, 0x7e, 0x31, 0xd0, 0xda, 0x0a, 0x16, 0xae, 0xb0, 0x6a, 0x00,
0x0e, 0x7a, 0x7e, 0x6d, 0x93, 0x81, 0x4d, 0x21, 0x45, 0x5a, 0x4d, 0x20,
0x42, 0x5d, 0xfd, 0x49, 0x28, 0xc5, 0xe2, 0x75, 0x45, 0x85, 0x03, 0x2c,
0xfc, 0x78, 0x72, 0x15, 0x98, 0x9c, 0x88, 0x0b, 0xed, 0x8f, 0x6f, 0x2b,
0x55, 0x75, 0x17, 0x5f, 0xe5, 0xed, 0x21, 0x52, 0x5a, 0x34, 0x10, 0x7d,
0x42, 0x25, 0x57, 0x6a, 0xa4, 0xb2, 0xe6, 0x2e, 0x05, 0xa8, 0xc4, 0x17,
0xff, 0x9c, 0x7f, 0x6f, 0x23, 0x64, 0x17, 0x44, 0x85, 0xa9, 0x6b, 0x46,
0x66, 0x58, 0x1b, 0x3b, 0x55, 0x55, 0x55, 0x35, 0x55, 0xf6, 0xca, 0x06,
0x68, 0x75, 0xa9, 0x55, 0x54, 0x44, 0x4f, 0x61, 0x65, 0x3b, 0x96, 0x37,
0xa9, 0x89, 0xb6, 0x47, 0x70, 0x8a, 0x8d, 0x74, 0x09, 0x53, 0x9e, 0x5e,
0x92, 0x56, 0x2b, 0x34, 0x3e, 0x9d, 0x12, 0x0a, 0x54, 0x98, 0xf8, 0x29,
0xde, 0xa0, 0xdd, 0x11, 0x46, 0x3e, 0x0f, 0x70, 0xff, 0xee, 0x0d, 0x7c,
0x48, 0xe0, 0xe1, 0x6d, 0xb6, 0x5a, 0x2f, 0x7c, 0xb1, 0xb2, 0xf7, 0x2f,
0xda, 0x64, 0x33, 0x7e, 0x87, 0x48, 0x48, 0x7e, 0x95, 0x6c, 0xd5, 0x5c,
0x26, 0x8f, 0xc9, 0x3e, 0xf9, 0x5e, 0x99, 0x38, 0xf5, 0x32, 0xc2, 0x66,
0x55, 0x55, 0x55, 0x35, 0x7f, 0xb1, 0x0f, 0x47, 0xac, 0x5d, 0xec, 0x76,
0xba, 0x59, 0xc4, 0x7f, 0xfb, 0xdc, 0x32, 0x46, 0xe8, 0x83, 0xe0, 0x0a,
0xf4, 0xb8, 0x56, 0x36, 0x07, 0x4f, 0x7f, 0x29, 0x31, 0xb8, 0xf4, 0x2c,
0x7e, 0x42, 0xbd, 0x3e, 0xf1, 0x9d, 0x40, 0x73, 0x51, 0xf1, 0xce, 0x31,
0x35, 0x7b, 0x0e, 0x48, 0x9e, 0xb9, 0x6e, 0x3b, 0x37, 0x00, 0x57, 0x0c,
0x15, 0x25, 0x74, 0x64, 0xdd, 0x39, 0x64, 0x5c, 0x0a, 0x5d, 0x08, 0x2b,
0xf5, 0xe6, 0x0c, 0x3f, 0xe6, 0xce, 0x30, 0x2d, 0x27, 0xc4, 0x07, 0x19,
0x82, 0xfb, 0x44, 0x08, 0x7b, 0x94, 0x23, 0x69, 0x55, 0x55, 0x55, 0x35,
0xc7, 0xbe, 0xaf, 0x49, 0xa6, 0x9a, 0x26, 0x30, 0x7c, 0xb2, 0x66, 0x35,
0xe4, 0x83, 0x46, 0x62, 0xe3, 0x1c, 0x23, 0x07, 0x36, 0x2e, 0xd3, 0x00,
0xe2, 0x65, 0xc8, 0x51, 0x0c, 0x09, 0x5c, 0x74, 0x13, 0x94, 0xf9, 0x67,
0x4e, 0x07, 0x26, 0x03, 0xba, 0xb4, 0x3a, 0x7f, 0x38, 0xb4, 0x7c, 0x6a,
0x44, 0x7a, 0x1c, 0x7b, 0xeb, 0xf9, 0x8b, 0x0b, 0x16, 0xf8, 0x23, 0x36,
0x7b, 0x89, 0x79, 0x44, 0x80, 0xfe, 0x33, 0x2a, 0x7d, 0x59, 0xe2, 0x1b,
0x7b, 0xe1, 0xb0, 0x15, 0x21, 0xcb, 0x47, 0x77, 0x23, 0x1a, 0xc0, 0x14,
0x5b, 0x86, 0x06, 0x2d, 0x55, 0x55, 0x55, 0x35, 0x04, 0xb5, 0x47, 0x27,
0x1d, 0xb7, 0x22, 0x44, 0xcc, 0x9e, 0xce, 0x7d, 0xf2, 0x75, 0x78, 0x78,
0x7b, 0x98, 0x99, 0x12, 0xbd, 0x34, 0xe4, 0x43, 0xf0, 0x0a, 0x96, 0x43,
0xf1, 0x50, 0x1d, 0x0b, 0x86, 0x78, 0xc9, 0x59, 0xc7, 0x78, 0xec, 0x16,
0x71, 0xaa, 0x0c, 0x56, 0xbf, 0x92, 0xe2, 0x3a, 0xb5, 0x6e, 0x2d, 0x18,
0xe2, 0xc7, 0x31, 0x67, 0x10, 0xab, 0x9f, 0x27, 0x27, 0x1e, 0xf3, 0x69,
0xaf, 0x57, 0x42, 0x4c, 0x4f, 0xb4, 0x30, 0x35, 0x00, 0x54, 0xb0, 0x4a,
0xa2, 0x00, 0x2a, 0x4a, 0x3d, 0x49, 0x58, 0x73, 0xf9, 0x16, 0xb0, 0x01,
0x55, 0x55, 0x55, 0x35, 0xe4, 0xd5, 0x3f, 0x2e, 0xee, 0x84, 0x47, 0x51,
0x3f, 0x84, 0xb9, 0x6b, 0x49, 0xb9, 0xae, 0x57, 0x32, 0x5a, 0x04, 0x02,
0xe1, 0x6a, 0xf1, 0x4b, 0x30, 0x53, 0xf1, 0x05, 0x29, 0x74, 0x75, 0x76,
0x4a, 0x15, 0x5b, 0x5d, 0xe1, 0xaa, 0x15, 0x1b, 0x62, 0xf5, 0xe8, 0x76,
0x03, 0xc1, 0xaa, 0x06, 0x13, 0x59, 0xc8, 0x40, 0x84, 0x49, 0xc8, 0x1f,
0x85, 0x98, 0x55, 0x6b, 0xed, 0x38, 0x45, 0x17, 0xb8, 0xc7, 0xf7, 0x69,
0xc3, 0x87, 0xd0, 0x17, 0x0a, 0x93, 0xb7, 0x35, 0xc2, 0x45, 0x75, 0x34,
0x7a, 0x78, 0xff, 0x51, 0x26, 0xd2, 0x59, 0x13, 0x55, 0x55, 0x55, 0x35,
0x48, 0x38, 0xf7, 0x6e, 0x4f, 0x7d, 0xc7, 0x70, 0x32, 0x5d, 0x5b, 0x7a,
0x85, 0x35, 0x9c, 0x07, 0x40, 0x08, 0x30, 0x5c, 0x64, 0x69, 0x27, 0x7a,
0x07, 0x34, 0x90, 0x6c, 0x6e, 0xa6, 0x8e, 0x70, 0xd4, 0xf2, 0xf7, 0x59,
0x0f, 0x13, 0x17, 0x5d, 0xa8, 0xa9, 0x01, 0x29, 0xad, 0xfd, 0x9a, 0x77,
0x3c, 0x77, 0xc7, 0x67, 0xd0, 0x43, 0xb1, 0x3f, 0x97, 0x76, 0xe4, 0x72,
0xd4, 0x82, 0x9a, 0x25, 0xec, 0xef, 0xc3, 0x03, 0xdc, 0xf9, 0x94, 0x3f,
0xa4, 0x76, 0x88, 0x5a, 0xb8, 0x0f, 0x03, 0x76, 0x58, 0x87, 0x42, 0x11,
0x28, 0xb7, 0xb0, 0x1d, 0x55, 0x55, 0x55, 0x35, 0x2f, 0xe6, 0x44, 0x75,
0xf3, 0x0b, 0xe8, 0x68, 0x59, 0x72, 0x1f, 0x16, 0x8c, 0xd0, 0xe3, 0x3c,
0xcc, 0xfc, 0x77, 0x05, 0xd6, 0x4b, 0x48, 0x78, 0x51, 0x88, 0x4c, 0x5f,
0x30, 0x43, 0x9c, 0x2f, 0x49, 0x72, 0xba, 0x01, 0xba, 0xae, 0xfe, 0x0b,
0x94, 0x3f, 0xe7, 0x71, 0x9d, 0xfa, 0x37, 0x06, 0xfc, 0xa2, 0x99, 0x6f,
0xe2, 0x0d, 0xcf, 0x4b, 0x63, 0x76, 0xec, 0x49, 0xa8, 0xb5, 0x84, 0x0b,
0x84, 0xa3, 0x75, 0x4f, 0x5e, 0x56, 0xdd, 0x37, 0x1a, 0x7d, 0x6e, 0x34,
0x95, 0x39, 0x80, 0x1e, 0x58, 0x2e, 0x22, 0x50, 0xd3, 0x46, 0x93, 0x1e,
0x55, 0x55, 0x55, 0x35, 0xf5, 0x96, 0x5a, 0x5f, 0x9b, 0xc8, 0x58, 0x50,
0x3e, 0x03, 0xab, 0x16, 0xd5, 0xc6, 0x4c, 0x7f, 0x3f, 0x82, 0xf6, 0x34,
0x1c, 0x29, 0x22, 0x16, 0x40, 0xdb, 0xe7, 0x71, 0x8b, 0x8a, 0x4b, 0x55,
0x45, 0xbf, 0xd1, 0x68, 0x4c, 0xbb, 0xe3, 0x43, 0x1b, 0x96, 0x28, 0x3d,
0x36, 0x4f, 0xdb, 0x58, 0xa8, 0x39, 0xac, 0x38, 0xd3, 0xeb, 0x90, 0x18,
0x2f, 0xb7, 0x06, 0x1a, 0x5a, 0x82, 0x53, 0x13, 0x77, 0xaf, 0xe0, 0x4d,
0x9e, 0xe9, 0x39, 0x79, 0xb7, 0xf6, 0xa2, 0x3c, 0x41, 0x9d, 0x14, 0x59,
0x01, 0x33, 0x36, 0x20, 0x15, 0xe0, 0xe4, 0x15, 0x55, 0x55, 0x55, 0x35,
0x58, 0x48, 0x07, 0x36, 0x3f, 0x43, 0x1e, 0x05, 0x33, 0x9e, 0x14, 0x45,
0x69, 0xc8, 0x16, 0x63, 0x5f, 0xab, 0x77, 0x26, 0xf4, 0x08, 0xb0, 0x2e,
0xf8, 0x31, 0x79, 0x29, 0x37, 0xc9, 0x37, 0x28, 0x55, 0x62, 0xcc, 0x43,
0xeb, 0x6b, 0xe4, 0x03, 0xfe, 0x82, 0x50, 0x20, 0x2d, 0xdf, 0xf2, 0x7d,
0xba, 0x07, 0xe2, 0x0e, 0x88, 0x1e, 0x82, 0x2b, 0x87, 0x54, 0x26, 0x39,
0xdd, 0xee, 0x3e, 0x0b, 0xdc, 0xbf, 0x93, 0x1a, 0x8a, 0xce, 0xa6, 0x39,
0x5b, 0xaf, 0x8f, 0x00, 0x7a, 0xad, 0x27, 0x71, 0x1e, 0x76, 0xd8, 0x58,
0x96, 0x36, 0xa3, 0x14, 0x55, 0x55, 0x55, 0x35, 0x76, 0x27, 0x76, 0x62,
0xa4, 0x9f, 0x05, 0x5a, 0x41, 0x28, 0x49, 0x12, 0x24, 0x18, 0x49, 0x12,
0x4f, 0xc2, 0xa5, 0x25, 0x0e, 0x0e, 0x3c, 0x3c, 0x01, 0xa7, 0x65, 0x00,
0x92, 0x9e, 0x17, 0x36, 0xa1, 0x7a, 0x92, 0x27, 0xcf, 0x74, 0xba, 0x4d,
0xcb, 0x6f, 0x66, 0x68, 0xd8, 0x89, 0x9d, 0x58, 0xb6, 0x6d, 0xdb, 0x76,
0x11, 0x11, 0x11, 0x11, 0x00, 0x00, 0x00, 0x08, 0x0f, 0x0f, 0x0f, 0x0f,
0x38, 0x8e, 0xe3, 0x78, 0x28, 0xaf, 0xa1, 0x3c, 0xcc, 0xcc, 0xcc, 0x6c,
0x79, 0x9e, 0xe7, 0x79, 0xa2, 0x8b, 0x2e, 0x7a, 0xc8, 0x42, 0x16, 0x32
};
} // namespace poseidon_constants
#endif

View File

@@ -1,9 +1,13 @@
#include "poseidon/poseidon.cuh"
#pragma once
#ifndef POSEIDON_KERNELS_H
#define POSEIDON_KERNELS_H
#include "gpu-utils/modifiers.cuh"
#include "poseidon/constants.cuh"
namespace poseidon {
/**
 * Kernel that writes one element of the Poseidon state array per thread.
 *
 * NOTE(review): this span is a diff rendering without +/- markers. Both the old signature
 * (in-place `states` plus an `aligned` flag) and the new one (separate read-only `input`)
 * are present, and the lines between the two hunks (where `element_number` and
 * `prepared_element` are declared) are not shown.
 *
 * From the lines shown: the element with `element_number == 0` is set to `domain_tag`;
 * every other element is loaded from `input` (new revision) or from `states` itself
 * (old revision), and the result is stored to `states[idx]`.
 */
template <typename S, int T>
__global__ void prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, bool aligned)
__global__ void prepare_poseidon_states(const S* input, S* states, unsigned int number_of_states, const S domain_tag)
{
// Global thread index; each thread owns exactly the slot states[idx].
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Index of the state this slot belongs to (T slots per state).
int state_number = idx / T;
@@ -16,27 +20,27 @@ namespace poseidon {
if (element_number == 0) {
prepared_element = domain_tag;
} else {
// Old revision: read back from the state buffer, shifted by one when not aligned.
if (aligned) {
prepared_element = states[idx];
} else {
prepared_element = states[idx - 1];
}
// New revision: the index drops one slot per preceding state plus one more, i.e. the
// input is read as if it had no domain-tag slot (T - 1 input elements per state,
// assuming element_number == idx % T -- its definition is outside the shown lines).
prepared_element = input[idx - state_number - 1];
}
// We need __syncthreads here if the state is not aligned
// because then we need to shift the vector [A, B, 0] -> [D, A, B]
if (!aligned) { __syncthreads(); }
// Store element in state
states[idx] = prepared_element;
}
/**
 * Poseidon S-box: raises a field element to a fixed power.
 *
 * NOTE(review): diff rendering without +/- markers. The old `sbox_alpha_five`
 * (always element^5) and the new `sbox_el` (power chosen by `alpha`) are interleaved.
 *
 * @param element Value to exponentiate.
 * @param alpha   Exponent; the switch handles 3, 5, 7 and 11 only.
 * @return element^alpha for the handled exponents.
 *
 * NOTE(review): the switch has no `default` branch and nothing follows it, so for any
 * other `alpha` control flows off the end of a non-void function (undefined behavior).
 * Consider a default branch that reports the error or returns a defined value.
 */
template <typename S>
DEVICE_INLINE S sbox_alpha_five(S element)
DEVICE_INLINE S sbox_el(S element, const int alpha)
{
// Old revision: ((element^2)^2) * element = element^5.
S result = S::sqr(element);
result = S::sqr(result);
return result * element;
// New revision: element^2 is shared by every branch below.
S result2 = S::sqr(element);
switch (alpha) {
case 3:
// e^2 * e
return result2 * element;
case 5:
// e^4 * e
return S::sqr(result2) * element;
case 7:
// e^4 * e^2 * e
return S::sqr(result2) * result2 * element;
case 11:
// e^8 * e^2 * e
return S::sqr(S::sqr(result2)) * result2 * element;
}
}
template <typename S, int T>
@@ -71,7 +75,7 @@ namespace poseidon {
element = element + constants.round_constants[rc_offset + element_number];
rc_offset += T;
}
element = sbox_alpha_five(element);
element = sbox_el(element, constants.alpha);
if (!skip_rc) { element = element + constants.round_constants[rc_offset + element_number]; }
// Multiply all the states by mds matrix
@@ -111,7 +115,7 @@ namespace poseidon {
__device__ S partial_round(S state[T], size_t rc_offset, int round_number, const PoseidonConstants<S>& constants)
{
S element = state[0];
element = sbox_alpha_five(element);
element = sbox_el(element, constants.alpha);
element = element + constants.round_constants[rc_offset];
S* sparse_matrix = &constants.sparse_matrices[(T * 2 - 1) * round_number];
@@ -155,22 +159,58 @@ namespace poseidon {
}
}
/**
 * Copies hash results out of the state array, one thread per state.
 *
 * NOTE(review): diff rendering without +/- markers. The old `get_hash_results`
 * (one output element per state) and the new `squeeze_states_kernel`
 * (`rate` output elements per state) are interleaved.
 *
 * @param states           State array, T elements per state.
 * @param number_of_states Number of states; threads with idx beyond it exit early.
 * @param rate             Number of elements copied out of each state (new revision).
 * @param offset           Index of the first copied element inside each state (new revision).
 * @param out              Destination; state idx is written at out[idx * rate ...].
 *
 * NOTE(review): `idx` and `i` are signed `int` while `number_of_states` and `rate` are
 * `unsigned int` in the new signature, so both comparisons mix signedness.
 */
template <typename S, int T>
__global__ void get_hash_results(S* states, size_t number_of_states, S* out)
__global__ void
squeeze_states_kernel(const S* states, unsigned int number_of_states, unsigned int rate, unsigned int offset, S* out)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
// Old revision: a single output per state, taken from state slot 1.
out[idx] = states[idx * T + 1];
// New revision: copy `rate` consecutive slots starting at `offset`.
for (int i = 0; i < rate; i++) {
out[idx * rate + i] = states[idx * T + offset + i];
}
}
/**
 * Host-side launcher that runs the Poseidon permutation over `number_of_states` states.
 *
 * NOTE(review): diff rendering without +/- markers. Lines of the removed `copy_recursive`
 * kernel (its signature, the `idx` guard and the `state[...] = out[idx]` store) are
 * interleaved with the new function.
 *
 * Sequence visible in the new revision, all enqueued on `stream`:
 *   1. allocate a temporary buffer of number_of_states * T elements,
 *   2. prepare_poseidon_states fills it from `input` and `constants.domain_tag`,
 *   3. full_rounds (FIRST_FULL_ROUNDS), partial_rounds, full_rounds (SECOND_FULL_ROUNDS),
 *      advancing `rc_offset` between the launches,
 *   4. squeeze_states_kernel copies `output_len` elements per state, starting at slot 1, to `out`,
 *   5. free the temporary buffer.
 *
 * @param input            Source elements read by prepare_poseidon_states.
 * @param out              Destination written by squeeze_states_kernel.
 * @param number_of_states Number of states to process.
 * @param input_len        Not referenced anywhere in the lines shown.
 * @param output_len       Passed as the `rate` of squeeze_states_kernel.
 * @param constants        Round constants/matrices and the domain tag.
 * @param stream           CUDA stream used for the allocation, the launches and the free.
 * @return Result of CHK_LAST() (presumably the last recorded CUDA error -- macro not shown).
 *
 * NOTE(review): `number_of_states * T` is evaluated in `unsigned int` before being
 * multiplied by `sizeof(S)`, so it can wrap for very large state counts -- confirm whether
 * a widening cast is wanted.
 */
template <typename S, int T>
__global__ void copy_recursive(S* state, size_t number_of_states, S* out)
cudaError_t poseidon_permutation_kernel(
const S* input,
S* out,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const PoseidonConstants<S>& constants,
cudaStream_t& stream)
{
// Old revision (copy_recursive body).
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
// New revision: scratch state buffer, T elements per state.
S* states;
CHK_IF_RETURN(cudaMallocAsync(&states, number_of_states * T * sizeof(S), stream));
// Old revision (copy_recursive body): scatter out[idx] into slot (idx % (T - 1)) + 1 of state idx / (T - 1).
state[(idx / (T - 1) * T) + (idx % (T - 1)) + 1] = out[idx];
prepare_poseidon_states<S, T>
<<<PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T), 0, stream>>>(
input, states, number_of_states, constants.domain_tag);
// Running offset into the round constants, advanced after each phase.
size_t rc_offset = 0;
full_rounds<S, T><<<
PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T),
sizeof(S) * PKC::hashes_per_block(T) * T, stream>>>(
states, number_of_states, rc_offset, FIRST_FULL_ROUNDS, constants);
rc_offset += T * (constants.full_rounds_half + 1);
partial_rounds<S, T><<<PKC::number_of_singlehash_blocks(number_of_states), PKC::singlehash_block_size, 0, stream>>>(
states, number_of_states, rc_offset, constants);
rc_offset += constants.partial_rounds;
full_rounds<S, T><<<
PKC::number_of_full_blocks(T, number_of_states), PKC::number_of_threads(T),
sizeof(S) * PKC::hashes_per_block(T) * T, stream>>>(
states, number_of_states, rc_offset, SECOND_FULL_ROUNDS, constants);
// rate = output_len, offset = 1 (slot 0 is where prepare_poseidon_states put the domain tag).
squeeze_states_kernel<S, T>
<<<PKC::number_of_singlehash_blocks(number_of_states), PKC::singlehash_block_size, 0, stream>>>(
states, number_of_states, output_len, 1, out);
CHK_IF_RETURN(cudaFreeAsync(states, stream));
return CHK_LAST();
}
} // namespace poseidon
} // namespace poseidon
#endif

View File

@@ -8,132 +8,87 @@
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "poseidon/kernels.cuh"
#include "poseidon/constants.cuh"
#include "hash/hash.cuh"
using namespace hash;
/**
* @namespace poseidon
* Implementation of the [Poseidon hash function](https://eprint.iacr.org/2019/458.pdf)
* Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
*/
namespace poseidon {
// Boolean selectors naming the two full-round phases; the macro names suggest they are
// meant for the full-rounds kernel launches (first half vs. second half).
#define FIRST_FULL_ROUNDS true
#define SECOND_FULL_ROUNDS false
/**
* For most of the Poseidon configurations this is the case
* TODO: Add support for different full rounds numbers
*/
// NOTE(review): presumably the number of full rounds in each half -- confirm at the use sites.
const int FULL_ROUNDS_DEFAULT = 4;
/**
* @struct PoseidonConstants
* These constants are enough to define a Poseidon instance
* @param round_constants A pointer to round constants allocated on the device
* @param mds_matrix A pointer to an mds matrix allocated on the device
* @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
* @param sparse_matrices A pointer to sparse matrices allocated on the device
*
* All four pointers default to nullptr.
*
* NOTE(review): this definition has no `alpha` member, although kernels elsewhere in this
* dump read `constants.alpha`; presumably this is the removed revision and the current
* struct lives in poseidon/constants.cuh -- confirm.
*/
template <typename S>
struct PoseidonConstants {
// Presumably the number of inputs per hash (state width minus one) -- confirm.
int arity;
// Round counts: partial rounds, and full rounds per half.
int partial_rounds;
int full_rounds_half;
S* round_constants = nullptr;
S* mds_matrix = nullptr;
S* non_sparse_matrix = nullptr;
S* sparse_matrices = nullptr;
// Value placed in the first element of every state when the states are prepared.
S domain_tag;
};
// NOTE(review): this span is a diff rendering without +/- markers. Two classes are
// interleaved line by line:
//   * the removed `PoseidonKernelsConfiguration<T>` (static launch-parameter helpers), and
//   * the added `Poseidon`, a `SpongeHasher<S, S>` that owns a `PoseidonConstants<S>`.
// The comments below label which revision each fragment belongs to.
/**
* @class PoseidonKernelsConfiguration
* Describes the logic of deriving CUDA kernels parameters
* such as the number of threads and the number of blocks
*/
template <int T>
class PoseidonKernelsConfiguration
class Poseidon : public SpongeHasher<S, S>
{
public:
// The logic behind this is that 1 thread only works on 1 element
// We have {T} elements in each state, and {number_of_states} states total
// (Old class.) Integer division makes this the largest multiple of T that is <= 256.
static const int number_of_threads = 256 / T * T;
// (New class.) Device id captured from the context passed to the constructor; reused by the destructor.
const std::size_t device_id;
// (New class.) Constants filled in by the constructors and released by the destructor.
PoseidonConstants<S> constants;
// The partial rounds operates on the whole state, so we define
// the parallelism params for processing a single hash preimage per thread
static const int singlehash_block_size = 128;
static const int hashes_per_block = number_of_threads / T;
// (Old class.) Signature of number_of_full_blocks; its body is the ceil-division a few lines below.
static int number_of_full_blocks(size_t number_of_states)
// (New class.) Dispatches to poseidon_permutation_kernel<S, width> for widths 3, 5, 9 and 12
// and raises InvalidArgument for any other width. The result of the selected call is
// checked with CHK_IF_RETURN and the function returns CHK_LAST().
cudaError_t run_hash_many_kernel(
const S* input,
S* output,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const device_context::DeviceContext& ctx) const override
{
// (Old class.) ceil(number_of_states * T / number_of_threads).
int total_number_of_threads = number_of_states * T;
return total_number_of_threads / number_of_threads +
static_cast<bool>(total_number_of_threads % number_of_threads);
cudaError_t permutation_error;
// NOTE(review): P_PERM_T is never #undef'd in the lines shown, so it stays defined for
// the rest of the translation unit.
#define P_PERM_T(width) \
case width: \
permutation_error = poseidon_permutation_kernel<S, width>( \
input, output, number_of_states, input_len, output_len, this->constants, ctx.stream); \
break;
switch (this->width) {
P_PERM_T(3)
P_PERM_T(5)
P_PERM_T(9)
P_PERM_T(12)
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [3, 5, 9, 12]");
}
CHK_IF_RETURN(permutation_error);
return CHK_LAST();
}
// (Old class.) Signature of number_of_singlehash_blocks; its body is the `return` inside
// the constructor braces below.
static int number_of_singlehash_blocks(size_t number_of_states)
// (New class.) Builds the hasher from caller-supplied parameters and constant arrays via
// create_optimized_poseidon_constants. The base class is constructed with
// (arity + 1, arity, arity, 1).
Poseidon(
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const S* round_constants,
const S* mds_matrix,
const S* non_sparse_matrix,
const S* sparse_matrices,
const S domain_tag,
device_context::DeviceContext& ctx)
: SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
{
// (Old class.) ceil(number_of_states / singlehash_block_size).
return number_of_states / singlehash_block_size + static_cast<bool>(number_of_states % singlehash_block_size);
PoseidonConstants<S> constants;
CHK_STICKY(create_optimized_poseidon_constants(
arity, alpha, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
domain_tag, &constants, ctx));
this->constants = constants;
}
// (New class.) Builds the hasher for `arity` using init_optimized_poseidon_constants.
Poseidon(int arity, device_context::DeviceContext& ctx)
: SpongeHasher<S, S>(arity + 1, arity, arity, 1), device_id(ctx.device_id)
{
PoseidonConstants<S> constants{};
CHK_STICKY(init_optimized_poseidon_constants(arity, ctx, &constants));
this->constants = constants;
}
// (New class.) Releases the constants, using the default device context with its
// device_id replaced by the id captured at construction.
// NOTE(review): no copy/move constructor or assignment is declared or deleted in the lines
// shown; if release_optimized_poseidon_constants frees device memory, copying a Poseidon
// object would presumably release the same constants twice -- confirm and consider
// deleting the copy operations.
~Poseidon()
{
auto ctx = device_context::get_default_device_context();
ctx.device_id = this->device_id;
CHK_STICKY(release_optimized_poseidon_constants<S>(&this->constants, ctx));
}
};
// Shorthand alias for PoseidonKernelsConfiguration<T>.
template <int T>
using PKC = PoseidonKernelsConfiguration<T>;
/**
 * @struct PoseidonConfig
 * Bundle of options that control a Poseidon hashing call.
 */
struct PoseidonConfig {
  /** Device-related details such as the device id and the stream id. */
  device_context::DeviceContext ctx;

  /** True when the inputs are on the device, false when they are on the host. Default value: false. */
  bool are_inputs_on_device;

  /** When true the output is kept on the device, otherwise it is placed on the host. Default value: false. */
  bool are_outputs_on_device;

  /**
   * When true, the input is treated as a states vector that holds the preimages in aligned or
   * not aligned format, and the memory under the input pointer is used for the states.
   * When false, fresh states memory is allocated and the input is copied into it.
   */
  bool input_is_a_state;

  /**
   * When true, the input must already be aligned for the poseidon permutation.
   * Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
   * Not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
   */
  bool aligned;

  /** When true, the hash results are also copied into the input pointer in aligned format. */
  bool loop_state;

  /**
   * Whether to run Poseidon asynchronously. When `true`, the poseidon_hash function is
   * non-blocking and must be synchronized explicitly with `cudaStreamSynchronize` or
   * `cudaDeviceSynchronize`. When false, the poseidon_hash function blocks the current CPU thread.
   */
  bool is_async;
};
/**
 * Produces a PoseidonConfig bound to `ctx` with every boolean option switched off.
 *
 * @param t   Not referenced by the body.
 * @param ctx Device context stored in the config; defaults to the default device context.
 * @return The initialized config.
 */
static PoseidonConfig default_poseidon_config(
  int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
  return PoseidonConfig{
    ctx,   // ctx
    false, // are_inputs_on_device
    false, // are_outputs_on_device
    false, // input_is_a_state
    false, // aligned
    false, // loop_state
    false, // is_async
  };
}
/**
* Loads pre-calculated optimized constants, moves them to the device
*/
template <typename S>
cudaError_t
init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
/**
* Compute the poseidon hash over a sequence of preimages.
* Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
* @param T size of the poseidon state, should be equal to {arity + 1}
* @param input a pointer to the input data. May be allocated on device or on host, regulated
* by the config. May point to a string of preimages or a string of states filled with preimages.
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be at least of size [number_of_states](@ref number_of_states)
* @param number_of_states number of input blocks of size T-1 (arity)
*/
template <typename S, int T>
cudaError_t poseidon_hash(
S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config);
} // namespace poseidon
#endif

View File

@@ -1,74 +0,0 @@
#pragma once
#ifndef MERKLE_H
#define MERKLE_H
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "poseidon/poseidon.cuh"
#include <iostream>
#include <math.h>
using namespace poseidon;
/**
* @namespace merkle
* Implementation of the [Poseidon](@ref poseidon) [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree) builder,
* parallelized for the use on GPU
*/
namespace merkle {
static constexpr size_t GIGA = 1024 * 1024 * 1024;
/// Bytes per stream
static constexpr size_t STREAM_CHUNK_SIZE = 1024 * 1024 * 1024;
/**
* @struct TreeBuilderConfig
* Struct that encodes various Tree builder parameters.
*/
struct TreeBuilderConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
int keep_rows; /**< How many rows of the Merkle tree should be written to output. '0' means all of them */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool is_async; /**< Whether to run the tree builder asynchronously. If set to `true`, the build_merkle_tree
* function will be non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
* function will block the current CPU thread. */
};
/**
 * Returns a TreeBuilderConfig bound to `ctx` that keeps all rows (keep_rows = 0),
 * expects host inputs and runs synchronously.
 * @param ctx device context stored in the config (defaults to the default device context)
 */
static TreeBuilderConfig
default_merkle_config(const device_context::DeviceContext& ctx = device_context::get_default_device_context())
{
  return TreeBuilderConfig{
    ctx,   // ctx
    0,     // keep_rows
    false, // are_inputs_on_device
    false, // is_async
  };
}
/**
* Builds the Poseidon Merkle tree
*
* @param leaves a pointer to the leaves layer. May be allocated on device or on host, regulated by the config
* Expected to have arity ^ (height - 1) elements
* @param digests a pointer to the digests storage. May only be allocated on the host
* Expected to have `sum(arity ^ (i)) for i in [0..height-1]`
* @param height the height of the merkle tree
* @param poseidon the Poseidon constants used for hashing (presumably for a state of size T - confirm)
* @param config a TreeBuilderConfig (device context, rows to keep, input location, sync/async execution)
* @return a cudaError_t status code
* # Algorithm
* The function will split a large tree into many subtrees of a size that fits `STREAM_CHUNK_SIZE`.
* Each subtree is built in its own stream (there is a maximum number of streams).
* After all subtrees are constructed, the function will combine the resulting sub-digests into the final top-tree
*/
template <typename S, int T>
cudaError_t build_merkle_tree(
const S* leaves,
S* digests,
uint32_t height,
const PoseidonConstants<S>& poseidon,
const TreeBuilderConfig& config);
} // namespace merkle
#endif

View File

@@ -0,0 +1,65 @@
#pragma once
#ifndef POSEIDON2_CONSTANTS_H
#define POSEIDON2_CONSTANTS_H
#include "gpu-utils/device_context.cuh"
namespace poseidon2 {
/**
* For most of the Poseidon2 configurations this is the case
*/
const int EXTERNAL_ROUNDS_DEFAULT = 8;
enum DiffusionStrategy {
DEFAULT_DIFFUSION,
MONTGOMERY,
};
enum MdsType { DEFAULT_MDS, PLONKY };
/**
* @struct Poseidon2Constants
* These constants are enough to define a Poseidon2 instance
* @param width Size of the Poseidon2 state, in elements
* @param alpha Exponent used by the S-box
* @param internal_rounds Number of internal rounds
* @param external_rounds Number of external rounds
* @param round_constants A pointer to round constants allocated on the device
* @param internal_matrix_diag A pointer to the diagonal of the internal matrix
* (presumably allocated on the device as well - confirm against create_poseidon2_constants)
* @param mds_type Which MDS variant to use (DEFAULT_MDS or PLONKY)
* @param diffusion Which diffusion strategy to use (DEFAULT_DIFFUSION or MONTGOMERY)
*/
template <typename S>
struct Poseidon2Constants {
int width;
int alpha;
int internal_rounds;
int external_rounds;
S* round_constants = nullptr;
S* internal_matrix_diag = nullptr;
MdsType mds_type;
DiffusionStrategy diffusion;
};
template <typename S>
cudaError_t create_poseidon2_constants(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const S* round_constants,
const S* internal_matrix_diag,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon_constants);
template <typename S>
cudaError_t init_poseidon2_constants(
int width,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon2_constants);
template <typename S>
cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx);
} // namespace poseidon2
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -3,13 +3,14 @@ from sage.rings.polynomial.polynomial_gf2x import GF2X_BuildIrred_list
from math import *
import itertools
CURVE_NAME = "bn254"
CURVE_NAME = "m31"
###########################################################################
# p = 18446744069414584321 # GoldiLocks
# p = 2013265921 # BabyBear
p = 2**31 - 1 # M31
# p = 52435875175126190479447740508185965837690552500527637822603658699938581184513 # BLS12-381
p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
# p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 # BN254/BN256
# p = 28948022309329048855892746252171976963363056481941560715954676764349967630337 # Pasta (Pallas)
# p = 28948022309329048855892746252171976963363056481941647379679742748393362948097 # Pasta (Vesta)
@@ -617,6 +618,8 @@ print(f"namespace poseidon2_constants_{CURVE_NAME} {{")
for t in TS:
NUM_CELLS = t
R_F_FIXED, R_P_FIXED, _, _ = poseidon_calc_final_numbers_fixed(p, t, alpha, 128, True)
if t == 16:
R_P_FIXED = 14
INIT_SEQUENCE = []

View File

@@ -1,7 +1,28 @@
#include "poseidon/poseidon.cuh"
#pragma once
#ifndef POSEIDON2_KERNELS_H
#define POSEIDON2_KERNELS_H
#include "utils/utils.h"
#include "hash/hash.cuh"
#include "matrix/matrix.cuh"
#include "poseidon2/constants.cuh"
#include "gpu-utils/modifiers.cuh"
using matrix::Matrix;
namespace poseidon2 {
static DEVICE_INLINE unsigned int d_next_pow_of_two(unsigned int v)
{
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
template <typename S>
DEVICE_INLINE S sbox_el(S element, const int alpha)
{
@@ -19,7 +40,7 @@ namespace poseidon2 {
}
template <typename S, int T>
DEVICE_INLINE S sbox(S state[T], const int alpha)
DEVICE_INLINE void sbox(S state[T], const int alpha)
{
for (int i = 0; i < T; i++) {
state[i] = sbox_el(state[i], alpha);
@@ -27,7 +48,7 @@ namespace poseidon2 {
}
template <typename S, int T>
DEVICE_INLINE S add_rc(S state[T], size_t rc_offset, const S* rc)
DEVICE_INLINE void add_rc(S state[T], size_t rc_offset, const S* rc)
{
for (int i = 0; i < T; i++) {
state[i] = state[i] + rc[rc_offset + i];
@@ -35,7 +56,7 @@ namespace poseidon2 {
}
template <typename S>
__device__ S mds_light_4x4(S s[4])
__device__ void mds_light_4x4(S s[4])
{
S t0 = s[0] + s[1];
S t1 = s[2] + s[3];
@@ -56,7 +77,7 @@ namespace poseidon2 {
// [ 3 1 1 2 ].
// https://github.com/Plonky3/Plonky3/blob/main/poseidon2/src/matrix.rs#L36
template <typename S>
__device__ S mds_light_plonky_4x4(S s[4])
__device__ void mds_light_plonky_4x4(S s[4])
{
S t01 = s[0] + s[1];
S t23 = s[2] + s[3];
@@ -70,7 +91,7 @@ namespace poseidon2 {
}
template <typename S, int T>
__device__ S mds_light(S state[T], MdsType mds)
__device__ void mds_light(S state[T], MdsType mds)
{
S sum;
switch (T) {
@@ -123,7 +144,7 @@ namespace poseidon2 {
}
template <typename S, int T>
__device__ S internal_round(S state[T], size_t rc_offset, const Poseidon2Constants<S>& constants)
__device__ void internal_round(S state[T], size_t rc_offset, const Poseidon2Constants<S>& constants)
{
S element = state[0];
element = element + constants.round_constants[rc_offset];
@@ -176,17 +197,8 @@ namespace poseidon2 {
}
template <typename S, int T>
__global__ void poseidon2_permutation_kernel(
const S* states, S* states_out, size_t number_of_states, const Poseidon2Constants<S> constants)
__device__ void permute_state(S state[T], const Poseidon2Constants<S>& constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
S state[T];
UNROLL
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}
unsigned int rn;
mds_light<S, T>(state, constants.mds_type);
@@ -213,6 +225,22 @@ namespace poseidon2 {
mds_light<S, T>(state, constants.mds_type);
rc_offset += T;
}
}
template <typename S, int T>
__global__ void permutation_kernel(
const S* states, S* states_out, unsigned int number_of_states, const Poseidon2Constants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
S state[T];
UNROLL
for (int i = 0; i < T; i++) {
state[i] = states[idx * T + i];
}
permute_state<S, T>(state, constants);
UNROLL
for (int i = 0; i < T; i++) {
@@ -220,13 +248,120 @@ namespace poseidon2 {
}
}
// These function is just doing copy from the states to the output
template <typename S, int T>
__global__ void get_hash_results(const S* states, size_t number_of_states, int index, S* out)
__global__ void hash_many_kernel(
const S* input,
S* output,
uint64_t number_of_states,
unsigned int input_len,
unsigned int output_len,
const Poseidon2Constants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
uint64_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= number_of_states) { return; }
out[idx] = states[idx * T + index];
S state[T] = {0};
UNROLL
for (int i = 0; i < input_len; i++) {
state[i] = input[idx * input_len + i];
}
permute_state<S, T>(state, constants);
UNROLL
for (int i = 0; i < output_len; i++) {
output[idx * output_len + i] = state[i];
}
}
} // namespace poseidon2
template <typename S, int T>
__device__ void absorb_2d_state(
const Matrix<S>* inputs,
S state[T],
unsigned int number_of_inputs,
unsigned int rate,
uint64_t row_idx,
const Poseidon2Constants<S>& constants)
{
unsigned int index = 0;
for (int i = 0; i < number_of_inputs; i++) {
const Matrix<S>* input = inputs + i;
for (int j = 0; j < input->width; j++) {
state[index] = input->values[row_idx * input->width + j];
index++;
if (index == rate) {
permute_state<S, T>(state, constants);
index = 0;
}
}
}
if (index) { permute_state<S, T>(state, constants); }
}
template <typename S, int T>
__global__ void hash_2d_kernel(
const Matrix<S>* inputs,
S* output,
unsigned int number_of_inputs,
unsigned int rate,
unsigned int output_len,
const Poseidon2Constants<S> constants)
{
uint64_t idx = (blockIdx.x * blockDim.x) + threadIdx.x;
if (idx >= inputs[0].height) { return; }
S state[T] = {0};
absorb_2d_state<S, T>(inputs, state, number_of_inputs, rate, idx, constants);
UNROLL
for (int i = 0; i < output_len; i++) {
output[idx * output_len + i] = state[i];
}
}
template <typename S, int T>
__global__ void compress_and_inject_kernel(
const Matrix<S>* matrices_to_inject,
unsigned int number_of_inputs,
const S* prev_layer,
S* next_layer,
unsigned int rate,
unsigned int digest_elements,
const Poseidon2Constants<S> constants)
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
uint64_t number_of_rows = d_next_pow_of_two(matrices_to_inject[0].height);
if (idx >= number_of_rows) { return; }
size_t next_layer_len = matrices_to_inject[0].height;
S state_to_compress[T] = {S::zero()};
for (int i = 0; i < digest_elements * 2; i++) {
state_to_compress[i] = prev_layer[idx * 2 * digest_elements + i];
}
permute_state<S, T>(state_to_compress, constants);
S injected_state[T] = {S::zero()};
if (idx < next_layer_len) {
absorb_2d_state<S, T>(matrices_to_inject, injected_state, number_of_inputs, rate, idx, constants);
for (int i = 0; i < digest_elements; i++) {
injected_state[digest_elements + i] = injected_state[i];
injected_state[i] = state_to_compress[i];
}
} else {
for (int i = 0; i < digest_elements; i++) {
injected_state[i] = state_to_compress[i];
}
}
permute_state<S, T>(injected_state, constants);
for (int i = 0; i < digest_elements; i++) {
next_layer[idx * digest_elements + i] = injected_state[i];
}
}
} // namespace poseidon2
#endif

View File

@@ -8,124 +8,172 @@
#include "gpu-utils/error_handler.cuh"
#include "utils/utils.h"
#include "hash/hash.cuh"
#include "matrix/matrix.cuh"
#include "poseidon2/constants.cuh"
#include "poseidon2/kernels.cuh"
using matrix::Matrix;
/**
* @namespace poseidon2
* Implementation of the [Poseidon2 hash function](https://eprint.iacr.org/2023/323.pdf)
* Specifically, the optimized [Filecoin version](https://spec.filecoin.io/algorithms/crypto/poseidon/)
*/
namespace poseidon2 {
/**
* For most of the Poseidon2 configurations this is the case
*/
const int EXTERNAL_ROUNDS_DEFAULT = 8;
enum DiffusionStrategy {
DEFAULT_DIFFUSION,
MONTGOMERY,
};
enum MdsType { DEFAULT_MDS, PLONKY };
enum PoseidonMode {
COMPRESSION,
PERMUTATION,
};
/**
* @struct Poseidon2Constants
* This constants are enough to define a Poseidon2 instantce
* @param round_constants A pointer to round constants allocated on the device
* @param mds_matrix A pointer to an mds matrix allocated on the device
* @param non_sparse_matrix A pointer to non sparse matrix allocated on the device
* @param sparse_matrices A pointer to sparse matrices allocated on the device
*/
template <typename S>
struct Poseidon2Constants {
int width;
int alpha;
int internal_rounds;
int external_rounds;
S* round_constants = nullptr;
S* internal_matrix_diag = nullptr;
MdsType mds_type;
DiffusionStrategy diffusion;
};
/**
* @struct Poseidon2Config
* Struct that encodes various Poseidon2 parameters.
*/
struct Poseidon2Config {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_states_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
PoseidonMode mode;
int output_index;
bool
is_async; /**< Whether to run the Poseidon2 asynchronously. If set to `true`, the poseidon_hash function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the poseidon_hash
* function will block the current CPU thread. */
};
static Poseidon2Config default_poseidon2_config(
int t, const device_context::DeviceContext& ctx = device_context::get_default_device_context())
class Poseidon2 : public hash::SpongeHasher<S, S>
{
Poseidon2Config config = {
ctx, // ctx
false, // are_states_on_device
false, // are_outputs_on_device
PoseidonMode::COMPRESSION,
1, // output_index
false, // is_async
};
return config;
}
static const int POSEIDON_BLOCK_SIZE = 32;
template <typename S>
cudaError_t create_poseidon2_constants(
int width,
int alpha,
int internal_rounds,
int external_rounds,
const S* round_constants,
const S* internal_matrix_diag,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon_constants);
static inline int poseidon_number_of_blocks(size_t number_of_states)
{
return number_of_states / POSEIDON_BLOCK_SIZE + static_cast<bool>(number_of_states % POSEIDON_BLOCK_SIZE);
}
/**
* Loads pre-calculated optimized constants, moves them to the device
*/
template <typename S>
cudaError_t init_poseidon2_constants(
int width,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* constants);
public:
const std::size_t device_id;
Poseidon2Constants<S> constants;
template <typename S>
cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx);
/**
* Launches hash_2d_kernel on `ctx.stream`, using enough blocks of POSEIDON_BLOCK_SIZE threads
* to cover `number_of_rows` threads.
* The runtime `this->width` is dispatched to the kernel's compile-time state size;
* an unsupported width raises an InvalidArgument error.
* @param inputs array of `number_of_inputs` matrices; it is dereferenced inside the kernel
* @param output receives `output_len` elements per row
* @param number_of_inputs number of matrices in `inputs`
* @param output_len number of state elements written per row
* @param number_of_rows number of threads to cover with the launch grid
* @param ctx device context providing the stream to launch on
* @return the status reported by cudaPeekAtLastError / CHK_LAST after the launch
*/
cudaError_t hash_2d(
const Matrix<S>* inputs,
S* output,
unsigned int number_of_inputs,
unsigned int output_len,
uint64_t number_of_rows,
const device_context::DeviceContext& ctx) const override
{
// One `case` per supported state width; each instantiates the kernel for that width.
#define P2_HASH_2D_T(width) \
case width: \
hash_2d_kernel<S, width><<<poseidon_number_of_blocks(number_of_rows), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>( \
inputs, output, number_of_inputs, this->rate, output_len, this->constants); \
break;
switch (this->width) {
P2_HASH_2D_T(2)
P2_HASH_2D_T(3)
P2_HASH_2D_T(4)
P2_HASH_2D_T(8)
P2_HASH_2D_T(12)
P2_HASH_2D_T(16)
P2_HASH_2D_T(20)
P2_HASH_2D_T(24)
default:
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "PoseidonAbsorb2d: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
}
// Surface launch errors without clearing them
CHK_IF_RETURN(cudaPeekAtLastError());
return CHK_LAST();
}
/**
* Launches hash_many_kernel on `ctx.stream`, using enough blocks of POSEIDON_BLOCK_SIZE threads
* to cover `number_of_states` threads.
* The runtime `this->width` is dispatched to the kernel's compile-time state size;
* an unsupported width raises an InvalidArgument error.
* @param input `input_len` elements per state, read by the kernel
* @param output `output_len` elements per state, written by the kernel
* @param number_of_states number of independent states to hash
* @param input_len number of input elements loaded into each state
* @param output_len number of state elements written out per state
* @param ctx device context providing the stream to launch on
* @return the status reported by cudaPeekAtLastError / CHK_LAST after the launch
*/
cudaError_t run_hash_many_kernel(
const S* input,
S* output,
unsigned int number_of_states,
unsigned int input_len,
unsigned int output_len,
const device_context::DeviceContext& ctx) const override
{
// One `case` per supported state width; each instantiates the kernel for that width.
#define P2_HASH_MANY_T(width) \
case width: \
hash_many_kernel<S, width><<<poseidon_number_of_blocks(number_of_states), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>( \
input, output, number_of_states, input_len, output_len, this->constants); \
break;
switch (this->width) {
P2_HASH_MANY_T(2)
P2_HASH_MANY_T(3)
P2_HASH_MANY_T(4)
P2_HASH_MANY_T(8)
P2_HASH_MANY_T(12)
P2_HASH_MANY_T(16)
P2_HASH_MANY_T(20)
P2_HASH_MANY_T(24)
default:
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
}
// Surface launch errors without clearing them
CHK_IF_RETURN(cudaPeekAtLastError());
return CHK_LAST();
}
/**
* Launches compress_and_inject_kernel on `ctx.stream`, using enough blocks of POSEIDON_BLOCK_SIZE
* threads to cover `number_of_rows` threads.
* `number_of_rows` only sizes the launch grid; it is not passed to the kernel.
* The runtime `this->width` is dispatched to the kernel's compile-time state size;
* an unsupported width raises an InvalidArgument error.
* @param matrices_to_inject matrices whose rows are injected into the layer being built
* @param number_of_inputs number of matrices in `matrices_to_inject`
* @param number_of_rows number of threads to cover with the launch grid
* @param prev_layer digests of the previous layer
* @param next_layer receives the digests of the layer being built
* @param digest_elements number of elements in one digest
* @param ctx device context providing the stream to launch on
* @return the status reported by cudaPeekAtLastError / CHK_LAST after the launch
*/
cudaError_t compress_and_inject(
const Matrix<S>* matrices_to_inject,
unsigned int number_of_inputs,
uint64_t number_of_rows,
const S* prev_layer,
S* next_layer,
unsigned int digest_elements,
const device_context::DeviceContext& ctx) const override
{
// One `case` per supported state width; each instantiates the kernel for that width.
#define P2_COMPRESS_AND_INJECT_T(width) \
case width: \
compress_and_inject_kernel<S, width> \
<<<poseidon_number_of_blocks(number_of_rows), POSEIDON_BLOCK_SIZE, 0, ctx.stream>>>( \
matrices_to_inject, number_of_inputs, prev_layer, next_layer, this->rate, digest_elements, this->constants); \
break;
switch (this->width) {
P2_COMPRESS_AND_INJECT_T(2)
P2_COMPRESS_AND_INJECT_T(3)
P2_COMPRESS_AND_INJECT_T(4)
P2_COMPRESS_AND_INJECT_T(8)
P2_COMPRESS_AND_INJECT_T(12)
P2_COMPRESS_AND_INJECT_T(16)
P2_COMPRESS_AND_INJECT_T(20)
P2_COMPRESS_AND_INJECT_T(24)
default:
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "PoseidonPermutation: #width must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
}
// Surface launch errors without clearing them
CHK_IF_RETURN(cudaPeekAtLastError());
return CHK_LAST();
}
Poseidon2(
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const S* round_constants,
const S* internal_matrix_diag,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx)
: hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
{
Poseidon2Constants<S> constants;
CHK_STICKY(create_poseidon2_constants(
width, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type, diffusion, ctx,
&constants));
this->constants = constants;
}
Poseidon2(
unsigned int width,
unsigned int rate,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx)
: hash::SpongeHasher<S, S>(width, width, rate, 0), device_id(ctx.device_id)
{
Poseidon2Constants<S> constants;
CHK_STICKY(init_poseidon2_constants(width, mds_type, diffusion, ctx, &constants));
this->constants = constants;
}
// Releases the constants held by this hasher.
// The release runs against the default device context with its device id replaced by the
// id recorded at construction time.
// NOTE(review): no copy constructor / copy assignment is declared in the visible part of the
// class; if instances can be copied, each copy would release the same constants - confirm.
~Poseidon2()
{
auto ctx = device_context::get_default_device_context();
ctx.device_id = this->device_id;
CHK_STICKY(release_poseidon2_constants<S>(&this->constants, ctx));
}
};
/**
* Compute the poseidon hash over a sequence of preimages.
* Takes {number_of_states * (T-1)} elements of input and computes {number_of_states} hash images
* @param T size of the poseidon state, should be equal to {arity + 1}
* @param states a pointer to the input data. May be allocated on device or on host, regulated
* by the config. May point to a string of preimages or a string of states filled with preimages.
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be at least of size [number_of_states](@ref number_of_states)
* @param number_of_states number of input blocks of size T-1 (arity)
*/
template <typename S, int T>
cudaError_t poseidon2_hash(
const S* states,
S* output,
size_t number_of_states,
const Poseidon2Constants<S>& constants,
const Poseidon2Config& config);
} // namespace poseidon2
#endif

View File

@@ -5,4 +5,15 @@
#define CONCAT_DIRECT(a, b) a##_##b
#define CONCAT_EXPAND(a, b) CONCAT_DIRECT(a, b) // expand a,b before concatenation
// Rounds `v` up to the nearest power of two (a power of two is returned unchanged).
// The arithmetic wraps: the result is 0 for v == 0 and for v above 2^31.
static unsigned int next_pow_of_two(unsigned int v)
{
  unsigned int smeared = v - 1;
  // Propagate the highest set bit into every lower position.
  for (unsigned int shift = 1; shift <= 16; shift <<= 1) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}
#endif // ICICLE_UTILS_H

View File

@@ -105,12 +105,12 @@ namespace vec_ops {
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
template <typename E>
cudaError_t transpose_batch(
cudaError_t transpose_matrix(
const E* mat_in,
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
const device_context::DeviceContext& ctx,
bool on_device,
bool is_async);

View File

@@ -11,6 +11,9 @@ set(SRC ${CMAKE_SOURCE_DIR}/src)
set(FIELD_SOURCE ${SRC}/fields/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/vec_ops/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/merkle-tree/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/merkle-tree/extern_mmcs.cu)
if(EXT_FIELD)
list(APPEND FIELD_SOURCE ${SRC}/fields/extern_extension.cu)
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
@@ -27,8 +30,6 @@ set(POLYNOMIAL_SOURCE_FILES
# TODO: impl poseidon for small fields. note that it needs to be defined over the extension field!
if (DEFINED CURVE)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/poseidon.cu)
list(APPEND FIELD_SOURCE ${SRC}/poseidon/tree/merkle.cu)
endif()
if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_POSEIDON2)

View File

@@ -1,5 +1,5 @@
set(TARGET icicle_hash)
add_library(${TARGET} STATIC keccak/keccak.cu)
add_library(${TARGET} STATIC keccak/extern.cu)
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")

1
icicle/src/hash/keccak/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
test_keccak

View File

@@ -1,2 +1,6 @@
test_keccak: test.cu keccak.cu
nvcc -o test_keccak -I. -I../.. test.cu
nvcc -o test_keccak -I../../../include test.cu
./test_keccak
clear:
rm test_keccak

View File

@@ -0,0 +1,20 @@
#include "utils/utils.h"
#include "gpu-utils/error_handler.cuh"
#include "hash/hash.cuh"
#include "hash/keccak/keccak.cuh"
#include "keccak.cu"
namespace keccak {
// C entry point for Keccak-256: forwards to keccak_hash with C = 512 and D = 256
// (the template parameters of keccak_hash are <C, D>).
// Hashes `number_of_blocks` inputs of `input_block_size` bytes each.
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
}
// C entry point for Keccak-512: forwards to keccak_hash with C = 1024 and D = 512.
// Hashes `number_of_blocks` inputs of `input_block_size` bytes each.
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
}
} // namespace keccak

View File

@@ -1,227 +1,14 @@
#include <cstdint>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "hash/hash.cuh"
#include "hash/keccak/keccak.cuh"
#include "kernels.cu"
using namespace hash;
namespace keccak {
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}
#define THETA( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; \
s01 ^= t0; \
s02 ^= t0; \
s03 ^= t0; \
s04 ^= t0; \
\
s10 ^= t1; \
s11 ^= t1; \
s12 ^= t1; \
s13 ^= t1; \
s14 ^= t1; \
\
s20 ^= t2; \
s21 ^= t2; \
s22 ^= t2; \
s23 ^= t2; \
s24 ^= t2; \
\
s30 ^= t3; \
s31 ^= t3; \
s32 ^= t3; \
s33 ^= t3; \
s34 ^= t3; \
\
s40 ^= t4; \
s41 ^= t4; \
s42 ^= t4; \
s43 ^= t4; \
s44 ^= t4; \
}
#define RHOPI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = ROTL64(s10, (uint64_t)1); \
s10 = ROTL64(s11, (uint64_t)44); \
s11 = ROTL64(s41, (uint64_t)20); \
s41 = ROTL64(s24, (uint64_t)61); \
s24 = ROTL64(s42, (uint64_t)39); \
s42 = ROTL64(s04, (uint64_t)18); \
s04 = ROTL64(s20, (uint64_t)62); \
s20 = ROTL64(s22, (uint64_t)43); \
s22 = ROTL64(s32, (uint64_t)25); \
s32 = ROTL64(s43, (uint64_t)8); \
s43 = ROTL64(s34, (uint64_t)56); \
s34 = ROTL64(s03, (uint64_t)41); \
s03 = ROTL64(s40, (uint64_t)27); \
s40 = ROTL64(s44, (uint64_t)14); \
s44 = ROTL64(s14, (uint64_t)2); \
s14 = ROTL64(s31, (uint64_t)55); \
s31 = ROTL64(s13, (uint64_t)45); \
s13 = ROTL64(s01, (uint64_t)36); \
s01 = ROTL64(s30, (uint64_t)28); \
s30 = ROTL64(s33, (uint64_t)21); \
s33 = ROTL64(s23, (uint64_t)15); \
s23 = ROTL64(s12, (uint64_t)10); \
s12 = ROTL64(s21, (uint64_t)6); \
s21 = ROTL64(s02, (uint64_t)3); \
s02 = t0; \
}
#define KHI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; \
s10 = t1; \
s20 = t2; \
s30 = t3; \
s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; \
s11 = t1; \
s21 = t2; \
s31 = t3; \
s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; \
s12 = t1; \
s22 = t2; \
s32 = t3; \
s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; \
s13 = t1; \
s23 = t2; \
s33 = t3; \
s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; \
s14 = t1; \
s24 = t2; \
s34 = t3; \
s44 = t4; \
}
#define IOTA(element, rc) \
{ \
element ^= rc; \
}
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
// Permutes the 25-lane (25 x 64-bit) state `s` in place.
// Runs 24 rounds; each round applies THETA, RHOPI and KHI to the whole state and then
// IOTA, which XORs the round constant RC[i] into lane 0.
// The macro arguments list the lanes with a stride of 5 (s[0], s[5], s[10], ...).
__device__ void keccakf(uint64_t s[25])
{
// Scratch lanes used by the round macros
uint64_t t0, t1, t2, t3, t4;
for (int i = 0; i < 24; i++) {
THETA(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
RHOPI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
KHI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
IOTA(s[0], RC[i]);
}
}
// Hashes `number_of_blocks` independent inputs, one per thread.
// Thread `bid` reads `input_block_size` bytes starting at input + bid * input_block_size and
// writes D / 8 bytes starting at output + bid * (D / 8).
// C is subtracted from 1600 to obtain the rate in bits; D is the digest size in bits.
template <int C, int D>
__global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
{
int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (bid >= number_of_blocks) { return; }
const int r_bits = 1600 - C;
const int r_bytes = r_bits / 8;
const int d_bytes = D / 8;
uint8_t* b_input = input + bid * input_block_size;
uint8_t* b_output = output + bid * d_bytes;
uint64_t state[25] = {}; // Initialize with zeroes
int input_len = input_block_size;
// absorb
// NOTE(review): the input is read through uint64_t*; b_input is only 8-byte aligned when
// `input` is and input_block_size is a multiple of 8 - confirm callers guarantee this.
while (input_len >= r_bytes) {
// #pragma unroll
for (int i = 0; i < r_bytes; i += 8) {
state[i / 8] ^= *(uint64_t*)(b_input + i);
}
keccakf(state);
b_input += r_bytes;
input_len -= r_bytes;
}
// last block (if any)
// NOTE(review): last_block is a byte array that is later read through uint64_t*;
// its 8-byte alignment is not requested explicitly - confirm.
uint8_t last_block[r_bytes];
for (int i = 0; i < input_len; i++) {
last_block[i] = b_input[i];
}
// pad 10*1
// The first padding byte is 0x01 (presumably the original Keccak padding rather than a
// SHA-3 domain byte - confirm), followed by zeroes.
last_block[input_len] = 1;
for (int i = 0; i < r_bytes - input_len - 1; i++) {
last_block[input_len + i + 1] = 0;
}
// last bit
// OR-ed in so that a one-byte padding (input_len == r_bytes - 1) becomes 0x81
last_block[r_bytes - 1] |= 0x80;
// #pragma unroll
for (int i = 0; i < r_bytes; i += 8) {
state[i / 8] ^= *(uint64_t*)(last_block + i);
}
keccakf(state);
// squeeze: copy the first d_bytes of the state to the output
#pragma unroll
for (int i = 0; i < d_bytes; i += 8) {
*(uint64_t*)(b_output + i) = state[i / 8];
}
}
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
@@ -260,16 +47,4 @@ namespace keccak {
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
extern "C" cudaError_t
keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
}
extern "C" cudaError_t
keccak512_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig& config)
{
return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
}
} // namespace keccak

View File

@@ -0,0 +1,233 @@
#pragma once
#ifndef KECCAK_KERNELS_H
#define KECCAK_KERNELS_H
#include <cstdint>
#include "gpu-utils/modifiers.cuh"
namespace keccak {
using u64 = uint64_t;
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}
#define THETA( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; \
s01 ^= t0; \
s02 ^= t0; \
s03 ^= t0; \
s04 ^= t0; \
\
s10 ^= t1; \
s11 ^= t1; \
s12 ^= t1; \
s13 ^= t1; \
s14 ^= t1; \
\
s20 ^= t2; \
s21 ^= t2; \
s22 ^= t2; \
s23 ^= t2; \
s24 ^= t2; \
\
s30 ^= t3; \
s31 ^= t3; \
s32 ^= t3; \
s33 ^= t3; \
s34 ^= t3; \
\
s40 ^= t4; \
s41 ^= t4; \
s42 ^= t4; \
s43 ^= t4; \
s44 ^= t4; \
}
#define RHOPI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = ROTL64(s10, (uint64_t)1); \
s10 = ROTL64(s11, (uint64_t)44); \
s11 = ROTL64(s41, (uint64_t)20); \
s41 = ROTL64(s24, (uint64_t)61); \
s24 = ROTL64(s42, (uint64_t)39); \
s42 = ROTL64(s04, (uint64_t)18); \
s04 = ROTL64(s20, (uint64_t)62); \
s20 = ROTL64(s22, (uint64_t)43); \
s22 = ROTL64(s32, (uint64_t)25); \
s32 = ROTL64(s43, (uint64_t)8); \
s43 = ROTL64(s34, (uint64_t)56); \
s34 = ROTL64(s03, (uint64_t)41); \
s03 = ROTL64(s40, (uint64_t)27); \
s40 = ROTL64(s44, (uint64_t)14); \
s44 = ROTL64(s14, (uint64_t)2); \
s14 = ROTL64(s31, (uint64_t)55); \
s31 = ROTL64(s13, (uint64_t)45); \
s13 = ROTL64(s01, (uint64_t)36); \
s01 = ROTL64(s30, (uint64_t)28); \
s30 = ROTL64(s33, (uint64_t)21); \
s33 = ROTL64(s23, (uint64_t)15); \
s23 = ROTL64(s12, (uint64_t)10); \
s12 = ROTL64(s21, (uint64_t)6); \
s21 = ROTL64(s02, (uint64_t)3); \
s02 = t0; \
}
#define KHI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; \
s10 = t1; \
s20 = t2; \
s30 = t3; \
s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; \
s11 = t1; \
s21 = t2; \
s31 = t3; \
s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; \
s12 = t1; \
s22 = t2; \
s32 = t3; \
s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; \
s13 = t1; \
s23 = t2; \
s33 = t3; \
s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; \
s14 = t1; \
s24 = t2; \
s34 = t3; \
s44 = t4; \
}
// Iota step: XORs the round constant `rc` into the lane `element`, in place.
#define IOTA(element, rc) \
  { \
    (element) ^= (rc); \
  }
// Round constants consumed by keccakf: RC[i] is XORed into lane s[0] by IOTA in round i.
// The 24 values are the Keccak-f[1600] round constants as listed in FIPS 202.
__device__ const u64 RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
/// Permutes the 25 64-bit lanes of `s` in place.
///
/// Runs 24 rounds. Each round applies the THETA, RHOPI and KHI macros to the whole
/// state and then IOTA, which XORs RC[i] into s[0].
/// The macro parameter sXY is bound to s[X + 5 * Y] (s00 -> s[0], s01 -> s[5],
/// s10 -> s[1], ...); the same argument order is passed to all three macros.
__device__ void keccakf(u64 s[25])
{
// Scratch variables referenced by name inside the macros (RHOPI uses t0, KHI uses t0..t4).
u64 t0, t1, t2, t3, t4;
for (int i = 0; i < 24; i++) {
THETA(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
RHOPI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
KHI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
// Round constant for round i goes into the first lane only.
IOTA(s[0], RC[i]);
}
}
/// Hashes `number_of_blocks` independent messages, one message per thread.
///
/// Template parameters:
///   C - capacity in bits; the rate is (1600 - C) bits.
///   D - digest size in bits; D / 8 bytes are written per message.
///
/// @param input             Concatenated messages; message `k` starts at byte `k * input_block_size`.
/// @param input_block_size  Length of every message in bytes.
/// @param number_of_blocks  Number of messages; threads with a larger global index exit immediately.
/// @param output            Concatenated digests; digest `k` starts at byte `k * (D / 8)`.
///
/// The final (possibly empty) partial block is padded with a 0x01 byte, zeroes, and 0x80 OR-ed
/// into the last byte of the rate.
///
/// NOTE(review): full-rate input blocks and the digest are accessed through `uint64_t*`, so this
/// assumes `input + k * input_block_size` (when a message is at least one rate long) and
/// `output + k * (D / 8)` are 8-byte aligned - confirm against callers.
template <int C, int D>
__global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
{
  int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (bid >= number_of_blocks) { return; }

  const int r_bits = 1600 - C;
  const int r_bytes = r_bits / 8;
  const int d_bytes = D / 8;

  // Offsets are computed in 64 bits: `bid * input_block_size` overflows a 32-bit int
  // once the total input exceeds 2 GiB.
  uint8_t* b_input = input + (size_t)bid * (size_t)input_block_size;
  uint8_t* b_output = output + (size_t)bid * (size_t)d_bytes;
  uint64_t state[25] = {}; // Initialize with zeroes

  int input_len = input_block_size;

  // absorb every full rate-sized block
  while (input_len >= r_bytes) {
    // #pragma unroll
    for (int i = 0; i < r_bytes; i += 8) {
      state[i / 8] ^= *(uint64_t*)(b_input + i);
    }
    keccakf(state);
    b_input += r_bytes;
    input_len -= r_bytes;
  }

  // last block (if any); aligned because it is read back as 64-bit words below
  alignas(8) uint8_t last_block[r_bytes];
  for (int i = 0; i < input_len; i++) {
    last_block[i] = b_input[i];
  }

  // pad 10*1
  last_block[input_len] = 1;
  for (int i = 0; i < r_bytes - input_len - 1; i++) {
    last_block[input_len + i + 1] = 0;
  }
  // last bit
  last_block[r_bytes - 1] |= 0x80;

  // #pragma unroll
  for (int i = 0; i < r_bytes; i += 8) {
    state[i / 8] ^= *(uint64_t*)(last_block + i);
  }
  keccakf(state);

  // squeeze: the digest is the leading d_bytes of the state
#pragma unroll
  for (int i = 0; i < d_bytes; i += 8) {
    *(uint64_t*)(b_output + i) = state[i / 8];
  }
}
} // namespace keccak
#endif

View File

@@ -1,5 +1,5 @@
#include "gpu-utils/device_context.cuh"
#include "keccak.cu"
#include "extern.cu"
// #define DEBUG

Binary file not shown.

View File

@@ -0,0 +1,25 @@
#include "utils/utils.h"
#include "gpu-utils/error_handler.cuh"
#include "merkle-tree/merkle.cuh"
#include "merkle.cu"
#include "hash/hash.cuh"
#include "fields/field_config.cuh"
using namespace field_config;
namespace merkle_tree {
/// C-linkage entry point for building a Merkle tree over `scalar_t` elements.
/// The exported symbol name is produced by CONCAT_EXPAND(FIELD, build_merkle_tree).
///
/// Forwards every argument unchanged to the templated
/// `build_merkle_tree<scalar_t, scalar_t>`; the two hashers are passed on by reference.
///
/// @return cudaErrorInvalidValue if either hasher pointer is null, otherwise whatever the
///         templated builder returns.
extern "C" cudaError_t CONCAT_EXPAND(FIELD, build_merkle_tree)(
  const scalar_t* leaves_digests,
  scalar_t* digests,
  unsigned int height,
  unsigned int input_block_len,
  const hash::SpongeHasher<scalar_t, scalar_t>* compression,
  const hash::SpongeHasher<scalar_t, scalar_t>* bottom_layer,
  const TreeBuilderConfig& tree_config)
{
  // The hashers cross the FFI boundary as raw pointers; binding a reference to a null
  // pointer below would be undefined behaviour, so reject it explicitly.
  if (compression == nullptr || bottom_layer == nullptr) { return cudaErrorInvalidValue; }

  return build_merkle_tree<scalar_t, scalar_t>(
    leaves_digests, digests, height, input_block_len, *compression, *bottom_layer, tree_config);
}
} // namespace merkle_tree

View File

@@ -0,0 +1,26 @@
#include "utils/utils.h"
#include "gpu-utils/error_handler.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "mmcs.cu"
#include "hash/hash.cuh"
#include "fields/field_config.cuh"
using namespace field_config;
using matrix::Matrix;
namespace merkle_tree {
/// C-linkage entry point for committing to a set of matrices of `scalar_t` elements.
/// The exported symbol name is produced by CONCAT_EXPAND(FIELD, mmcs_commit_cuda).
///
/// Forwards every argument unchanged to the templated `mmcs_commit<scalar_t, scalar_t>`;
/// the two hashers are passed on by reference.
///
/// @return cudaErrorInvalidValue if either hasher pointer is null, otherwise whatever the
///         templated commit returns.
extern "C" cudaError_t CONCAT_EXPAND(FIELD, mmcs_commit_cuda)(
  const Matrix<scalar_t>* leaves,
  unsigned int number_of_inputs,
  scalar_t* digests,
  const hash::SpongeHasher<scalar_t, scalar_t>* hasher,
  const hash::SpongeHasher<scalar_t, scalar_t>* compression,
  const TreeBuilderConfig& tree_config)
{
  // The hashers cross the FFI boundary as raw pointers; binding a reference to a null
  // pointer below would be undefined behaviour, so reject it explicitly.
  if (hasher == nullptr || compression == nullptr) { return cudaErrorInvalidValue; }

  return mmcs_commit<scalar_t, scalar_t>(leaves, number_of_inputs, digests, *hasher, *compression, tree_config);
}
} // namespace merkle_tree

View File

@@ -0,0 +1,336 @@
#include "hash/hash.cuh"
#include "merkle-tree/merkle.cuh"
namespace merkle_tree {
/// Constructs merkle subtree without parallelization
/// The digests are aligned sequentially per row
/// Example:
///
/// Big tree:
///
/// 1 <- Root
/// / \ <- Arity = 2
/// 2 3 <- Digests
/// / \ / \ <- Height = 2 (as the number of edges)
/// 4 5 6 7 <- arity^height leaves
/// | | | | <- Bottom layer hash 1 to 1
/// a b c d <- Input vector 1x4
///
/// Subtree 1 Subtree 2
/// 2 3
/// / \ / \
/// 4 5 6 7
///
/// Digests array for subtree 1:
/// [4 5 . . 2 . .]
/// | | |
/// ----- V
/// | Segment (offset = 4, subtree_idx = 0)
/// v
/// Segment (offset = 0, subtree_idx = 0)
///
/// Digests array for subtree 2:
/// [. . 6 7 . 3 .]
/// | |
/// -----
/// |
/// v
/// Segment (offset = 0, subtree_idx = 1)
///
/// Total digests array:
/// [4 5 6 7 2 3 .]
///
/// Example for custom config:
///
/// arity = 2
/// input_block_len = 2
/// digest_elements = 2
/// bottom_layer hash width = 4
/// compression width = 4
/// height = 2
///
/// [a, b] <- Root of the tree
/// | |
/// [a, b, c, d]
/// / \ / \
/// [i, j, m, n]
/// ┌──┬──────┴──┴──┴──┴──────┬──┐
/// | | | |
/// [i, j, k, l] [m, n, o, p] <- compression states
/// / \ / \ / \ / \ <- Running permutation
/// [1, 2, 5, 6] [9, 1, 4, 5] <- compression states
/// ┌──┬───┴──┴──┼──┤ ┌──┬───┴──┴──┼──┤
/// | | | | | | | | <- digest_elements * arity^height leaves
/// [1, 2, 3, 4] [5, 6, 7, 8] [9, 1, 2, 3] [4, 5, 6, 7] <- Permuted states
/// / \ / \ / \ / \ / \ / \ / \ / \ <- Running permutation
/// [a, b, 0, 0] [c, d, 0, 0] [e, f, 0, 0] [g, h, 0, 0] <- States of the bottom layer hash
/// | | | | | | | | <- Bottom layer hash 2 to 2
/// a b c d e f g h <- Input vector 2x4
///
/// Input matrix:
/// ┌ ┐
/// | a b |
/// | c d |
/// | e f |
/// | g h |
/// └ ┘
///
/// Parameters:
/// - leaves:               input for this subtree (host or device, per tree_config.are_inputs_on_device)
/// - states:               device scratch buffer; also receives the leaves when they come from the host
/// - digests:              device scratch buffer that receives the bottom-layer hashes
/// - subtree_idx:          index of this subtree among its siblings, used to place its rows in the output
/// - subtree_height:       height of this subtree; arity^subtree_height bottom-layer states are hashed
/// - big_tree_digests:     destination of every kept row (always written with a device-to-host copy)
/// - start_segment_size:   element count of the bottom segment of the destination
/// - start_segment_offset: element offset of the bottom segment in the destination
/// - keep_rows:            0 keeps every row; otherwise a row is copied out only while
///                         the remaining subtree height is below keep_rows
/// - input_block_len:      number of input elements absorbed per bottom-layer hash
/// All work is enqueued on ctx.stream; nothing is synchronized here.
template <typename L, typename D>
cudaError_t build_merkle_subtree(
  const L* leaves,
  D* states,
  D* digests,
  size_t subtree_idx,
  size_t subtree_height,
  L* big_tree_digests,
  size_t start_segment_size,
  size_t start_segment_offset,
  uint64_t keep_rows,
  uint64_t input_block_len,
  const SpongeHasher<L, D>& bottom_layer,
  const SpongeHasher<L, D>& compression,
  const TreeBuilderConfig& tree_config,
  device_context::DeviceContext& ctx)
{
  uint64_t arity = tree_config.arity;

  // Both hashers only ever see the device-side scratch buffers.
  SpongeConfig sponge_config = default_sponge_config(ctx);
  sponge_config.are_inputs_on_device = true;
  sponge_config.are_outputs_on_device = true;
  sponge_config.is_async = true;

  size_t bottom_layer_states = pow(arity, subtree_height);

  // Host leaves are staged in `states` before hashing.
  if (!tree_config.are_inputs_on_device) {
    CHK_IF_RETURN(cudaMemcpyAsync(
      states, leaves, bottom_layer_states * input_block_len * sizeof(L), cudaMemcpyHostToDevice, ctx.stream));
  }

  // The result used to be discarded; a failed bottom-layer hash must abort the build.
  CHK_IF_RETURN(bottom_layer.hash_many(
    tree_config.are_inputs_on_device ? leaves : states, digests, bottom_layer_states, input_block_len,
    tree_config.digest_elements, sponge_config));

  uint64_t number_of_states = bottom_layer_states / arity;
  size_t segment_size = start_segment_size;
  size_t segment_offset = start_segment_offset;

  if (!keep_rows || subtree_height < keep_rows) {
    // Each subtree owns bottom_layer_states * digest_elements elements of the segment,
    // matching both the copy size below and the per-layer offset used in the loop.
    D* digests_with_offset =
      big_tree_digests + segment_offset + subtree_idx * bottom_layer_states * tree_config.digest_elements;
    CHK_IF_RETURN(cudaMemcpyAsync(
      digests_with_offset, digests, bottom_layer_states * tree_config.digest_elements * sizeof(D),
      cudaMemcpyDeviceToHost, ctx.stream));
    segment_offset += segment_size;
  }
  segment_size /= arity;
  subtree_height--;

  // From here on the previous layer's digests are the compression input.
  swap<D>(&digests, &states);
  while (number_of_states > 0) {
    CHK_IF_RETURN(
      compression.compress_many(states, digests, number_of_states, tree_config.digest_elements, sponge_config));
    if (!keep_rows || subtree_height < keep_rows) {
      D* digests_with_offset =
        big_tree_digests + segment_offset + subtree_idx * number_of_states * tree_config.digest_elements;
      CHK_IF_RETURN(cudaMemcpyAsync(
        digests_with_offset, digests, number_of_states * tree_config.digest_elements * sizeof(D),
        cudaMemcpyDeviceToHost, ctx.stream));
      segment_offset += segment_size;
    }
    // After the last layer the buffers are left as they are.
    if (number_of_states > 1) { swap<D>(&digests, &states); }
    segment_size /= arity;
    subtree_height--;
    number_of_states /= arity;
  }

  return CHK_LAST();
}
/// Builds a Merkle tree of the given `height` over `leaves` and writes the kept rows to `digests`.
///
/// The tree is split into arity^k subtrees small enough that the scratch buffers of one
/// subtree fit in STREAM_CHUNK_SIZE bytes. Subtrees are processed on a pool of streams, and the
/// remaining top `cap_height` layers are then folded on `tree_config.ctx.stream`.
///
/// @param leaves           arity^height blocks of `input_block_len` elements each
/// @param digests          destination for the kept rows, bottom row first
/// @param height           tree height, counted in edges
/// @param input_block_len  elements absorbed by each bottom-layer hash
/// @param compression      hasher used for every layer above the bottom one
/// @param bottom_layer     hasher used for the bottom layer
/// @param tree_config      arity, digest_elements, keep_rows, device flags and context
/// @return the first CUDA error encountered, or the result of the final error check
///
/// Throws (via THROW_ICICLE_ERR) on an oversized input block, an insufficient compression
/// preimage length, or too little free device memory.
///
/// NOTE(review): rows are always written to `digests` with cudaMemcpyDeviceToHost, regardless
/// of tree_config.are_outputs_on_device.
template <typename L, typename D>
cudaError_t build_merkle_tree(
  const L* leaves,
  D* digests,
  unsigned int height,
  unsigned int input_block_len,
  const SpongeHasher<L, D>& compression,
  const SpongeHasher<L, D>& bottom_layer,
  const TreeBuilderConfig& tree_config)
{
  CHK_INIT_IF_RETURN();
  cudaStream_t& stream = tree_config.ctx.stream;

  if (input_block_len * sizeof(L) > bottom_layer.rate * sizeof(D))
    THROW_ICICLE_ERR(
      IcicleError_t::InvalidArgument,
      "Sponge construction at the bottom of the tree doesn't support inputs bigger than hash rate");
  if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
    THROW_ICICLE_ERR(
      IcicleError_t::InvalidArgument,
      "Hash max preimage length does not match merkle tree arity multiplied by digest elements");

  uint64_t number_of_bottom_layer_states = pow(tree_config.arity, height);

  // This will determine how much splitting do we need to do
  // `number_of_streams` subtrees should fit in the device
  // This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
  uint64_t number_of_subtrees = 1;
  uint64_t subtree_height = height;
  uint64_t subtree_bottom_layer_states = number_of_bottom_layer_states;
  uint64_t subtree_states_size = subtree_bottom_layer_states * bottom_layer.width;

  uint64_t subtree_digests_size;
  if (compression.width != compression.preimage_max_length) {
    // In that case, the states on layer 1 will require extending the states by (width / preimage_max_len) factor
    subtree_digests_size =
      subtree_states_size * bottom_layer.preimage_max_length / bottom_layer.width * tree_config.digest_elements;
  } else {
    subtree_digests_size = subtree_states_size / bottom_layer.width * tree_config.digest_elements;
  }
  size_t subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
  while (subtree_memory_required > STREAM_CHUNK_SIZE) {
    number_of_subtrees *= tree_config.arity;
    subtree_height--;
    subtree_bottom_layer_states /= tree_config.arity;
    subtree_states_size /= tree_config.arity;
    subtree_digests_size /= tree_config.arity;
    subtree_memory_required = sizeof(D) * (subtree_states_size + subtree_digests_size);
  }
  int cap_height = height - subtree_height;
  size_t caps_len = pow(tree_config.arity, cap_height) * tree_config.digest_elements;

  size_t available_memory, _total_memory;
  CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
  // Without this guard the subtraction below can wrap around, and a stream count of zero
  // would make the `subtree_idx % number_of_streams` below divide by zero.
  if (available_memory < (GIGA / 8 + STREAM_CHUNK_SIZE)) {
    THROW_ICICLE_ERR(
      IcicleError_t::InvalidArgument,
      "Not enough GPU memory to build a tree. At least 1.125 GB of GPU memory required");
  }
  available_memory -= GIGA / 8; // Leave 128 MB just in case

  // We can effectively parallelize memory copy with streams
  // as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
  const size_t number_of_streams = std::min((uint64_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
  cudaStream_t* streams = static_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t) * number_of_streams));
  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
  }

  // When fewer rows are kept than the top tree has, the subtree roots are not part of the
  // output, so they are collected in a temporary host buffer instead.
  bool caps_mode = tree_config.keep_rows && tree_config.keep_rows <= cap_height;
  D* caps = nullptr;
  if (caps_mode) { caps = static_cast<D*>(malloc(caps_len * sizeof(D))); }

#ifdef MERKLE_DEBUG
  std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
  std::cout << "Number of streams = " << number_of_streams << std::endl;
  std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
  std::cout << "Height of a subtree = " << subtree_height << std::endl;
  std::cout << "Cutoff height = " << height - subtree_height << std::endl;
  std::cout << "Number of leaves in a subtree = " << subtree_bottom_layer_states << std::endl;
  std::cout << "State of a subtree = " << subtree_states_size << std::endl;
  std::cout << "Digest elements for a subtree = " << subtree_digests_size << std::endl;
  std::cout << "Size of 1 subtree states = " << subtree_states_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
  std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
  std::cout << "Cap height = " << cap_height << std::endl;
  std::cout << "Enabling caps mode? " << caps_mode << std::endl;
#endif

  // Allocate memory for the leaves and digests
  // These are shared by streams in a pool
  D *states_ptr, *digests_ptr;
  CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_states_size * number_of_streams * sizeof(D), stream));
  CHK_IF_RETURN(cudaMemsetAsync(states_ptr, 0, subtree_states_size * number_of_streams * sizeof(D), stream));
  CHK_IF_RETURN(cudaMallocAsync(&digests_ptr, subtree_digests_size * number_of_streams * sizeof(D), stream));
  // Wait for these allocations to finish
  CHK_IF_RETURN(cudaStreamSynchronize(stream));

  // Build subtrees in parallel. This for loop invokes kernels that can run in a pool of size `number_of_streams`
  for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
    size_t stream_idx = subtree_idx % number_of_streams;
    cudaStream_t subtree_stream = streams[stream_idx];

    const L* subtree_leaves = leaves + subtree_idx * subtree_bottom_layer_states * input_block_len;
    D* subtree_state = states_ptr + stream_idx * subtree_states_size;
    D* subtree_digests = digests_ptr + stream_idx * subtree_digests_size;

    int subtree_keep_rows = 0;
    if (tree_config.keep_rows) {
      int diff = tree_config.keep_rows - cap_height;
      subtree_keep_rows = std::max(1, diff);
    }
    device_context::DeviceContext subtree_context{subtree_stream, tree_config.ctx.device_id, tree_config.ctx.mempool};

    uint64_t start_segment_size = number_of_bottom_layer_states * tree_config.digest_elements;
    cudaError_t subtree_result = build_merkle_subtree<L, D>(
      subtree_leaves,             // leaves
      subtree_state,              // state
      subtree_digests,            // digests
      subtree_idx,                // subtree_idx
      subtree_height,             // subtree_height
      caps_mode ? caps : digests, // big_tree_digests
      start_segment_size,         // start_segment_size
      0,                          // start_segment_offset
      subtree_keep_rows,          // keep_rows
      input_block_len,            // input_block_len
      bottom_layer,               // bottom_layer
      compression,                // compression
      tree_config,                // tree_config
      subtree_context             // subtree_context
    );
    CHK_IF_RETURN(subtree_result);
  }

  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
  }

  // The top tree is folded from `states_ptr`, which is always device memory, so the hasher
  // must be told its inputs are on the device regardless of where the leaves came from
  // (the subtree builder configures its hashers the same way).
  SpongeConfig sponge_config = default_sponge_config(tree_config.ctx);
  sponge_config.are_inputs_on_device = true;
  sponge_config.are_outputs_on_device = true;
  sponge_config.is_async = true;

  // Finish the top-level tree if any
  if (cap_height > 0) {
    size_t start_segment_size = caps_len / tree_config.arity;
    size_t start_segment_offset = 0;
    if (!caps_mode) { // Calculate offset
      size_t keep_rows = tree_config.keep_rows ? tree_config.keep_rows : height + 1;
      size_t layer_size = pow(tree_config.arity, keep_rows - 1) * tree_config.digest_elements;
      for (int i = 0; i < keep_rows - cap_height; i++) {
        start_segment_offset += layer_size;
        layer_size /= tree_config.arity;
      }
    }
    // Load the subtree roots: from the temporary caps buffer, or from the row that the
    // subtrees already wrote just below `start_segment_offset` in the output.
    CHK_IF_RETURN(cudaMemcpyAsync(
      states_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
      (caps_mode || !tree_config.are_outputs_on_device) ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, stream));

    uint64_t number_of_states = caps_len / tree_config.arity / tree_config.digest_elements;

    size_t segment_size = start_segment_size;
    size_t segment_offset = start_segment_offset;
    while (number_of_states > 0) {
      CHK_IF_RETURN(compression.compress_many(
        states_ptr, digests_ptr, number_of_states, tree_config.digest_elements, sponge_config));
      if (!tree_config.keep_rows || cap_height < tree_config.keep_rows + (int)caps_mode) {
        D* digests_with_offset = digests + segment_offset;
        CHK_IF_RETURN(cudaMemcpyAsync(
          digests_with_offset, digests_ptr, number_of_states * tree_config.digest_elements * sizeof(D),
          cudaMemcpyDeviceToHost, stream));
        segment_offset += segment_size;
      }
      if (number_of_states > 1) { swap<D>(&digests_ptr, &states_ptr); }
      segment_size /= tree_config.arity;
      cap_height--;
      number_of_states /= tree_config.arity;
    }
    if (caps_mode) { free(caps); }
  }

  CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
  CHK_IF_RETURN(cudaFreeAsync(digests_ptr, stream));

  // Tear down the stream pool before returning on either path. The synchronous path used to
  // return first, leaking every stream and the `streams` array.
  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
    CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
  }
  free(streams);

  if (!tree_config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
  return CHK_LAST();
}
} // namespace merkle_tree

View File

@@ -0,0 +1,456 @@
#include "hash/hash.cuh"
#include "merkle-tree/merkle.cuh"
#include "matrix/matrix.cuh"
#include "vec_ops/vec_ops.cuh"
#include <algorithm>
using matrix::Matrix;
namespace merkle_tree {
/// Hashes the rows of the given matrices into `digests` and zero-fills the padding rows.
///
/// @param leaves            matrix descriptors handed to `hasher.hash_2d`
/// @param number_of_inputs  number of descriptors in `leaves`
/// @param number_of_rows    number of rows to hash
/// @param digests           output buffer; row `r` occupies elements
///                          [r * digest_elements, (r + 1) * digest_elements)
/// @param digest_elements   number of output elements per row
/// @param hasher            hasher providing `hash_2d`
/// @param ctx               device context; the padding memset is enqueued on ctx.stream
///
/// Rows from `number_of_rows` up to next_pow_of_two(number_of_rows) are set to zero bytes.
/// NOTE(review): padding only reaches the next power of two of `number_of_rows`; callers whose
/// layer is wider than that must make sure the remainder is initialised - confirm.
template <typename L, typename D>
cudaError_t hash_leaves(
  const Matrix<L>* leaves,
  unsigned int number_of_inputs,
  uint64_t number_of_rows,
  D* digests,
  unsigned int digest_elements,
  const SpongeHasher<L, D>& hasher,
  const device_context::DeviceContext& ctx)
{
  CHK_IF_RETURN(hasher.hash_2d(leaves, digests, number_of_inputs, digest_elements, number_of_rows, ctx));

  const uint64_t number_of_rows_padded = next_pow_of_two(number_of_rows);
  const uint64_t padding_rows = number_of_rows_padded - number_of_rows;
  if (padding_rows) {
    // Pad with default digests. The first padding row starts after `number_of_rows` whole
    // digests, i.e. at element number_of_rows * digest_elements (not at element number_of_rows).
    CHK_IF_RETURN(cudaMemsetAsync(
      (void*)(digests + number_of_rows * digest_elements), 0, padding_rows * digest_elements * sizeof(D),
      ctx.stream));
  }

  return CHK_LAST();
}
/// Mutable bookkeeping shared by the helpers that build one MMCS subtree.
/// The helpers update several fields in place as they walk up the layers
/// (row counts, segment position, leaves_offset, subtree_height).
template <typename L, typename D>
struct SubtreeParams {
unsigned int number_of_inputs; // Number of input matrices
unsigned int arity; // Arity of the tree
unsigned int digest_elements; // Number of output elements per hash
size_t number_of_rows; // Current number of real (unpadded) input rows to operate on
size_t number_of_rows_padded; // Row count of the current layer including padding; divided by arity per layer
size_t subtree_idx; // Index of this subtree among all subtrees
size_t number_of_subtrees; // Total number of subtrees
uint64_t subtree_height; // Remaining height of the subtree; decremented per layer
/// One segment corresponds to one layer of output digests
size_t segment_size; // The size of current segment, in digest elements
size_t segment_offset; // Element offset of the current segment in the output; advanced after each copied layer
unsigned int leaves_offset; // Index of the next not-yet-consumed matrix in the height-sorted input list
unsigned int number_of_leaves_to_inject; // Number of matrices to inject in current level
unsigned int keep_rows; // Number of rows to keep (0 keeps all rows)
bool are_inputs_on_device; // Whether the matrix values already live in device memory
bool caps_mode; // Shifts the keep_rows cut-off by one in maybe_copy_digests
const SpongeHasher<L, D>* hasher = nullptr; // Hasher for the bottom layer
const SpongeHasher<L, D>* compression = nullptr; // Hasher for the layers above
const device_context::DeviceContext* ctx = nullptr; // Context whose stream the work is enqueued on
};
/// Selects the matrices that belong to the current layer and publishes their
/// descriptors (and, for host inputs, their rows) to the device.
///
/// Starting at `params.leaves_offset`, matrices whose padded height is at least
/// `number_of_rows_padded * number_of_subtrees` are consumed; those whose padded height equals
/// it are counted in `params.number_of_leaves_to_inject`. If any were found:
///  - `params.number_of_rows` is set to the rows of this subtree's slice,
///  - host inputs are copied slice by slice into `d_leaves`; device inputs are referenced in place,
///  - one descriptor per matrix is copied into `d_leaves_info`.
/// All copies are enqueued on `params.ctx->stream`.
///
/// @param leaves         matrices, expected sorted by descending height
/// @param d_leaves       device buffer receiving the sliced rows of host inputs
/// @param d_leaves_info  device buffer receiving the descriptors
/// @param params         subtree state; leaves_offset, number_of_leaves_to_inject and
///                       number_of_rows are updated
///
/// NOTE(review): `actual_layer_rows - rows_offset` is unsigned; if a subtree starts beyond the
/// last real row it wraps and `number_of_rows` becomes the full padded count - confirm that
/// callers never hit this.
template <typename L, typename D>
cudaError_t slice_and_copy_leaves(
  const std::vector<Matrix<L>>& leaves, L* d_leaves, Matrix<L>* d_leaves_info, SubtreeParams<L, D>& params)
{
  uint64_t target_height = params.number_of_rows_padded * params.number_of_subtrees;
  params.number_of_leaves_to_inject = 0;
  while (params.leaves_offset < params.number_of_inputs &&
         next_pow_of_two(leaves[params.leaves_offset].height) >= target_height) {
    if (next_pow_of_two(leaves[params.leaves_offset].height) == target_height) params.number_of_leaves_to_inject++;
    params.leaves_offset++;
  }

  if (params.number_of_leaves_to_inject) {
    size_t rows_offset = params.subtree_idx * params.number_of_rows_padded;
    size_t actual_layer_rows = leaves[params.leaves_offset - params.number_of_leaves_to_inject].height;
    params.number_of_rows = std::min(actual_layer_rows - rows_offset, params.number_of_rows_padded);

    // Owned by a vector so the staging array is released on every return path,
    // including the early returns taken by CHK_IF_RETURN.
    std::vector<Matrix<L>> leaves_info(params.number_of_leaves_to_inject);
    L* d_leaves_ptr = d_leaves;
    for (unsigned int i = 0; i < params.number_of_leaves_to_inject; i++) {
      Matrix<L> leaf = leaves[params.leaves_offset - params.number_of_leaves_to_inject + i];
      if (!params.are_inputs_on_device) {
        CHK_IF_RETURN(cudaMemcpyAsync(
          d_leaves_ptr, leaf.values + rows_offset * leaf.width, params.number_of_rows * leaf.width * sizeof(L),
          cudaMemcpyHostToDevice, params.ctx->stream));
      } else {
        // Device inputs are not copied; the descriptor points straight into the caller's matrix.
        d_leaves_ptr = leaf.values + rows_offset * leaf.width;
      }
      leaves_info[i] = {d_leaves_ptr, leaf.width, params.number_of_rows};
      d_leaves_ptr += params.number_of_rows * leaf.width;
    }
    CHK_IF_RETURN(cudaMemcpyAsync(
      d_leaves_info, leaves_info.data(), params.number_of_leaves_to_inject * sizeof(Matrix<L>),
      cudaMemcpyHostToDevice, params.ctx->stream));
  }

  return CHK_LAST();
}
/// Copies the current layer into the output array when that layer is one of the kept rows.
///
/// A layer is kept when `params.keep_rows` is zero, or when the remaining subtree height is
/// below `keep_rows` (plus one in caps mode). The destination is the current segment of
/// `big_tree_digests`, shifted by this subtree's share of the layer. After a copy the
/// segment offset advances by one segment; skipped layers leave `params` untouched.
/// The copy is enqueued on `params.ctx->stream` as a device-to-host transfer.
template <typename L, typename D>
cudaError_t maybe_copy_digests(D* digests, L* big_tree_digests, SubtreeParams<L, D>& params)
{
  const bool row_is_dropped =
    params.keep_rows && params.subtree_height >= params.keep_rows + (int)params.caps_mode;
  if (row_is_dropped) { return CHK_LAST(); }

  const size_t layer_elements = params.number_of_rows_padded * params.digest_elements;
  D* destination = big_tree_digests + params.segment_offset + params.subtree_idx * layer_elements;
  CHK_IF_RETURN(
    cudaMemcpyAsync(destination, digests, layer_elements * sizeof(D), cudaMemcpyDeviceToHost, params.ctx->stream));
  params.segment_offset += params.segment_size;

  return CHK_LAST();
}
/// Produces `next_layer` from `prev_layer` for one level of the tree.
///
/// First gathers the matrices that belong to this level (slice_and_copy_leaves). If there are
/// none, the level is a plain compression of `number_of_rows_padded` states; otherwise the
/// compression also injects the rows of those matrices.
///
/// @param leaves          height-sorted input matrices
/// @param prev_layer      digests of the layer below
/// @param next_layer      receives the digests of this layer
/// @param aux_leaves_mem  device buffer for rows of injected host matrices
/// @param d_leaves_info   device buffer for the injected matrices' descriptors
/// @param params          subtree state, updated by slice_and_copy_leaves
template <typename L, typename D>
cudaError_t fold_layer(
  const std::vector<Matrix<L>>& leaves,
  D* prev_layer,
  D* next_layer,
  L* aux_leaves_mem,
  Matrix<L>* d_leaves_info,
  SubtreeParams<L, D>& params)
{
  CHK_IF_RETURN(slice_and_copy_leaves<L>(leaves, aux_leaves_mem, d_leaves_info, params));

  const SpongeHasher<L, D>& compressor = *params.compression;
  if (!params.number_of_leaves_to_inject) {
    // Nothing to inject on this level: compress the padded layer as is.
    CHK_IF_RETURN(compressor.run_hash_many_kernel(
      prev_layer, next_layer, params.number_of_rows_padded, compressor.width, params.digest_elements, *params.ctx));
    return CHK_LAST();
  }

  CHK_IF_RETURN(compressor.compress_and_inject(
    d_leaves_info, params.number_of_leaves_to_inject, params.number_of_rows, prev_layer, next_layer,
    params.digest_elements, *params.ctx));
  return CHK_LAST();
}
/// Builds one subtree of the MMCS tree on `params.ctx->stream`.
///
/// Hashes the bottom-layer matrices, then repeatedly folds the previous layer into the next
/// one (injecting shorter matrices where they belong) until a single row is left. After every
/// layer, maybe_copy_digests decides whether the row is written to `big_tree_digests`.
///
/// @param leaves            height-sorted input matrices
/// @param d_leaves          device buffer for bottom-layer rows; reused as a digest buffer
/// @param states            device buffer receiving the bottom-layer hashes
/// @param aux_leaves_mem    device buffer for rows of matrices injected above the bottom layer
/// @param big_tree_digests  destination for kept rows
/// @param params            subtree state; row counts, segment position and height are consumed
///
/// NOTE(review): hash_leaves writes the bottom layer into `states`, but the row handed to
/// maybe_copy_digests right after is `digests` (the reused leaves buffer) - confirm this is
/// the intended source for the bottom-row copy.
template <typename L, typename D>
cudaError_t build_mmcs_subtree(
  const std::vector<Matrix<L>>& leaves,
  L* d_leaves,
  D* states,
  L* aux_leaves_mem,
  L* big_tree_digests,
  SubtreeParams<L, D>& params)
{
  // Leaves info
  Matrix<L>* d_leaves_info;
  CHK_IF_RETURN(cudaMallocAsync(&d_leaves_info, params.number_of_inputs * sizeof(Matrix<L>), params.ctx->stream));
  CHK_IF_RETURN(slice_and_copy_leaves(leaves, d_leaves, d_leaves_info, params));

  // Reuse leaves memory
  D* digests = (D*)d_leaves;

  CHK_IF_RETURN(hash_leaves(
    d_leaves_info, params.number_of_leaves_to_inject, params.number_of_rows, states, params.digest_elements,
    *params.hasher, *params.ctx));

  CHK_IF_RETURN(maybe_copy_digests(digests, big_tree_digests, params));

  params.number_of_rows_padded /= params.arity;
  params.segment_size /= params.arity;
  params.subtree_height--;

  D* prev_layer = states;
  D* next_layer = digests;
  while (params.number_of_rows_padded > 0) {
    CHK_IF_RETURN(fold_layer(leaves, prev_layer, next_layer, aux_leaves_mem, d_leaves_info, params));
    CHK_IF_RETURN(maybe_copy_digests(next_layer, big_tree_digests, params));
    swap<D>(&prev_layer, &next_layer);
    params.segment_size /= params.arity;
    params.subtree_height--;
    params.number_of_rows_padded /= params.arity;
  }

  // The descriptor buffer used to be leaked once per subtree. Every use of it was enqueued
  // on this stream, so a stream-ordered free releases it after that work has run.
  CHK_IF_RETURN(cudaFreeAsync(d_leaves_info, params.ctx->stream));

  return CHK_LAST();
}
template <typename L, typename D>
cudaError_t mmcs_commit(
const Matrix<L>* inputs,
const unsigned int number_of_inputs,
D* digests,
const SpongeHasher<L, D>& hasher,
const SpongeHasher<L, D>& compression,
const TreeBuilderConfig& tree_config)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = tree_config.ctx.stream;
if (number_of_inputs == 0) THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "No matrices provided");
if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument,
"Hash max preimage length does not match merkle tree arity multiplied by digest elements");
std::vector<Matrix<L>> sorted_inputs(number_of_inputs);
std::partial_sort_copy(
inputs, inputs + number_of_inputs, sorted_inputs.begin(), sorted_inputs.end(),
[](const Matrix<L>& left, const Matrix<L>& right) { return left.height > right.height; });
// Check that the height of any two given matrices either rounds up
// to the same next power of two or otherwise equal
for (unsigned int i = 0; i < number_of_inputs - 1; i++) {
unsigned int left = sorted_inputs[i].height;
unsigned int right = sorted_inputs[i + 1].height;
if (next_pow_of_two(left) == next_pow_of_two(right) && left != right)
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "Matrix heights that round up to the same power of two must be equal");
}
uint64_t max_height = sorted_inputs[0].height;
// Calculate maximum additional memory needed for injected matrices
uint64_t max_aux_total_elements = 0;
uint64_t current_aux_total_elements = 0;
uint64_t current_height = 0;
uint64_t bottom_layer_leaves_elements = 0;
if (!tree_config.are_inputs_on_device) {
for (auto it = sorted_inputs.begin(); it < sorted_inputs.end(); it++) {
if (it->height == max_height) {
bottom_layer_leaves_elements += it->height * it->width;
continue;
}
if (it->height != current_height) {
current_height = it->height;
current_aux_total_elements = 0;
}
current_aux_total_elements += it->width * it->height;
if (current_aux_total_elements > max_aux_total_elements) {
max_aux_total_elements = current_aux_total_elements;
}
}
}
uint64_t number_of_bottom_layer_rows = next_pow_of_two(max_height);
size_t leaves_info_memory = number_of_inputs * sizeof(Matrix<L>);
unsigned int tree_height = get_height(number_of_bottom_layer_rows);
// This will determine how much splitting do we need to do
// `number_of_streams` subtrees should fit in the device
// This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
uint64_t number_of_subtrees = 1;
uint64_t subtree_height = tree_height;
uint64_t subtree_bottom_layer_rows = number_of_bottom_layer_rows;
uint64_t subtree_states_size = subtree_bottom_layer_rows * hasher.width;
uint64_t subtree_digests_size = subtree_bottom_layer_rows * tree_config.digest_elements;
uint64_t subtree_leaves_elements = 0;
for (int i = 0; i < number_of_inputs && sorted_inputs[i].height == max_height; i++) {
subtree_leaves_elements += sorted_inputs[i].width * sorted_inputs[i].height;
}
uint64_t subtree_aux_elements = max_aux_total_elements;
size_t subtree_leaves_memory = std::max(subtree_leaves_elements * sizeof(L), subtree_digests_size * sizeof(D));
size_t subtree_memory_required =
sizeof(D) * subtree_states_size + subtree_leaves_memory + subtree_aux_elements * sizeof(L) + leaves_info_memory;
while (subtree_memory_required > STREAM_CHUNK_SIZE) {
number_of_subtrees *= tree_config.arity;
subtree_height--;
subtree_bottom_layer_rows /= tree_config.arity;
subtree_states_size /= tree_config.arity;
subtree_digests_size /= tree_config.arity;
subtree_leaves_elements /= tree_config.arity;
subtree_aux_elements /= tree_config.arity;
subtree_leaves_memory = std::max(subtree_leaves_elements * sizeof(L), subtree_digests_size * sizeof(D));
subtree_memory_required =
sizeof(D) * subtree_states_size + subtree_leaves_memory + subtree_aux_elements * sizeof(L) + leaves_info_memory;
}
unsigned int cap_height = tree_height - subtree_height;
size_t caps_len = pow(tree_config.arity, cap_height) * tree_config.digest_elements;
size_t available_memory, _total_memory;
CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
if (available_memory < (GIGA / 8 + STREAM_CHUNK_SIZE)) {
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument,
"Not enough GPU memory to build a tree. At least 1.125 GB of GPU memory required");
}
available_memory -= GIGA / 8; // Leave 128 MB just in case
// We can effectively parallelize memory copy with streams
// as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
const size_t number_of_streams = std::min((uint64_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
std::vector<cudaStream_t> streams(number_of_streams);
for (size_t i = 0; i < number_of_streams; i++) {
CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
}
// If keep_rows is smaller than the remaining top-tree height
// we need to allocate additional memory to store the roots
// of subtrees, in order to proceed from there
bool caps_mode = tree_config.keep_rows && tree_config.keep_rows <= cap_height;
D* caps;
if (caps_mode) { caps = static_cast<D*>(malloc(caps_len * sizeof(D))); }
#ifdef MERKLE_DEBUG
std::cout << "MMCS DEBUG" << std::endl;
std::cout << "====================================" << std::endl;
std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
std::cout << "Number of streams = " << number_of_streams << std::endl;
std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
std::cout << "Height of a subtree = " << subtree_height << std::endl;
std::cout << "Cutoff height = " << tree_height - subtree_height << std::endl;
std::cout << "Number of leaves in a subtree = " << subtree_bottom_layer_rows << std::endl;
std::cout << "State of a subtree = " << subtree_states_size << std::endl;
std::cout << "Digest elements for a subtree = " << subtree_digests_size << std::endl;
std::cout << "Size of 1 subtree states = " << subtree_states_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(D) / 1024 / 1024 << " MB" << std::endl;
std::cout << "Cap height = " << cap_height << std::endl;
std::cout << "Enabling caps mode? " << caps_mode << std::endl;
std::cout << "Allocating " << subtree_states_size * number_of_streams << " elements for states" << std::endl;
std::cout << "Allocating " << subtree_leaves_memory * number_of_streams << " bytes for leaves" << std::endl;
std::cout << "Allocating " << subtree_aux_elements * number_of_streams << " elements for aux leaves" << std::endl;
std::cout << std::endl;
#endif
// Allocate memory for the states, injected leaves (aux) and digests
// These are shared by streams in a pool
D* states_ptr;
L *aux_ptr, *leaves_ptr;
CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_states_size * number_of_streams * sizeof(D), stream));
CHK_IF_RETURN(cudaMemsetAsync(states_ptr, 0, subtree_states_size * number_of_streams * sizeof(D), stream));
CHK_IF_RETURN(cudaMallocAsync(&leaves_ptr, subtree_leaves_memory * number_of_streams, stream));
CHK_IF_RETURN(cudaMallocAsync(&aux_ptr, subtree_aux_elements * number_of_streams * sizeof(L), stream));
// Wait for these allocations to finish
CHK_IF_RETURN(cudaStreamSynchronize(stream));
// Build subtrees in parallel. This for loop invokes kernels that can run in a pool of size `number_of_streams`
for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
size_t stream_idx = subtree_idx % number_of_streams;
cudaStream_t subtree_stream = streams[stream_idx];
D* subtree_state = states_ptr + stream_idx * subtree_states_size;
L* subtree_leaves = (L*)((unsigned char*)leaves_ptr + stream_idx * subtree_leaves_memory);
L* subtree_aux = aux_ptr + stream_idx * subtree_aux_elements;
unsigned int subtree_keep_rows = 0;
if (tree_config.keep_rows) {
int diff = tree_config.keep_rows - cap_height;
subtree_keep_rows = std::max(1, diff);
}
device_context::DeviceContext subtree_context{subtree_stream, tree_config.ctx.device_id, tree_config.ctx.mempool};
SubtreeParams<L, D> params = {};
params.number_of_inputs = number_of_inputs;
params.arity = tree_config.arity;
params.digest_elements = tree_config.digest_elements;
params.number_of_rows = subtree_bottom_layer_rows;
params.number_of_rows_padded = subtree_bottom_layer_rows;
params.subtree_idx = subtree_idx;
params.subtree_height = subtree_height;
params.number_of_subtrees = number_of_subtrees;
params.segment_size = number_of_bottom_layer_rows * tree_config.digest_elements;
params.keep_rows = subtree_keep_rows;
params.are_inputs_on_device = tree_config.are_inputs_on_device;
params.hasher = &hasher;
params.compression = &compression;
params.ctx = &subtree_context;
cudaError_t subtree_result = build_mmcs_subtree<L, D>(
sorted_inputs,
subtree_leaves, // d_leaves
subtree_state, // states
subtree_aux, // aux_leaves_mem
caps_mode ? caps : digests, // big_tree_digests
params // params
);
CHK_IF_RETURN(subtree_result);
}
for (size_t i = 0; i < number_of_streams; i++) {
CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
}
// Finish the top-level tree if any
if (cap_height > 0) {
D* digests_ptr = (D*)leaves_ptr;
size_t start_segment_size = caps_len / tree_config.arity;
size_t start_segment_offset = 0;
if (!caps_mode) { // Calculate offset
size_t keep_rows = tree_config.keep_rows ? tree_config.keep_rows : tree_height + 1;
size_t layer_size = pow(tree_config.arity, keep_rows - 1) * tree_config.digest_elements;
for (int i = 0; i < keep_rows - cap_height; i++) {
start_segment_offset += layer_size;
layer_size /= tree_config.arity;
}
}
CHK_IF_RETURN(cudaMemcpyAsync(
states_ptr, caps_mode ? caps : (digests + start_segment_offset - caps_len), caps_len * sizeof(D),
(caps_mode || !tree_config.are_outputs_on_device) ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, stream));
uint64_t number_of_states = caps_len / tree_config.arity / tree_config.digest_elements;
Matrix<L>* d_leaves_info;
CHK_IF_RETURN(cudaMallocAsync(&d_leaves_info, number_of_inputs * sizeof(Matrix<L>), tree_config.ctx.stream));
SubtreeParams<L, D> top_params = {};
top_params.number_of_inputs = number_of_inputs;
top_params.arity = tree_config.arity;
top_params.digest_elements = tree_config.digest_elements;
top_params.number_of_rows = number_of_states;
top_params.number_of_rows_padded = number_of_states;
top_params.subtree_height = cap_height;
top_params.number_of_subtrees = 1;
top_params.segment_offset = start_segment_offset;
top_params.segment_size = start_segment_size;
top_params.keep_rows = tree_config.keep_rows;
top_params.are_inputs_on_device = tree_config.are_inputs_on_device;
top_params.caps_mode = caps_mode;
top_params.hasher = &hasher;
top_params.compression = &compression;
top_params.ctx = &tree_config.ctx;
D* prev_layer = states_ptr;
D* next_layer = digests_ptr;
while (top_params.number_of_rows_padded > 0) {
CHK_IF_RETURN(fold_layer(sorted_inputs, prev_layer, next_layer, aux_ptr, d_leaves_info, top_params));
CHK_IF_RETURN(maybe_copy_digests(next_layer, digests, top_params));
swap<D>(&prev_layer, &next_layer);
top_params.segment_size /= top_params.arity;
top_params.subtree_height--;
top_params.number_of_rows_padded /= top_params.arity;
}
}
if (caps_mode) { free(caps); }
CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
CHK_IF_RETURN(cudaFreeAsync(leaves_ptr, stream));
for (size_t i = 0; i < number_of_streams; i++) {
CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
}
if (!tree_config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
} // namespace merkle_tree

View File

@@ -0,0 +1,7 @@
merkle.o
poseidon2.o
test_merkle_poseidon2
merkle_bls.o
poseidon.o
test_merkle_poseidon
test_merkle

View File

@@ -0,0 +1,23 @@
# Build-and-run rules for the Merkle tree builder tests.
#   test_merkle_poseidon  BLS12-381 build (test.cu), linked against poseidon.o and merkle_bls.o
#   test_merkle           BabyBear build (test_poseidon2.cu), linked against poseidon2.o and merkle.o
#   clear                 removes every object file and test binary produced by the rules below
# The test sources are listed as prerequisites so that editing a test relinks its binary.
.PHONY: clear

test_merkle_poseidon: poseidon.o merkle_bls.o test.cu
	nvcc -o test_merkle_poseidon -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -DMERKLE_DEBUG poseidon.o merkle_bls.o test.cu
	./test_merkle_poseidon

merkle_bls.o: ../../extern.cu ../../merkle.cu
	nvcc -o merkle_bls.o -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -DMERKLE_DEBUG -c ../../extern.cu

poseidon.o: ../../../poseidon/extern.cu
	nvcc -o poseidon.o -I../../../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE=bls12_381 -c ../../../poseidon/extern.cu

test_merkle: poseidon2.o merkle.o test_poseidon2.cu
	nvcc -o test_merkle -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG poseidon2.o merkle.o test_poseidon2.cu
	./test_merkle

merkle.o: ../../extern.cu ../../merkle.cu
	nvcc -o merkle.o -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG -c ../../extern.cu

poseidon2.o: ../../../poseidon2/extern.cu
	nvcc -o poseidon2.o -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -c ../../../poseidon2/extern.cu

# The original list named test_merkle twice and left test_merkle_poseidon behind.
clear:
	rm -f poseidon2.o merkle.o test_merkle merkle_bls.o poseidon.o test_merkle_poseidon

View File

@@ -1,10 +1,3 @@
// #define DEBUG
#define MERKLE_DEBUG
#include "curves/curve_config.cuh"
#include "../poseidon.cu"
#include "merkle.cu"
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
@@ -12,15 +5,18 @@
#include <iostream>
#include <math.h>
using namespace poseidon;
using namespace merkle;
using namespace curve_config;
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#include "merkle-tree/merkle.cuh"
#include "poseidon/poseidon.cuh"
#include "api/bls12_381.h"
using namespace bls12_381;
// Arity
#define A 2
#define T (A + 1)
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
@@ -30,24 +26,24 @@ int main(int argc, char* argv[])
// Load poseidon constants
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(A, ctx, &constants);
poseidon::Poseidon<scalar_t> poseidon(A, ctx);
END_TIMER(timer_const, "Load poseidon constants");
/// Tree of height N and arity A contains \sum{A^i} for i in 0..N-1 elements
uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 28;
uint32_t number_of_leaves = pow(A, (tree_height - 1));
uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 26;
uint32_t number_of_leaves = pow(A, tree_height);
uint32_t total_number_of_leaves = number_of_leaves * A;
/// Use keep_rows to specify how many rows do you want to store
int keep_rows = argc > 2 ? atoi(argv[2]) : 7;
size_t digests_len = get_digests_len<scalar_t>(keep_rows + 1, A);
size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, A, 1);
/// Fill leaves with scalars [0, 1, ... 2^{tree_height - 1} - 1]
/// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
START_TIMER(timer_allocation);
scalar_t input = scalar_t::zero();
size_t leaves_mem = number_of_leaves * sizeof(scalar_t);
size_t leaves_mem = total_number_of_leaves * sizeof(scalar_t);
scalar_t* leaves = static_cast<scalar_t*>(malloc(leaves_mem));
for (uint32_t i = 0; i < number_of_leaves; i++) {
for (uint32_t i = 0; i < total_number_of_leaves; i++) {
leaves[i] = input;
input = input + scalar_t::one();
}
@@ -62,6 +58,7 @@ int main(int argc, char* argv[])
std::cout << "Memory for leaves = " << leaves_mem / 1024 / 1024 << " MB; " << leaves_mem / 1024 / 1024 / 1024 << " GB"
<< std::endl;
std::cout << "Number of leaves = " << number_of_leaves << std::endl;
std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
<< " GB" << std::endl;
std::cout << "Number of digest elements = " << digests_len << std::endl;
@@ -69,12 +66,17 @@ int main(int argc, char* argv[])
std::cout << "Total RAM consumption = " << (digests_mem + leaves_mem) / 1024 / 1024 << " MB; "
<< (digests_mem + leaves_mem) / 1024 / 1024 / 1024 << " GB" << std::endl;
TreeBuilderConfig config = default_merkle_config();
config.keep_rows = keep_rows;
merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
tree_config.arity = 2;
tree_config.keep_rows = keep_rows;
START_TIMER(timer_merkle);
build_merkle_tree<scalar_t, T>(leaves, digests, tree_height, constants, config);
bls12_381_build_merkle_tree(leaves, digests, tree_height, A, &poseidon, &poseidon, tree_config);
END_TIMER(timer_merkle, "Merkle tree built: ")
for (int i = 0; i < digests_len; i++) {
std::cout << digests[i] << std::endl;
}
// Use this to generate test vectors
// for (int i = 0; i < digests_len; i++) {
// std::cout << "{";

View File

@@ -0,0 +1,108 @@
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <math.h>
#include "merkle-tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
#include "api/babybear.h"
using namespace babybear;
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
/// Tree of height N and arity A contains \sum{A^i} for i in 0..N elements
uint32_t tree_arity = 2;
uint32_t width = 16;
uint32_t input_block_len = 8;
uint32_t digest_elements = 8;
uint64_t tree_height = argc > 1 ? atoi(argv[1]) : 23;
uint64_t number_of_leaves = pow(tree_arity, tree_height);
uint64_t total_number_of_leaves = number_of_leaves * input_block_len;
// Load poseidon constants
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
poseidon2::Poseidon2<scalar_t> poseidon(
width, input_block_len, poseidon2::MdsType::DEFAULT_MDS, poseidon2::DiffusionStrategy::DEFAULT_DIFFUSION, ctx);
END_TIMER(timer_const, "Load poseidon constants");
/// Use keep_rows to specify how many rows do you want to store
int keep_rows = argc > 2 ? atoi(argv[2]) : 3;
size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, tree_arity, digest_elements);
/// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
START_TIMER(timer_allocation);
scalar_t input = scalar_t::zero();
size_t leaves_mem = total_number_of_leaves * sizeof(scalar_t);
scalar_t* leaves = static_cast<scalar_t*>(malloc(leaves_mem));
for (uint64_t i = 0; i < total_number_of_leaves; i++) {
leaves[i] = input;
input = input + scalar_t::one();
}
END_TIMER(timer_allocation, "Allocated memory for leaves: ");
/// Allocate memory for digests of {keep_rows} rows of a tree
START_TIMER(timer_digests);
size_t digests_mem = digests_len * sizeof(scalar_t);
scalar_t* digests = static_cast<scalar_t*>(malloc(digests_mem));
END_TIMER(timer_digests, "Allocated memory for digests");
std::cout << "Memory for leaves = " << leaves_mem / 1024 / 1024 << " MB; " << leaves_mem / 1024 / 1024 / 1024 << " GB"
<< std::endl;
std::cout << "Number of leaves = " << number_of_leaves << std::endl;
std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
<< " GB" << std::endl;
std::cout << "Number of digest elements = " << digests_len << std::endl;
std::cout << "Total RAM consumption = " << (digests_mem + leaves_mem) / 1024 / 1024 << " MB; "
<< (digests_mem + leaves_mem) / 1024 / 1024 / 1024 << " GB" << std::endl;
merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
tree_config.arity = tree_arity;
tree_config.keep_rows = keep_rows;
tree_config.digest_elements = digest_elements;
START_TIMER(timer_merkle);
babybear_build_merkle_tree(leaves, digests, tree_height, input_block_len, &poseidon, &poseidon, tree_config);
END_TIMER(timer_merkle, "Merkle tree built: ")
for (int i = 0; i < digests_len; i++) {
// std::cout << digests[i] << std::endl;
}
// Use this to generate test vectors
// for (int i = 0; i < digests_len; i++) {
// std::cout << "{";
// for (int j = 0; j < 1; j++) {
// std::cout << ((uint32_t*)&digests[i].limbs_storage)[j];
// }
// std::cout << "}," << std::endl;
// }
scalar_t expected[64] = {
{1198029810}, {1114813365}, {241588005}, {735332587}, {201392606}, {623383436}, {60086186}, {1225304654},
{1501472115}, {891216097}, {184481194}, {855632748}, {1503541944}, {1483537725}, {1023563730}, {698957505},
{1322038939}, {1132881200}, {104782797}, {68847168}, {420051722}, {126069919}, {1350263697}, {1711085395},
{1322038939}, {1132881200}, {104782797}, {68847168}, {420051722}, {126069919}, {1350263697}, {1711085395},
{1019525203}, {127215304}, {1199733491}, {1473997036}, {548538385}, {364347137}, {570748364}, {426431873},
{926562920}, {6278762}, {1894248581}, {1304248433}, {1635020421}, {719342960}, {1373719279}, {700539301},
{708916911}, {925660920}, {994927540}, {1925434995}, {208534303}, {69614512}, {1701199215}, {1825115630}};
for (int i = 0; i < digests_len; i++) {
scalar_t root = digests[i];
assert(root == expected[i]);
}
free(digests);
free(leaves);
}
#endif

View File

@@ -0,0 +1,4 @@
mmcs.o
poseidon2.o
test_mmcs_poseidon2
vec_ops.o

View File

@@ -0,0 +1,15 @@
# Build-and-run rules for the MMCS (matrix commitment) Poseidon2 test on BabyBear.
#   test_merkle  links test_poseidon2.cu into test_mmcs_poseidon2 and runs it
#   clear        removes the object files and the test binary
# Neither target creates a file with its own name, so both are declared phony.
.PHONY: test_merkle clear

test_merkle: poseidon2.o mmcs.o vec_ops.o
	nvcc -o test_mmcs_poseidon2 -lineinfo -I../../../../include -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG poseidon2.o vec_ops.o mmcs.o test_poseidon2.cu
	./test_mmcs_poseidon2

mmcs.o: ../../extern_mmcs.cu ../../mmcs.cu
	nvcc -o mmcs.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -DMERKLE_DEBUG -c ../../extern_mmcs.cu

poseidon2.o: ../../../poseidon2/extern.cu
	nvcc -o poseidon2.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -c ../../../poseidon2/extern.cu

# The source is listed as a prerequisite so the object is rebuilt when it changes,
# matching the other object rules in this file.
vec_ops.o: ../../../vec_ops/extern.cu
	nvcc -o vec_ops.o -I../../../../include -lineinfo -DFIELD=babybear -DFIELD_ID=1001 -c ../../../vec_ops/extern.cu

clear:
	rm -f poseidon2.o mmcs.o vec_ops.o test_mmcs_poseidon2

View File

@@ -0,0 +1,139 @@
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <math.h>
#include "merkle-tree/merkle.cuh"
#include "poseidon2/poseidon2.cuh"
#include "api/babybear.h"
using namespace babybear;
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
/// Tree of height N and arity A contains \sum{A^i} for i in 0..N elements
uint32_t tree_arity = 2;
uint32_t width = 16;
uint32_t input_block_len = 600;
uint32_t rate = 8;
uint32_t digest_elements = 8;
uint32_t copied_matrices = 1;
uint64_t tree_height = argc > 1 ? atoi(argv[1]) : 3;
uint64_t number_of_leaves = pow(tree_arity, tree_height);
uint64_t total_number_of_leaves = number_of_leaves * input_block_len;
bool are_inputs_on_device = true;
// Load poseidon constants
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
poseidon2::Poseidon2<scalar_t> poseidon(
width, rate, poseidon2::MdsType::PLONKY, poseidon2::DiffusionStrategy::MONTGOMERY, ctx);
END_TIMER(timer_const, "Load poseidon constants");
/// Use keep_rows to specify how many rows do you want to store
int keep_rows = argc > 2 ? atoi(argv[2]) : 3;
size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, tree_arity, digest_elements);
/// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
START_TIMER(timer_allocation);
scalar_t input = scalar_t::zero();
// unsigned int number_of_inputs = tree_height * copied_matrices;
unsigned int number_of_inputs = 1;
Matrix<scalar_t>* leaves = static_cast<Matrix<scalar_t>*>(malloc(number_of_inputs * sizeof(Matrix<scalar_t>)));
uint64_t current_matrix_rows = number_of_leaves;
for (int i = 0; i < number_of_inputs; i++) {
uint64_t current_matrix_size = current_matrix_rows * input_block_len;
for (int j = 0; j < copied_matrices; j++) {
scalar_t* matrix = static_cast<scalar_t*>(malloc(current_matrix_size * sizeof(scalar_t)));
for (uint64_t k = 0; k < current_matrix_size; k++) {
matrix[k] = input;
input = input + scalar_t::one();
}
scalar_t* d_matrix;
if (are_inputs_on_device) {
cudaMalloc(&d_matrix, current_matrix_size * sizeof(scalar_t));
cudaMemcpy(d_matrix, matrix, current_matrix_size * sizeof(scalar_t), cudaMemcpyHostToDevice);
}
leaves[i * copied_matrices + j] = {
are_inputs_on_device ? d_matrix : matrix,
input_block_len,
current_matrix_rows,
};
}
current_matrix_rows /= tree_arity;
}
END_TIMER(timer_allocation, "Allocated memory for leaves: ");
/// Allocate memory for digests of {keep_rows} rows of a tree
START_TIMER(timer_digests);
size_t digests_mem = digests_len * sizeof(scalar_t);
scalar_t* digests = static_cast<scalar_t*>(malloc(digests_mem));
END_TIMER(timer_digests, "Allocated memory for digests");
// std::cout << "Memory for leaves = " << total_number_of_leaves * sizeof(scalar_t) / 1024 / 1024 << " MB; " <<
// leaves_mem / 1024 / 1024 / 1024 << " GB"
// << std::endl;
std::cout << "Number of leaves = " << number_of_leaves << std::endl;
std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
<< " GB" << std::endl;
std::cout << "Number of digest elements = " << digests_len << std::endl;
std::cout << std::endl;
// std::cout << "Total RAM consumption = " << (digests_mem + leaves_mem) / 1024 / 1024 << " MB; "
// << (digests_mem + leaves_mem) / 1024 / 1024 / 1024 << " GB" << std::endl;
merkle_tree::TreeBuilderConfig tree_config = merkle_tree::default_merkle_config();
tree_config.are_inputs_on_device = are_inputs_on_device;
tree_config.arity = tree_arity;
tree_config.keep_rows = keep_rows;
tree_config.digest_elements = digest_elements;
START_TIMER(timer_merkle);
babybear_mmcs_commit_cuda(leaves, number_of_inputs, digests, &poseidon, &poseidon, tree_config);
END_TIMER(timer_merkle, "Merkle tree built: ")
for (int i = 0; i < 10; i++) {
std::cout << digests[digests_len - i - 1] << std::endl;
}
// Use this to generate test vectors
// for (int i = 0; i < digests_len; i++) {
// std::cout << "{";
// for (int j = 0; j < 8; j++) {
// std::cout << ((uint64_t*)&digests[i].limbs_storage)[j];
// if (j != 7) { std::cout << ", "; }
// }
// std::cout << "}," << std::endl;
// }
/// These scalars are digests of top-7 rows of a Merkle tree.
/// Arity = 2, Tree height = 28, keep_rows = 7
/// They are aligned in the following format:
/// L-7 L-6 L-5 L-4 L-3 L-2 L-1
/// [0..63, 64..95, 96..111, 112..119, 120..123, 124..125, 126]
scalar_t expected[0] = {};
for (int i = 0; i < digests_len; i++) {
scalar_t root = digests[i];
// assert(root == expected[i]);
}
free(digests);
free(leaves);
}
#endif

View File

@@ -1,2 +1,5 @@
test_poseidon : test.cu poseidon.cu kernels.cu constants.cu nvcc - o test_poseidon - I../../ include - DFIELD_ID =
2 - DCURVE_ID = 2 test.cu./ test_poseidon
# Poseidon test binary for the BLS12-381 scalar field.
test_poseidon: test.cu
	nvcc -o test_poseidon -I../../include -DFIELD=bls12_381 -DFIELD_ID=2 -DCURVE_ID=2 -DDEVMODE -DDEBUG extern.cu test.cu
# Poseidon test binary for the M31 field. The output is named after the target so that it does not
# overwrite the BLS12-381 binary and make can tell when it is up to date.
test_poseidon_m31: test_m31.cu
	nvcc -o test_poseidon_m31 -I../../include -DFIELD=m31 -DFIELD_ID=1003 -DDEVMODE -DDEBUG extern.cu test_m31.cu

View File

@@ -1,4 +1,5 @@
#include "poseidon/poseidon.cuh"
#include "poseidon/constants.cuh"
#include "gpu-utils/device_context.cuh"
/// These are pre-calculated constants for different curves
#include "fields/id.h"
@@ -17,17 +18,25 @@ using namespace poseidon_constants_bw6_761;
#elif FIELD_ID == GRUMPKIN
#include "poseidon/constants/grumpkin_poseidon.h"
using namespace poseidon_constants_grumpkin;
#elif FIELD_ID == M31
#include "poseidon/constants/m31_poseidon.h"
using namespace poseidon_constants_m31;
#endif
namespace poseidon {
template <typename S>
cudaError_t create_optimized_poseidon_constants(
int arity,
int full_rounds_half,
int partial_rounds,
const S* constants,
device_context::DeviceContext& ctx,
PoseidonConstants<S>* poseidon_constants)
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const S* round_constants,
const S* mds_matrix,
const S* non_sparse_matrix,
const S* sparse_matrices,
const S domain_tag,
PoseidonConstants<S>* poseidon_constants,
device_context::DeviceContext& ctx)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = ctx.stream;
@@ -41,24 +50,33 @@ namespace poseidon {
S* d_constants;
CHK_IF_RETURN(cudaMallocAsync(&d_constants, sizeof(S) * constants_len, stream));
S* d_round_constants = d_constants;
S* d_mds_matrix = d_round_constants + round_constants_len;
S* d_non_sparse_matrix = d_mds_matrix + mds_matrix_len;
S* d_sparse_matrices = d_non_sparse_matrix + mds_matrix_len;
// Copy constants
CHK_IF_RETURN(cudaMemcpyAsync(d_constants, constants, sizeof(S) * constants_len, cudaMemcpyHostToDevice, stream));
S* round_constants = d_constants;
S* mds_matrix = round_constants + round_constants_len;
S* non_sparse_matrix = mds_matrix + mds_matrix_len;
S* sparse_matrices = non_sparse_matrix + mds_matrix_len;
// Pick the domain_tag accordingly
// For now, we only support Merkle tree mode
uint32_t tree_domain_tag_value = 1;
tree_domain_tag_value = (tree_domain_tag_value << (width - 1)) - tree_domain_tag_value;
S domain_tag = S::from(tree_domain_tag_value);
CHK_IF_RETURN(cudaMemcpyAsync(
d_round_constants, round_constants, sizeof(S) * round_constants_len, cudaMemcpyHostToDevice, stream));
CHK_IF_RETURN(
cudaMemcpyAsync(d_mds_matrix, mds_matrix, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice, stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_non_sparse_matrix, non_sparse_matrix, sizeof(S) * mds_matrix_len, cudaMemcpyHostToDevice, stream));
CHK_IF_RETURN(cudaMemcpyAsync(
d_sparse_matrices, sparse_matrices, sizeof(S) * sparse_matrices_len, cudaMemcpyHostToDevice, stream));
// Make sure all the constants have been copied
CHK_IF_RETURN(cudaStreamSynchronize(stream));
*poseidon_constants = {arity, partial_rounds, full_rounds_half, round_constants,
mds_matrix, non_sparse_matrix, sparse_matrices, domain_tag};
*poseidon_constants = {
arity,
alpha,
partial_rounds,
full_rounds_half,
d_round_constants,
d_mds_matrix,
d_non_sparse_matrix,
d_sparse_matrices,
domain_tag};
return CHK_LAST();
}
@@ -68,8 +86,8 @@ namespace poseidon {
int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* poseidon_constants)
{
CHK_INIT_IF_RETURN();
int full_rounds_half = FULL_ROUNDS_DEFAULT;
int partial_rounds;
unsigned int full_rounds_half = FULL_ROUNDS_DEFAULT;
unsigned int partial_rounds;
unsigned char* constants;
switch (arity) {
case 2:
@@ -94,8 +112,41 @@ namespace poseidon {
}
S* h_constants = reinterpret_cast<S*>(constants);
create_optimized_poseidon_constants(arity, full_rounds_half, partial_rounds, h_constants, ctx, poseidon_constants);
unsigned int width = arity + 1;
unsigned int round_constants_len = width * full_rounds_half * 2 + partial_rounds;
unsigned int mds_matrix_len = width * width;
S* round_constants = h_constants;
S* mds_matrix = round_constants + round_constants_len;
S* non_sparse_matrix = mds_matrix + mds_matrix_len;
S* sparse_matrices = non_sparse_matrix + mds_matrix_len;
// Pick the domain_tag accordingly
// For now, we only support Merkle tree mode
uint32_t tree_domain_tag_value = 1;
tree_domain_tag_value = (tree_domain_tag_value << (width - 1)) - tree_domain_tag_value;
S domain_tag = S::from(tree_domain_tag_value);
create_optimized_poseidon_constants<S>(
arity, 5, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
domain_tag, poseidon_constants, ctx);
return CHK_LAST();
}
/**
 * Releases the device memory referenced by a set of optimized Poseidon constants and clears the
 * descriptor so that it no longer points at freed memory.
 *
 * Only `round_constants` is passed to cudaFreeAsync: constants produced by
 * create_optimized_poseidon_constants keep all four arrays inside one allocation whose base address
 * is `round_constants`.
 *
 * @param constants Descriptor to release; its sizes are zeroed and its pointers set to nullptr.
 * @param ctx Device context whose stream the asynchronous free is enqueued on.
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
template <typename S>
cudaError_t release_optimized_poseidon_constants(PoseidonConstants<S>* constants, device_context::DeviceContext& ctx)
{
  CHK_INIT_IF_RETURN();

  cudaStream_t& release_stream = ctx.stream;
  CHK_IF_RETURN(cudaFreeAsync(constants->round_constants, release_stream));

  // Drop every pointer into the freed buffer.
  constants->sparse_matrices = nullptr;
  constants->non_sparse_matrix = nullptr;
  constants->mds_matrix = nullptr;
  constants->round_constants = nullptr;

  // Reset the shape parameters as well.
  constants->full_rounds_half = 0;
  constants->partial_rounds = 0;
  constants->arity = 0;

  return CHK_LAST();
}
} // namespace poseidon

View File

@@ -2,58 +2,68 @@
using namespace field_config;
#include "poseidon.cu"
#include "poseidon/poseidon.cuh"
#include "constants.cu"
#include "gpu-utils/device_context.cuh"
#include "utils/utils.h"
namespace poseidon {
/**
* Extern "C" version of [poseidon_hash_cuda] function with the following
* value of template parameter (where the field is given by `-DFIELD` env variable during build):
* - `S` is the [field](@ref scalar_t) - either a scalar field of the elliptic curve or a
* stand-alone "STARK field";
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_cuda)(
scalar_t* input,
scalar_t* output,
int number_of_states,
int arity,
const PoseidonConstants<scalar_t>& constants,
PoseidonConfig& config)
typedef class Poseidon<scalar_t> PoseidonInst;
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_create_cuda)(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t& domain_tag,
device_context::DeviceContext& ctx)
{
switch (arity) {
case 2:
return poseidon_hash<scalar_t, 3>(input, output, number_of_states, constants, config);
case 4:
return poseidon_hash<scalar_t, 5>(input, output, number_of_states, constants, config);
case 8:
return poseidon_hash<scalar_t, 9>(input, output, number_of_states, constants, config);
case 11:
return poseidon_hash<scalar_t, 12>(input, output, number_of_states, constants, config);
default:
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 4, 8, 11]");
try {
*poseidon = new PoseidonInst(
arity, alpha, partial_rounds, full_rounds_half, round_constants, mds_matrix, non_sparse_matrix, sparse_matrices,
domain_tag, ctx);
return cudaError_t::cudaSuccess;
} catch (const IcicleError& _error) {
return cudaError_t::cudaErrorUnknown;
}
return CHK_LAST();
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_optimized_poseidon_constants_cuda)(
int arity,
int full_rounds_half,
int partial_rounds,
const scalar_t* constants,
device_context::DeviceContext& ctx,
PoseidonConstants<scalar_t>* poseidon_constants)
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_load_cuda)(
PoseidonInst** poseidon, unsigned int arity, device_context::DeviceContext& ctx)
{
return create_optimized_poseidon_constants<scalar_t>(
arity, full_rounds_half, partial_rounds, constants, ctx, poseidon_constants);
try {
*poseidon = new PoseidonInst(arity, ctx);
return cudaError_t::cudaSuccess;
} catch (const IcicleError& _error) {
return cudaError_t::cudaErrorUnknown;
}
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_optimized_poseidon_constants_cuda)(
int arity, device_context::DeviceContext& ctx, PoseidonConstants<scalar_t>* constants)
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_hash_many_cuda)(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
const SpongeConfig& cfg)
{
return init_optimized_poseidon_constants<scalar_t>(arity, ctx, constants);
return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
}
/**
 * Extern "C" entry point that destroys a Poseidon instance created by the `poseidon_create_cuda` or
 * `poseidon_load_cuda` entry points of this file, both of which allocate the object with `new`.
 *
 * @param poseidon Instance to destroy; a null pointer is accepted and ignored.
 * @return `cudaSuccess` if the execution was successful and `cudaErrorUnknown` if an IcicleError was
 * thrown while destroying the instance.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon_delete_cuda)(PoseidonInst* poseidon)
{
  try {
    // `delete` runs the destructor and also returns the storage obtained with `new`. Calling the
    // destructor directly would leak that storage on every create/delete pair and is undefined
    // behaviour for a null handle.
    delete poseidon;
    return cudaError_t::cudaSuccess;
  } catch (const IcicleError& _error) {
    return cudaError_t::cudaErrorUnknown;
  }
}
} // namespace poseidon

View File

@@ -1,90 +0,0 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "poseidon/poseidon.cuh"
#include "kernels.cu"
namespace poseidon {
/**
 * Enqueues the three kernel launches of a permutation over `number_of_states` states on `stream`:
 * `full_rounds` with FIRST_FULL_ROUNDS, then `partial_rounds`, then `full_rounds` with
 * SECOND_FULL_ROUNDS. The launches are asynchronous; this function does not synchronize the stream.
 *
 * @tparam S Element type of the states.
 * @tparam T Number of elements per state (used for the shared-memory size and the offset stride).
 * @param states State buffer passed to every kernel.
 * NOTE(review): presumably device memory holding number_of_states * T elements - confirm with caller.
 * @param number_of_states Number of states to process.
 * @param constants Constants forwarded to the kernels; `full_rounds_half` and `partial_rounds` also
 * drive the offset arithmetic below.
 * @param stream Stream the kernels are launched on.
 * @return Result of CHK_LAST() after the launches have been enqueued.
 */
template <typename S, int T>
cudaError_t
permute_many(S* states, size_t number_of_states, const PoseidonConstants<S>& constants, cudaStream_t& stream)
{
// Offset handed to each kernel; presumably an index into the round constants (name suggests so).
size_t rc_offset = 0;
// First full-round launch; the third launch parameter requests dynamic shared memory for
// hashes_per_block states of T elements each.
full_rounds<S, T><<<
PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads,
sizeof(S) * PKC<T>::hashes_per_block * T, stream>>>(
states, number_of_states, rc_offset, FIRST_FULL_ROUNDS, constants);
// Advance by T elements for each of (full_rounds_half + 1) rounds.
rc_offset += T * (constants.full_rounds_half + 1);
// Partial-round launch: no dynamic shared memory, single-hash block configuration.
partial_rounds<S, T>
<<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
states, number_of_states, rc_offset, constants);
// Advance by one element per partial round.
rc_offset += constants.partial_rounds;
// Second full-round launch, same launch configuration as the first one.
full_rounds<S, T><<<
PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads,
sizeof(S) * PKC<T>::hashes_per_block * T, stream>>>(
states, number_of_states, rc_offset, SECOND_FULL_ROUNDS, constants);
return CHK_LAST();
}
/**
 * Hashes `number_of_states` inputs with Poseidon: prepares the states, runs permute_many, and writes
 * one element of type S per state to `output`. All work is enqueued on `config.ctx.stream`.
 *
 * @tparam S Element type.
 * @tparam T Number of elements per state; each input row carries T - 1 elements.
 * @param input Either the state buffer itself (when `config.input_is_a_state` is set) or rows of
 * T - 1 elements that are copied into a freshly allocated state buffer with a host-to-device copy.
 * @param output Destination of number_of_states results. Written directly by the kernels when
 * `config.are_outputs_on_device` is set, otherwise filled through a device-to-host copy.
 * @param number_of_states Number of states to hash.
 * @param constants Poseidon constants; `domain_tag` is passed to the state-preparation kernel.
 * @param config Flags read here: input_is_a_state, are_outputs_on_device, aligned, loop_state,
 * is_async, plus the device context that supplies the stream.
 * @return Error code of the first failing CUDA call, otherwise the result of the final check; the
 * stream is synchronized before returning only when `config.is_async` is false.
 */
template <typename S, int T>
cudaError_t poseidon_hash(
S* input, S* output, size_t number_of_states, const PoseidonConstants<S>& constants, const PoseidonConfig& config)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
S* states;
if (config.input_is_a_state) {
// The caller's buffer is used as the state buffer, so the kernels below operate on it in place.
states = input;
} else {
// allocate memory for {number_of_states} states of {t} scalars each
CHK_IF_RETURN(cudaMallocAsync(&states, number_of_states * T * sizeof(S), stream))
// This is where the input matrix of size Arity x NumberOfBlocks is
// padded and copied to device in a T x NumberOfBlocks matrix
CHK_IF_RETURN(cudaMemcpy2DAsync(
states, T * sizeof(S), // Device pointer and device pitch
input, (T - 1) * sizeof(S), // Host pointer and pitch
(T - 1) * sizeof(S), number_of_states, // Size of the source matrix (Arity x NumberOfBlocks)
cudaMemcpyHostToDevice, stream));
}
// Buffer the result kernel writes to: the caller's buffer or a temporary device allocation.
S* output_device;
if (config.are_outputs_on_device) {
output_device = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_states * sizeof(S), stream))
}
// Prepare the states before the permutation; receives the domain tag and the `aligned` flag.
prepare_poseidon_states<S, T>
<<<PKC<T>::number_of_full_blocks(number_of_states), PKC<T>::number_of_threads, 0, stream>>>(
states, number_of_states, constants.domain_tag, config.aligned);
cudaError_t hash_error = permute_many<S, T>(states, number_of_states, constants, stream);
CHK_IF_RETURN(hash_error);
// Extract one result per state into output_device.
get_hash_results<S, T>
<<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
states, number_of_states, output_device);
// Optional extra kernel over the states and the results.
// NOTE(review): presumably feeds the results back into the states for a follow-up call - confirm
// against the copy_recursive kernel.
if (config.loop_state) {
copy_recursive<S, T>
<<<PKC<T>::number_of_singlehash_blocks(number_of_states), PKC<T>::singlehash_block_size, 0, stream>>>(
states, number_of_states, output_device);
}
// Free the state buffer only when it was allocated above.
if (!config.input_is_a_state) CHK_IF_RETURN(cudaFreeAsync(states, stream));
if (!config.are_outputs_on_device) {
// Copy the results into the caller's buffer, then release the temporary device buffer.
CHK_IF_RETURN(
cudaMemcpyAsync(output, output_device, number_of_states * sizeof(S), cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
}
// In synchronous mode wait for all enqueued work before reporting the status.
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
} // namespace poseidon

View File

@@ -4,7 +4,6 @@
using namespace curve_config;
#include "gpu-utils/device_context.cuh"
#include "poseidon.cu"
#ifndef __CUDA_ARCH__
#include <cassert>
@@ -12,6 +11,10 @@ using namespace curve_config;
#include <fstream>
#include <iostream>
#include "api/bls12_381.h"
using namespace bls12_381;
#include "poseidon/poseidon.cuh"
using namespace poseidon;
#define A 2
@@ -29,8 +32,7 @@ int main(int argc, char* argv[])
// Load poseidon constants
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(A, ctx, &constants);
Poseidon<scalar_t> poseidon(A, ctx);
END_TIMER(timer_const, "Load poseidon constants");
START_TIMER(allocation_timer);
@@ -46,9 +48,10 @@ int main(int argc, char* argv[])
scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
SpongeConfig cfg = default_sponge_config();
START_TIMER(poseidon_timer);
PoseidonConfig config = default_poseidon_config(T);
poseidon_hash<curve_config::scalar_t, T>(in_ptr, out_ptr, number_of_blocks, constants, config);
poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, A, 1, cfg);
END_TIMER(poseidon_timer, "Poseidon")
scalar_t expected[1024] = {
@@ -1080,7 +1083,7 @@ int main(int argc, char* argv[])
if (number_of_blocks == 1024) {
for (int i = 0; i < number_of_blocks; i++) {
#ifdef DEBUG
std::cout << out_ptr[i] << std::endl;
// std::cout << out_ptr[i] << std::endl;
#endif
assert((out_ptr[i] == expected[i]));
}

View File

@@ -0,0 +1,70 @@
// #define DEBUG
#include "fields/field_config.cuh"
using namespace field_config;
#include "gpu-utils/device_context.cuh"
#include "poseidon/poseidon.cuh"
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
using namespace poseidon;
#define A 11
#define T (A + 1)
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
// Load poseidon constants
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
PoseidonConstants<scalar_t> constants;
init_optimized_poseidon_constants<scalar_t>(A, ctx, &constants);
END_TIMER(timer_const, "Load poseidon constants");
START_TIMER(allocation_timer);
// Prepare input data of [0, 1, 2 ... (number_of_blocks * arity) - 1]
int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
scalar_t input = scalar_t::zero();
scalar_t* in_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * A * sizeof(scalar_t)));
for (uint32_t i = 0; i < number_of_blocks * A; i++) {
in_ptr[i] = input;
input = input + scalar_t::one();
}
END_TIMER(allocation_timer, "Allocate mem and fill input");
scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
START_TIMER(poseidon_timer);
PoseidonConfig config = default_poseidon_config(T);
poseidon_hash<field_config::scalar_t, T>(in_ptr, out_ptr, number_of_blocks, constants, config);
END_TIMER(poseidon_timer, "Poseidon")
// scalar_t expected[0] = {}
if (number_of_blocks == 1024) {
for (int i = 0; i < number_of_blocks; i++) {
#ifdef DEBUG
// std::cout << out_ptr[i] << std::endl;
#endif
// assert((out_ptr[i] == expected[i]));
}
printf("Expected output matches\n");
}
free(in_ptr);
free(out_ptr);
}
#endif

View File

@@ -1,3 +0,0 @@
test_merkle:
nvcc -o test_merkle -I../../../include -DFIELD_ID=2 -DCURVE_ID=2 test.cu
./test_merkle

View File

@@ -1,284 +0,0 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "poseidon/tree/merkle.cuh"
namespace merkle {
/// Counts the digests stored for a tree of the given `height` and `arity`
/// when every row is laid out back to back in one flat buffer.
///
/// The count covers `height - 1` rows whose sizes are 1, arity, arity^2, ...
/// (the widest row of the tree is not included), i.e.
/// 1 + arity + ... + arity^(height - 2). Heights 0 and 1 yield 0.
///
/// The template parameter `S` is not used by the computation.
template <typename S>
size_t get_digests_len(uint32_t height, uint32_t arity)
{
  size_t total = 0;
  size_t nodes_in_row = 1;
  uint32_t rows_left = height > 0 ? height - 1 : 0;
  while (rows_left > 0) {
    total += nodes_in_row;
    nodes_in_row *= arity;
    --rows_left;
  }
  return total;
}
/// Constructs merkle subtree without parallelization
/// The digests are aligned sequentially per row
/// Example:
///
/// Big tree:
///
/// 1
/// / \
/// 2 3
/// / \ / \
/// 4 5 6 7
///
/// Subtree 1 Subtree 2
/// 2 3
/// / \ / \
/// 4 5 6 7
///
/// Digests array for subtree 1:
/// [4 5 . . 2 . .]
/// | | |
/// ----- V
/// | Segment (offset = 4, subtree_idx = 0)
/// v
/// Segment (offset = 0, subtree_idx = 0)
///
/// Digests array for subtree 2:
/// [. . 6 7 . 3 .]
/// | |
/// -----
/// |
/// v
/// Segment (offset = 0, subtree_idx = 1)
///
/// Total digests array:
/// [4 5 6 7 2 3 .]
///
/// Hashes one subtree row by row, from its widest row up to its root, on `stream`.
///
/// @param state                Device buffer holding the Poseidon states of the subtree's first row.
/// @param digests              Device scratch buffer receiving the digests of the row being hashed.
/// @param subtree_idx          Index of this subtree; scales the write position inside each row segment.
/// @param subtree_height       Height of the subtree; decremented locally as rows are consumed.
/// @param big_tree_digests     Destination for kept rows. It is written with cudaMemcpyDeviceToHost,
///                             so it must be a host pointer.
/// @param start_segment_size   Size (in elements) of the first row segment in `big_tree_digests`.
/// @param start_segment_offset Offset (in elements) of the first row segment in `big_tree_digests`.
/// @param keep_rows            0 keeps every row; otherwise a row is copied out only once
///                             `subtree_height <= keep_rows + 1`.
/// @param poseidon             Poseidon constants passed through to poseidon_hash.
/// @param stream               Stream used for the hash launches and the copies.
/// @return The first failing CUDA status, otherwise the result of CHK_LAST().
template <typename S, int T>
cudaError_t build_merkle_subtree(
S* state,
S* digests,
size_t subtree_idx,
size_t subtree_height,
S* big_tree_digests,
size_t start_segment_size,
size_t start_segment_offset,
int keep_rows,
const PoseidonConstants<S>& poseidon,
cudaStream_t& stream)
{
int arity = T - 1;
// Everything stays on the device: the input buffer is used directly as the state
// and, via loop_state, is also where the next row's input comes from.
PoseidonConfig config = default_poseidon_config(T);
config.are_inputs_on_device = true;
config.are_outputs_on_device = true;
config.input_is_a_state = true;
config.loop_state = true;
config.ctx.stream = stream;
// NOTE(review): pow() works in floating point; presumably exact for the tree sizes used here - confirm.
size_t leaves_size = pow(arity, subtree_height - 1);
uint32_t number_of_blocks = leaves_size / arity;
size_t segment_size = start_segment_size;
size_t segment_offset = start_segment_offset;
// One iteration per row; the number of hashes shrinks by a factor of `arity` each time.
while (number_of_blocks > 0) {
cudaError_t poseidon_res = poseidon_hash<S, T>(state, digests, number_of_blocks, poseidon, config);
CHK_IF_RETURN(poseidon_res);
// Copy this row out only if it is one of the rows the caller asked to keep.
if (!keep_rows || subtree_height <= keep_rows + 1) {
// Each subtree owns a contiguous slice of `number_of_blocks` digests inside the row's segment.
S* digests_with_offset = big_tree_digests + segment_offset + subtree_idx * number_of_blocks;
CHK_IF_RETURN(
cudaMemcpyAsync(digests_with_offset, digests, number_of_blocks * sizeof(S), cudaMemcpyDeviceToHost, stream));
segment_offset += segment_size;
}
segment_size /= arity;
subtree_height--;
number_of_blocks /= arity;
// Set for every row after the first one.
config.aligned = true;
}
return CHK_LAST();
}
/// Builds a Poseidon Merkle tree of the given `height` with arity `T - 1`.
///
/// The tree is split into equally sized subtrees small enough that the state and
/// digests of one subtree fit in STREAM_CHUNK_SIZE bytes. Subtrees are hashed on a
/// pool of worker streams; if more than one subtree was needed, the remaining top
/// ("cap") tree is then hashed on the caller's stream.
///
/// @param leaves   Input leaves; host or device memory according to `config.are_inputs_on_device`.
/// @param digests  Output digests. Rows are written with device-to-host copies, so this
///                 must be a host pointer.
/// @param height   Height of the full tree.
/// @param poseidon Poseidon constants for width `T`.
/// @param config   Tree builder configuration (stream, keep_rows, is_async, ...).
/// @return The first failing CUDA status, otherwise success.
///
/// Worker streams and their bookkeeping array are released on both the synchronous
/// and the asynchronous path.
template <typename S, int T>
cudaError_t build_merkle_tree(
  const S* leaves,
  S* digests,
  uint32_t height,
  const poseidon::PoseidonConstants<S>& poseidon,
  const TreeBuilderConfig& config)
{
  CHK_INIT_IF_RETURN();
  cudaStream_t& stream = config.ctx.stream;

  int arity = T - 1;
  uint32_t number_of_leaves = pow(arity, (height - 1));

  // This will determine how much splitting do we need to do
  // `number_of_streams` subtrees should fit in the device
  // This means each subtree should fit in `STREAM_CHUNK_SIZE` memory
  uint32_t number_of_subtrees = 1;
  uint32_t subtree_height = height;
  uint32_t subtree_leaves_size = pow(arity, height - 1);
  uint32_t subtree_state_size = subtree_leaves_size / arity * T;
  uint32_t subtree_digests_size = get_digests_len<S>(subtree_height, arity);
  size_t subtree_memory_required = sizeof(S) * (subtree_state_size + subtree_digests_size);
  while (subtree_memory_required > STREAM_CHUNK_SIZE) {
    number_of_subtrees *= arity;
    subtree_height--;
    subtree_leaves_size /= arity;
    subtree_state_size = subtree_leaves_size / arity * T;
    subtree_digests_size = subtree_state_size / arity;
    subtree_memory_required = sizeof(S) * (subtree_state_size + subtree_digests_size);
  }
  int cap_height = height - subtree_height + 1;
  size_t caps_len = pow(arity, cap_height - 1);

  size_t available_memory, _total_memory;
  CHK_IF_RETURN(cudaMemGetInfo(&available_memory, &_total_memory));
  // NOTE(review): this subtraction wraps if less than 128 MB is free, and
  // `number_of_streams` below becomes 0 (modulo by zero later) if less than one
  // chunk is free. Both cases are left as in the original; confirm how they should fail.
  available_memory -= GIGA / 8; // Leave 128 MB

  // We can effectively parallelize memory copy with streams
  // as long as they don't operate on more than `STREAM_CHUNK_SIZE` bytes
  const size_t number_of_streams = std::min((uint32_t)(available_memory / STREAM_CHUNK_SIZE), number_of_subtrees);
  cudaStream_t* streams = static_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t) * number_of_streams));
  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamCreate(&streams[i]));
  }

#if !defined(__CUDA_ARCH__) && defined(MERKLE_DEBUG)
  std::cout << "Available memory = " << available_memory / 1024 / 1024 << " MB" << std::endl;
  std::cout << "Number of streams = " << number_of_streams << std::endl;
  std::cout << "Number of subtrees = " << number_of_subtrees << std::endl;
  std::cout << "Height of a subtree = " << subtree_height << std::endl;
  std::cout << "Cutoff height = " << height - subtree_height + 1 << std::endl;
  std::cout << "Number of leaves in a subtree = " << subtree_leaves_size << std::endl;
  std::cout << "State of a subtree = " << subtree_state_size << std::endl;
  std::cout << "Digest elements for a subtree = " << get_digests_len<S>(subtree_height, arity) << std::endl;
  std::cout << "Size of 1 subtree states = " << subtree_state_size * sizeof(S) / 1024 / 1024 << " MB" << std::endl;
  std::cout << "Size of 1 subtree digests = " << subtree_digests_size * sizeof(S) / 1024 / 1024 << " MB" << std::endl;
#endif

  // Allocate memory for the leaves and digests
  // These are shared by streams in a pool
  S *states_ptr, *digests_ptr;
  CHK_IF_RETURN(cudaMallocAsync(&states_ptr, subtree_state_size * number_of_streams * sizeof(S), stream))
  CHK_IF_RETURN(cudaMallocAsync(&digests_ptr, subtree_digests_size * number_of_streams * sizeof(S), stream))
  // Wait for these allocations to finish
  CHK_IF_RETURN(cudaStreamSynchronize(stream));

  // In caps mode the subtree roots are not part of the kept rows, so they are
  // collected in a temporary host buffer instead of the caller's `digests`.
  bool caps_mode = config.keep_rows && config.keep_rows < cap_height;
  S* caps;
  if (caps_mode) { caps = static_cast<S*>(malloc(caps_len * sizeof(S))); }

  for (size_t subtree_idx = 0; subtree_idx < number_of_subtrees; subtree_idx++) {
    size_t stream_idx = subtree_idx % number_of_streams;
    cudaStream_t subtree_stream = streams[stream_idx];

    const S* subtree_leaves = leaves + subtree_idx * subtree_leaves_size;
    S* subtree_state = states_ptr + stream_idx * subtree_state_size;
    S* subtree_digests = digests_ptr + stream_idx * subtree_digests_size;

    // Copy the first level from RAM / device to device
    // The pitch property of cudaMemcpy2D resolves shape differences
    CHK_IF_RETURN(cudaMemcpy2DAsync(
      subtree_state, T * sizeof(S),      // Device pointer and device pitch
      subtree_leaves, arity * sizeof(S), // Host pointer and pitch
      arity * sizeof(S),                 // Size of the source matrix (Arity)
      subtree_leaves_size / arity,       // Size of the source matrix (Number of blocks)
      config.are_inputs_on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice, subtree_stream));

    int subtree_keep_rows = 0;
    if (config.keep_rows) {
      int diff = config.keep_rows - cap_height + 1;
      subtree_keep_rows = diff <= 0 ? 1 : diff;
    }
    size_t start_segment_size = number_of_leaves / arity;
    cudaError_t subtree_result = build_merkle_subtree<S, T>(
      subtree_state,              // state
      subtree_digests,            // digests
      subtree_idx,                // subtree_idx
      subtree_height,             // subtree_height
      caps_mode ? caps : digests, // big_tree_digests
      start_segment_size,         // start_segment_size
      0,                          // start_segment_offset
      subtree_keep_rows,          // keep_rows
      poseidon,                   // hash
      subtree_stream              // stream
    );
    CHK_IF_RETURN(subtree_result);
  }

  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
  }

  // Finish the top-level tree if any
  if (cap_height > 1) {
    size_t start_segment_size = caps_len / arity;
    size_t start_segment_offset = 0;
    if (!caps_mode) {
      size_t layer_size = pow(arity, config.keep_rows - 1);
      for (int i = 0; i < config.keep_rows - cap_height + 1; i++) {
        start_segment_offset += layer_size;
        layer_size /= arity;
      }
    }
    CHK_IF_RETURN(cudaMemcpy2DAsync(
      states_ptr, T * sizeof(S), caps_mode ? caps : (digests + start_segment_offset - caps_len), arity * sizeof(S),
      arity * sizeof(S),
      caps_len / arity,                 // Size of the source
      cudaMemcpyHostToDevice, stream)); // Direction and stream

    cudaError_t top_tree_result = build_merkle_subtree<S, T>(
      states_ptr,           // state
      digests_ptr,          // digests
      0,                    // subtree_idx
      cap_height,           // subtree_height
      digests,              // big_tree_digests
      start_segment_size,   // start_segment_size
      start_segment_offset, // start_segment_offset
      config.keep_rows,     // keep_rows
      poseidon,             // hash
      stream                // stream
    );
    CHK_IF_RETURN(top_tree_result);
    if (caps_mode) { free(caps); }
  }

  CHK_IF_RETURN(cudaFreeAsync(states_ptr, stream));
  CHK_IF_RETURN(cudaFreeAsync(digests_ptr, stream));

  // Release the worker streams before returning on either path. Their work was
  // already synchronized above and the cap tree only uses `stream`, so the
  // synchronize here is just a cheap safety net before destruction.
  for (size_t i = 0; i < number_of_streams; i++) {
    CHK_IF_RETURN(cudaStreamSynchronize(streams[i]));
    CHK_IF_RETURN(cudaStreamDestroy(streams[i]));
  }
  free(streams);

  if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
  return CHK_LAST();
}
/// C entry point for the Poseidon Merkle tree builder.
///
/// Maps the runtime `arity` onto the matching compile-time width (`arity + 1`)
/// and forwards to build_merkle_tree. Supported arities are 2, 4, 8 and 11;
/// any other value raises an InvalidArgument ICICLE error.
extern "C" cudaError_t CONCAT_EXPAND(FIELD, build_poseidon_merkle_tree)(
  const scalar_t* leaves,
  scalar_t* digests,
  uint32_t height,
  int arity,
  PoseidonConstants<scalar_t>& constants,
  TreeBuilderConfig& config)
{
// One switch case per supported arity; the template width is always arity + 1.
#define MERKLE_TREE_FOR_ARITY(supported_arity)                                                                        \
  case supported_arity:                                                                                               \
    return build_merkle_tree<scalar_t, (supported_arity) + 1>(leaves, digests, height, constants, config);

  switch (arity) {
    MERKLE_TREE_FOR_ARITY(2)
    MERKLE_TREE_FOR_ARITY(4)
    MERKLE_TREE_FOR_ARITY(8)
    MERKLE_TREE_FOR_ARITY(11)
  default:
    THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "BuildPoseidonMerkleTree: #arity must be one of [2, 4, 8, 11]");
  }

#undef MERKLE_TREE_FOR_ARITY
  return CHK_LAST();
}
} // namespace merkle

View File

@@ -1,7 +1,5 @@
test_poseidon: test.cu poseidon.cu kernels.cu constants.cu
nvcc -o test_poseidon -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 -DDEVMODE -DDEBUG extern.cu test.cu
./test_poseidon
test_poseidon: test.cu
nvcc -o test_poseidon -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 extern.cu test.cu
test_poseidon_release: test.cu poseidon.cu kernels.cu constants.cu
nvcc -o test_poseidon_release -I../../include -DFIELD=bn254 -DFIELD_ID=1 -DCURVE_ID=1 extern.cu test.cu
./test_poseidon_release
test_poseidon_m31: test_m31.cu
nvcc -o test_poseidon_m31 -I../../include -DFIELD=m31 -DFIELD_ID=1003 extern.cu test_m31.cu

View File

@@ -1,4 +1,5 @@
#include "poseidon2/poseidon2.cuh"
#include "poseidon2/constants.cuh"
#include "gpu-utils/device_context.cuh"
/// These are pre-calculated constants for different curves
#include "fields/id.h"
@@ -20,6 +21,9 @@ using namespace poseidon2_constants_grumpkin;
#elif FIELD_ID == BABY_BEAR
#include "poseidon2/constants/babybear_poseidon2.h"
using namespace poseidon2_constants_babybear;
#elif FIELD_ID == M31
#include "poseidon2/constants/m31_poseidon2.h"
using namespace poseidon2_constants_m31;
#endif
namespace poseidon2 {
@@ -36,7 +40,6 @@ namespace poseidon2 {
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon_constants)
{
cudaFree(nullptr); // Temporary solution
if (!(alpha == 3 || alpha == 5 || alpha == 7 || alpha == 11)) {
THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "Invalid alpha value");
}
@@ -78,7 +81,6 @@ namespace poseidon2 {
device_context::DeviceContext& ctx,
Poseidon2Constants<S>* poseidon2_constants)
{
cudaFree(nullptr); // Temporary solution
CHK_INIT_IF_RETURN();
#define P2_CONSTANTS_DEF(width) \
@@ -121,7 +123,6 @@ namespace poseidon2 {
cudaError_t release_poseidon2_constants(Poseidon2Constants<S>* constants, device_context::DeviceContext& ctx)
{
CHK_INIT_IF_RETURN();
CHK_IF_RETURN(cudaFreeAsync(constants->round_constants, ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(constants->internal_matrix_diag, ctx.stream));
constants->alpha = 0;

View File

@@ -3,67 +3,71 @@
#include "fields/field_config.cuh"
using namespace field_config;
#include "poseidon.cu"
#include "gpu-utils/error_handler.cuh"
#include "poseidon2/poseidon2.cuh"
#include "./constants.cu"
namespace poseidon2 {
extern "C" cudaError_t CONCAT_EXPAND(FIELD, create_poseidon2_constants_cuda)(
int width,
int alpha,
int internal_rounds,
int external_rounds,
template class Poseidon2<scalar_t>;
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_create_cuda)(
Poseidon2<scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
unsigned int alpha,
unsigned int internal_rounds,
unsigned int external_rounds,
const scalar_t* round_constants,
const scalar_t* internal_matrix_diag,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<scalar_t>* poseidon_constants)
device_context::DeviceContext& ctx)
{
return create_poseidon2_constants<scalar_t>(
width, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type, diffusion, ctx,
poseidon_constants);
try {
*poseidon = new Poseidon2<scalar_t>(
width, rate, alpha, internal_rounds, external_rounds, round_constants, internal_matrix_diag, mds_type,
diffusion, ctx);
return cudaError_t::cudaSuccess;
} catch (const IcicleError& _error) {
return cudaError_t::cudaErrorUnknown;
}
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, init_poseidon2_constants_cuda)(
int width,
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_load_cuda)(
Poseidon2<scalar_t>** poseidon,
unsigned int width,
unsigned int rate,
MdsType mds_type,
DiffusionStrategy diffusion,
device_context::DeviceContext& ctx,
Poseidon2Constants<scalar_t>* constants)
device_context::DeviceContext& ctx)
{
return init_poseidon2_constants<scalar_t>(width, mds_type, diffusion, ctx, constants);
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_hash_cuda)(
const scalar_t* input,
scalar_t* output,
int number_of_states,
int width,
const Poseidon2Constants<scalar_t>* constants,
Poseidon2Config* config)
{
#define P2_HASH_T(width) \
case width: \
return poseidon2_hash<scalar_t, width>(input, output, number_of_states, *constants, *config);
switch (width) {
P2_HASH_T(2)
P2_HASH_T(3)
P2_HASH_T(4)
P2_HASH_T(8)
P2_HASH_T(12)
P2_HASH_T(16)
P2_HASH_T(20)
P2_HASH_T(24)
default:
THROW_ICICLE_ERR(
IcicleError_t::InvalidArgument, "PoseidonHash: #arity must be one of [2, 3, 4, 8, 12, 16, 20, 24]");
try {
*poseidon = new Poseidon2<scalar_t>(width, rate, mds_type, diffusion, ctx);
return cudaError_t::cudaSuccess;
} catch (const IcicleError& _error) {
return cudaError_t::cudaErrorUnknown;
}
return CHK_LAST();
}
extern "C" cudaError_t CONCAT_EXPAND(FIELD, release_poseidon2_constants_cuda)(
Poseidon2Constants<scalar_t>* constants, device_context::DeviceContext& ctx)
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_hash_many_cuda)(
const Poseidon2<scalar_t>* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
hash::SpongeConfig& cfg)
{
return release_poseidon2_constants<scalar_t>(constants, ctx);
return poseidon->hash_many(inputs, output, number_of_states, input_block_len, output_len, cfg);
}
/// Destroys a Poseidon2 instance created by the `poseidon2_create_cuda` /
/// `poseidon2_load_cuda` entry points and frees its storage.
///
/// Those entry points allocate the object with `new`, so it has to be released
/// with `delete`: calling only the destructor would run the cleanup but leak the
/// heap allocation. Passing a null pointer is a no-op.
///
/// @param poseidon Instance to destroy; must not be used afterwards.
/// @return cudaSuccess, or cudaErrorUnknown if destruction raised an IcicleError.
extern "C" cudaError_t CONCAT_EXPAND(FIELD, poseidon2_delete_cuda)(Poseidon2<scalar_t>* poseidon)
{
  try {
    delete poseidon;
    return cudaError_t::cudaSuccess;
  } catch (const IcicleError& _error) {
    return cudaError_t::cudaErrorUnknown;
  }
}
} // namespace poseidon2

View File

@@ -1,80 +0,0 @@
#include "poseidon2/poseidon2.cuh"
#include "constants.cu"
#include "kernels.cu"
namespace poseidon2 {
static int poseidon_block_size = 128;

/// Returns how many thread blocks of `poseidon_block_size` threads are needed to
/// cover `number_of_states` states (one thread per state), rounding up.
/// The template parameters are not used by the computation.
template <typename S, int T>
int poseidon_number_of_blocks(size_t number_of_states)
{
  size_t full_blocks = number_of_states / poseidon_block_size;
  const bool has_partial_block = (number_of_states % poseidon_block_size) != 0;
  if (has_partial_block) { ++full_blocks; }
  return full_blocks;
}
/// Launches the Poseidon2 permutation kernel over `number_of_states` states on `stream`.
///
/// @param states           Input states passed to the kernel.
/// @param states_out       Output states passed to the kernel (may alias `states`;
///                         poseidon2_hash calls it that way).
/// @param number_of_states Number of states to permute.
/// @param constants        Poseidon2 constants handed to the kernel by value.
/// @param stream           Stream the kernel is enqueued on.
/// @return The launch error reported by cudaPeekAtLastError, otherwise CHK_LAST().
template <typename S, int T>
cudaError_t permute_many(
  const S* states,
  S* states_out,
  size_t number_of_states,
  const Poseidon2Constants<S>& constants,
  cudaStream_t& stream)
{
  const int grid_size = poseidon_number_of_blocks<S, T>(number_of_states);
  poseidon2_permutation_kernel<S, T>
    <<<grid_size, poseidon_block_size, 0, stream>>>(states, states_out, number_of_states, constants);

  // A kernel launch does not return a status; pick up launch failures explicitly.
  CHK_IF_RETURN(cudaPeekAtLastError());
  return CHK_LAST();
}
/// Runs the Poseidon2 permutation over `number_of_states` states of `T` elements each
/// and writes either one element per state (COMPRESSION mode) or the full permuted
/// states (any other mode) to `output`.
///
/// @param states           `number_of_states * T` elements; on the device if
///                         `config.are_states_on_device`, otherwise on the host.
/// @param output           Result buffer; on the device if `config.are_outputs_on_device`.
/// @param number_of_states Number of states to process.
/// @param constants        Poseidon2 constants for width `T`.
/// @param config           Memory locations, mode, output index, stream and sync behaviour.
/// @return The first failing CUDA status, otherwise success.
///
/// NOTE: when the states are already on the device they are permuted in place, so the
/// caller's `states` buffer is modified despite the `const` qualifier.
template <typename S, int T>
cudaError_t poseidon2_hash(
const S* states,
S* output,
size_t number_of_states,
const Poseidon2Constants<S>& constants,
const Poseidon2Config& config)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
S* d_states;
if (config.are_states_on_device) {
// Work directly on the caller's device buffer (it will be overwritten below).
d_states = const_cast<S*>(states);
} else {
// allocate memory for {number_of_states} states of {t} scalars each
CHK_IF_RETURN(cudaMallocAsync(&d_states, number_of_states * T * sizeof(S), stream))
CHK_IF_RETURN(cudaMemcpyAsync(d_states, states, number_of_states * T * sizeof(S), cudaMemcpyHostToDevice, stream))
}
// In-place permutation: the same buffer is passed as input and output.
cudaError_t hash_error = permute_many<S, T>(d_states, d_states, number_of_states, constants, stream);
CHK_IF_RETURN(hash_error);
if (config.mode == PoseidonMode::COMPRESSION) {
// Compression: extract one element (config.output_index) from every state.
S* output_device;
if (config.are_outputs_on_device) {
output_device = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_states * sizeof(S), stream))
}
get_hash_results<S, T><<<poseidon_number_of_blocks<S, T>(number_of_states), poseidon_block_size, 0, stream>>>(
d_states, number_of_states, config.output_index, output_device);
CHK_IF_RETURN(cudaPeekAtLastError());
if (!config.are_outputs_on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(output, output_device, number_of_states * sizeof(S), cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
}
} else {
// Full-state output. Nothing is copied when both states and outputs are on the device,
// so in that case the result is only available in the (in-place permuted) `states` buffer.
// NOTE(review): the copy below always uses cudaMemcpyDeviceToHost, including when only the
// states are on the host and `output` is a device pointer - confirm that combination is intended.
if (!config.are_states_on_device || !config.are_outputs_on_device) {
CHK_IF_RETURN(
cudaMemcpyAsync(output, d_states, number_of_states * T * sizeof(S), cudaMemcpyDeviceToHost, stream));
}
}
// Only free the staging buffer this function allocated itself.
if (!config.are_states_on_device) CHK_IF_RETURN(cudaFreeAsync(d_states, stream));
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
} // namespace poseidon2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,88 @@
#include "gpu-utils/device_context.cuh"
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include "poseidon2/poseidon2.cuh"
using namespace poseidon2;
#include "fields/field_config.cuh"
using namespace field_config;
#include "hash/hash.cuh"
#define T 16
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
int main(int argc, char* argv[])
{
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
// Load poseidon
START_TIMER(timer_const);
device_context::DeviceContext ctx = device_context::get_default_device_context();
Poseidon2<scalar_t> poseidon(T, T, MdsType::DEFAULT_MDS, DiffusionStrategy::DEFAULT_DIFFUSION, ctx);
END_TIMER(timer_const, "Load poseidon constants");
int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
scalar_t* in_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * T * sizeof(scalar_t)));
scalar_t* out_ptr = static_cast<scalar_t*>(malloc(number_of_blocks * sizeof(scalar_t)));
scalar_t input = scalar_t::zero();
hash::SpongeConfig cfg = hash::default_sponge_config();
size_t number_of_repetitions = argc > 2 ? 1 << atoi(argv[2]) : 32;
// Prepare input data of [0, 1, 2 ... (number_of_blocks * arity) - 1]
for (uint32_t i = 0; i < number_of_blocks * T; i++) {
in_ptr[i] = input;
input = input + scalar_t::one();
}
// Warm up
poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, T, 1, cfg);
auto total_time_start = std::chrono::high_resolution_clock::now();
size_t avg_time = 0;
for (int i = 0; i < number_of_repetitions; i++) {
auto poseidon_start = std::chrono::high_resolution_clock::now();
poseidon.hash_many(in_ptr, out_ptr, number_of_blocks, T, 1, cfg);
avg_time += FpMilliseconds(std::chrono::high_resolution_clock::now() - poseidon_start).count();
}
auto total_time = FpMilliseconds(std::chrono::high_resolution_clock::now() - total_time_start).count();
std::cout << "Block size: " << number_of_blocks << std::endl;
std::cout << "Total time: " << total_time << " ms" << std::endl;
std::cout << "Avg time: " << avg_time / number_of_repetitions << " ms" << std::endl;
// for (int i = 0; i < number_of_blocks; i++) {
// std::cout << "{";
// for (int j = 0; j < 8; j++) {
// std::cout << ((uint32_t*)&out_ptr[i].limbs_storage)[j];
// if (j != 7) { std::cout << ", "; }
// }
// std::cout << "}," << std::endl;
// }
if (number_of_blocks == 1024) {
for (int i = 0; i < number_of_blocks; i++) {
#ifdef DEBUG
// std::cout << out_ptr[i] << std::endl;
#endif
// assert((out_ptr[i] == expected[i]));
}
printf("Expected output matches\n");
}
free(in_ptr);
free(out_ptr);
}
#endif

Binary file not shown.

View File

@@ -165,7 +165,7 @@ namespace vec_ops {
E* mat_out,
uint32_t row_size,
uint32_t column_size,
device_context::DeviceContext& ctx,
const device_context::DeviceContext& ctx,
bool on_device,
bool is_async)
{

View File

@@ -77,6 +77,8 @@ FIELDS_CONFIG = {
COMMON_INCLUDES = [
'#include <cuda_runtime.h>',
'#include "gpu-utils/device_context.cuh"',
'#include "merkle-tree/merkle.cuh"',
'#include "matrix/matrix.cuh"'
]
WARN_TEXT = """\
@@ -114,10 +116,9 @@ if __name__ == "__main__":
includes.append('#include "msm/msm.cuh"')
if any(header.name.startswith("vec_ops") for header in headers):
includes.append('#include "vec_ops/vec_ops.cuh"')
if any(header.name.startswith("poseidon") for header in headers):
if any(header.name.startswith("poseidon.h") for header in headers):
includes.append('#include "poseidon/poseidon.cuh"')
includes.append('#include "poseidon/tree/merkle.cuh"')
if any(header.name.startswith("poseidon2") for header in headers):
if any(header.name.startswith("poseidon2.h") for header in headers):
includes.append('#include "poseidon2/poseidon2.cuh"')
contents = WARN_TEXT + INCLUDE_ONCE.format(curve.upper()) + "\n".join(includes) + "\n\n"
@@ -148,10 +149,9 @@ if __name__ == "__main__":
includes.append('#include "ntt/ntt.cuh"')
if any(header.name.startswith("vec_ops") for header in headers):
includes.append('#include "vec_ops/vec_ops.cuh"')
if any(header.name.startswith("poseidon") for header in headers):
if any(header.name.startswith("poseidon.h") for header in headers):
includes.append('#include "poseidon/poseidon.cuh"')
includes.append('#include "poseidon/tree/merkle.cuh"')
if any(header.name.startswith("poseidon2") for header in headers):
if any(header.name.startswith("poseidon2.h") for header in headers):
includes.append('#include "poseidon2/poseidon2.cuh"')
contents = WARN_TEXT + INCLUDE_ONCE.format(field.upper()) + "\n".join(includes) + "\n\n"

View File

@@ -1,94 +0,0 @@
package core
import (
"fmt"
"unsafe"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)
// PoseidonConfig holds the options for a Poseidon hash call.
//
// A pointer to this struct is handed to the C side as an opaque pointer (see
// PoseidonCheck), and GetDefaultPoseidonConfig initialises it positionally, so the
// field order must not be changed.
type PoseidonConfig struct {
// Ctx holds details related to the device such as its id and stream id.
// See DeviceContext (device_context::DeviceContext on the C++ side).
Ctx cr.DeviceContext
// areInputsOnDevice is set by PoseidonCheck from the input slice.
areInputsOnDevice bool
// areOutputsOnDevice is set by PoseidonCheck from the output slice.
areOutputsOnDevice bool
// InputIsAState: if true, the input is considered to be a states vector holding the
// preimages in aligned or not aligned format, and the memory under the input pointer
// is used for the states. If false, fresh states memory is allocated and the input is
// copied into it.
InputIsAState bool
// Aligned: if true, the input should already be aligned for the poseidon permutation.
//   - Aligned format:     [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
//   - Not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
Aligned bool
// LoopState: if true, hash results are also copied into the input pointer in aligned format.
LoopState bool
// IsAsync selects whether to run Poseidon asynchronously. If true, the poseidon_hash
// function is non-blocking and must be synchronized explicitly with
// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If false, poseidon_hash blocks
// the current CPU thread.
IsAsync bool
}
// PoseidonConstants describes a set of Poseidon constants for one arity.
//
// PoseidonCheck reads Arity to validate slice lengths. The pointer fields are
// opaque handles from Go's point of view.
// NOTE(review): presumably this mirrors the memory layout of the C++
// PoseidonConstants struct, since wrappers pass it to C via unsafe.Pointer -
// confirm before reordering or retyping fields.
type PoseidonConstants[T any] struct {
// Arity is the number of inputs hashed per state.
Arity int32
PartialRounds int32
FullRoundsHalf int32
RoundConstants unsafe.Pointer
MdsMatrix unsafe.Pointer
NonSparseMatrix unsafe.Pointer
SparseMatrices unsafe.Pointer
// DomainTag is a value of the scalar field type T.
DomainTag T
}
// GetDefaultPoseidonConfig returns a PoseidonConfig bound to the default device
// context with every flag switched off (host inputs and outputs, not a state,
// not aligned, no loop state, synchronous).
//
// The error returned by GetDefaultDeviceContext is discarded.
func GetDefaultPoseidonConfig() PoseidonConfig {
	defaultCtx, _ := cr.GetDefaultDeviceContext()

	return PoseidonConfig{
		Ctx:                defaultCtx,
		areInputsOnDevice:  false,
		areOutputsOnDevice: false,
		InputIsAState:      false,
		Aligned:            false,
		LoopState:          false,
		IsAsync:            false,
	}
}
// PoseidonCheck validates the slices passed to a Poseidon hash call and prepares
// the raw pointers for the C call.
//
// The input must hold exactly arity*numberOfStates elements, or
// (arity+1)*numberOfStates when cfg.InputIsAState is set; the output must hold
// exactly numberOfStates elements. It panics with a descriptive message otherwise.
//
// As a side effect it records in cfg whether input and output live on the device,
// and verifies that device slices belong to the current device.
//
// Returns the input pointer, the output pointer and cfg as an unsafe.Pointer.
func PoseidonCheck[T any](input, output HostOrDeviceSlice, cfg *PoseidonConfig, constants *PoseidonConstants[T], numberOfStates int) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer) {
	inputLen, outputLen := input.Len(), output.Len()
	arity := int(constants.Arity)
	expectedInputLen := arity * numberOfStates
	if cfg.InputIsAState {
		// A state carries one extra element per hash in addition to the arity inputs.
		expectedInputLen += numberOfStates
	}

	if inputLen != expectedInputLen {
		// Report the length that was actually required, including the state element.
		errorString := fmt.Sprintf(
			"input is not the right length for the given parameters: %d, should be: %d",
			inputLen,
			expectedInputLen,
		)
		panic(errorString)
	}

	if outputLen != numberOfStates {
		errorString := fmt.Sprintf(
			"output is not the right length for the given parameters: %d, should be: %d",
			outputLen,
			numberOfStates,
		)
		panic(errorString)
	}

	cfg.areInputsOnDevice = input.IsOnDevice()
	cfg.areOutputsOnDevice = output.IsOnDevice()

	if input.IsOnDevice() {
		input.(DeviceSlice).CheckDevice()
	}

	if output.IsOnDevice() {
		output.(DeviceSlice).CheckDevice()
	}

	cfgPointer := unsafe.Pointer(cfg)

	return input.AsUnsafePointer(), output.AsUnsafePointer(), cfgPointer
}

View File

@@ -0,0 +1,105 @@
package core
import (
"fmt"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
)
// SpongeConfig holds the options for a sponge hash call.
//
// GetDefaultSpongeConfig initialises it positionally, so the field order must not
// be changed.
// NOTE(review): presumably the layout also has to match the C++ SpongeConfig it is
// passed to - confirm before reordering or retyping fields.
type SpongeConfig struct {
// Ctx holds details related to the device such as its id and stream.
Ctx cr.DeviceContext
// areInputsOnDevice / areResultsOnDevice are unexported.
// NOTE(review): presumably filled in by the wrappers from the slices they receive - confirm.
areInputsOnDevice bool
areResultsOnDevice bool
// InputRate, OutputRate and Offset default to 0 in GetDefaultSpongeConfig.
// NOTE(review): their exact meaning is not visible here; SpongeInputCheck only requires
// the input block length not to exceed an input rate - confirm against the C++ SpongeConfig.
InputRate uint32
OutputRate uint32
Offset uint32
// RecursiveSqueeze: NOTE(review) semantics are not visible here. SpongeOutputsCheck takes a
// `recursive` flag that, when set, requires width*numberOfStates output elements instead of
// outputLen*numberOfStates - presumably this field; confirm.
RecursiveSqueeze bool
// Aligned: if true, the input should already be aligned for the poseidon permutation.
//   - Aligned format:     [0, A, B, 0, C, D, ...]
//   - Not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
Aligned bool
// IsAsync selects whether to run the SpongeHash asynchronously. If true, the SpongeHash
// function is non-blocking and must be synchronized explicitly with
// `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If false, the SpongeHash function
// blocks the current CPU thread.
IsAsync bool
}
// GetDefaultSpongeConfig returns a SpongeConfig bound to the default device
// context with all rates and the offset set to 0 and every flag switched off.
//
// The error returned by GetDefaultDeviceContext is discarded.
func GetDefaultSpongeConfig() SpongeConfig {
	defaultCtx, _ := cr.GetDefaultDeviceContext()

	return SpongeConfig{
		Ctx:                defaultCtx,
		areInputsOnDevice:  false,
		areResultsOnDevice: false,
		InputRate:          0,
		OutputRate:         0,
		Offset:             0,
		RecursiveSqueeze:   false,
		Aligned:            false,
		IsAsync:            false,
	}
}
// SpongeInputCheck validates the inputs of a sponge hash call.
//
// It panics if inputBlockLength exceeds inputRate, or if inputs holds fewer than
// inputBlockLength*numberOfStates elements. Device slices are additionally checked
// to belong to the current device. ctx is currently unused.
func SpongeInputCheck(inputs HostOrDeviceSlice, numberOfStates, inputBlockLength, inputRate uint32, ctx *cr.DeviceContext) {
	if inputBlockLength > inputRate {
		errorString := fmt.Sprintf(
			"Input block (%d) can't be greater than input rate (%d)",
			inputBlockLength,
			inputRate,
		)
		panic(errorString)
	}

	// Multiply in 64 bits: a uint32 product could wrap around and let an
	// undersized slice pass the check.
	inputsSizeExpected := uint64(inputBlockLength) * uint64(numberOfStates)
	if uint64(inputs.Len()) < inputsSizeExpected {
		errorString := fmt.Sprintf(
			"inputs len is %d; but needs to be at least %d",
			inputs.Len(),
			inputsSizeExpected,
		)
		panic(errorString)
	}

	if inputs.IsOnDevice() {
		inputs.(DeviceSlice).CheckDevice()
	}
}
// SpongeStatesCheck validates a device slice of sponge states.
//
// It panics if states holds fewer than width*numberOfStates elements, and checks
// that the slice belongs to the current device. ctx is currently unused.
func SpongeStatesCheck(states DeviceSlice, numberOfStates, width uint32, ctx *cr.DeviceContext) {
	// Multiply in 64 bits: a uint32 product could wrap around and let an
	// undersized slice pass the check.
	statesSizeExpected := uint64(width) * uint64(numberOfStates)
	if uint64(states.Len()) < statesSizeExpected {
		// Name the slice that is actually too short (states, not inputs).
		errorString := fmt.Sprintf(
			"states len is %d; but needs to be at least %d",
			states.Len(),
			statesSizeExpected,
		)
		panic(errorString)
	}
	states.CheckDevice()
}
// SpongeOutputsCheck validates the output slice of a sponge hash call.
//
// The slice must hold at least width*numberOfStates elements when recursive is
// true, and at least outputLen*numberOfStates elements otherwise; it panics if it
// is shorter. Device slices are additionally checked to belong to the current
// device. ctx is currently unused.
func SpongeOutputsCheck(outputs HostOrDeviceSlice, numberOfStates, outputLen, width uint32, recursive bool, ctx *cr.DeviceContext) {
	elementsPerState := outputLen
	if recursive {
		elementsPerState = width
	}

	// Multiply in 64 bits: a uint32 product could wrap around and let an
	// undersized slice pass the check.
	outputsSizeExpected := uint64(elementsPerState) * uint64(numberOfStates)
	if uint64(outputs.Len()) < outputsSizeExpected {
		errorString := fmt.Sprintf(
			"outputs len is %d; but needs to be at least %d",
			outputs.Len(),
			outputsSizeExpected,
		)
		panic(errorString)
	}
	if outputs.IsOnDevice() {
		outputs.(DeviceSlice).CheckDevice()
	}
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t bls12_377_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t bls12_377_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t bls12_377_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
cudaError_t bls12_377_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
cudaError_t bls12_377_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
cudaError_t bls12_377_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
cudaError_t bls12_377_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -3,55 +3,85 @@ package poseidon
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
type PoseidonHandler = C.struct_PoseidonInst
type Poseidon struct {
width uint32
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
// Create builds a Poseidon instance from caller-supplied parameters.
//
// scalars is forwarded as the native round_constants argument; mdsMatrix,
// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
// arguments, and domainTag is passed by pointer to a local copy.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bls12_377.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var poseidon *PoseidonHandler

	cArity := (C.uint)(arity)
	cAlpha := (C.uint)(alpha)
	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
	cPartialRounds := (C.uint)(partialRounds)
	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))

	// poseidon.h declares partial_rounds BEFORE full_rounds_half. Both are
	// C.uint, so the compiler cannot catch a swap; keep this order in sync
	// with the header.
	__ret := C.bls12_377_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
	err := core.FromCudaError((cr.CudaError)(__ret))
	if err.IcicleErrorCode != core.IcicleSuccess {
		return nil, err
	}

	p := Poseidon{handle: poseidon, width: arity + 1}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(&p, func(p *Poseidon) {
		p.Delete()
	})
	return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity,
// without caller-supplied constants.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.bls12_377_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)
	icicleErr := core.FromCudaError((cr.CudaError)(status))
	if icicleErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, icicleErr
	}

	instance := &Poseidon{width: arity + 1, handle: handle}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(instance, func(inst *Poseidon) {
		inst.Delete()
	})
	return instance, icicleErr
}
__ret := C.bls12_377_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native hash_many call for numberOfStates states, reading
// from inputs and writing to output.
//
// Before the native call it runs core.SpongeInputCheck on the inputs (using
// cfg.InputRate) and core.SpongeOutputsCheck on the output (using the
// instance width). The return value is the native status converted to a
// core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.bls12_377_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.bls12_377_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance held by this wrapper.
//
// It is safe to call more than once. Create and Load register a finalizer
// that also calls Delete, so an explicit Delete followed by garbage
// collection would otherwise hand the same pointer to the native delete call
// twice. After the first call the handle is cleared and later calls return
// success without touching native code.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Already released (or never created): nothing to free.
		return core.FromCudaError(cr.CudaSuccess)
	}
	__ret := C.bls12_377_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle whatever the status, so the pointer is never reused.
	poseidon.handle = nil
	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.bls12_377_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
// set from this instance: InputRate = width - 1 and OutputRate = width.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	spongeCfg := core.GetDefaultSpongeConfig()
	stateWidth := poseidon.width
	spongeCfg.InputRate = stateWidth - 1
	spongeCfg.OutputRate = stateWidth
	return spongeCfg
}

View File

@@ -7,6 +7,7 @@ import (
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bls12_377 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12377/poseidon"
"github.com/stretchr/testify/assert"
)
func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[bls12_377.ScalarField]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := bls12_377.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[bls12_377.ScalarField], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t bls12_381_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t bls12_381_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t bls12_381_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
// Creates a Poseidon instance from explicitly supplied parameters and stores
// the instance pointer in *poseidon.
// NOTE: partial_rounds comes BEFORE full_rounds_half in this signature; both
// are unsigned int, so a swapped call still compiles.
cudaError_t bls12_381_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
// Creates a Poseidon instance for the given arity without caller-supplied
// constants (presumably from built-in constants; confirm against the
// implementation) and stores the instance pointer in *poseidon.
cudaError_t bls12_381_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
// Hashes number_of_states inputs with the given instance, reading from inputs
// and writing to output. The units of input_block_len / output_len are not
// visible here (presumably field elements per state; TODO confirm).
cudaError_t bls12_381_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
// Destroys an instance obtained from the create/load functions above.
cudaError_t bls12_381_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -3,55 +3,85 @@ package poseidon
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bls12_381 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
// PoseidonHandler aliases the opaque C struct PoseidonInst declared in poseidon.h.
type PoseidonHandler = C.struct_PoseidonInst
// Poseidon pairs a native Poseidon instance with the state width it was built for.
type Poseidon struct {
// width is set to arity + 1 by Create and Load.
width uint32
// handle is the instance pointer written by the native create/load call; it is
// passed to the native hash and delete calls.
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
// Create builds a Poseidon instance from caller-supplied parameters.
//
// scalars is forwarded as the native round_constants argument; mdsMatrix,
// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
// arguments, and domainTag is passed by pointer to a local copy.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bls12_381.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var poseidon *PoseidonHandler

	cArity := (C.uint)(arity)
	cAlpha := (C.uint)(alpha)
	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
	cPartialRounds := (C.uint)(partialRounds)
	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))

	// poseidon.h declares partial_rounds BEFORE full_rounds_half. Both are
	// C.uint, so the compiler cannot catch a swap; keep this order in sync
	// with the header.
	__ret := C.bls12_381_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
	err := core.FromCudaError((cr.CudaError)(__ret))
	if err.IcicleErrorCode != core.IcicleSuccess {
		return nil, err
	}

	p := Poseidon{handle: poseidon, width: arity + 1}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(&p, func(p *Poseidon) {
		p.Delete()
	})
	return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity,
// without caller-supplied constants.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.bls12_381_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)
	icicleErr := core.FromCudaError((cr.CudaError)(status))
	if icicleErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, icicleErr
	}

	instance := &Poseidon{width: arity + 1, handle: handle}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(instance, func(inst *Poseidon) {
		inst.Delete()
	})
	return instance, icicleErr
}
__ret := C.bls12_381_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native hash_many call for numberOfStates states, reading
// from inputs and writing to output.
//
// Before the native call it runs core.SpongeInputCheck on the inputs (using
// cfg.InputRate) and core.SpongeOutputsCheck on the output (using the
// instance width). The return value is the native status converted to a
// core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.bls12_381_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.bls12_381_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance held by this wrapper.
//
// It is safe to call more than once. Create and Load register a finalizer
// that also calls Delete, so an explicit Delete followed by garbage
// collection would otherwise hand the same pointer to the native delete call
// twice. After the first call the handle is cleared and later calls return
// success without touching native code.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Already released (or never created): nothing to free.
		return core.FromCudaError(cr.CudaSuccess)
	}
	__ret := C.bls12_381_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle whatever the status, so the pointer is never reused.
	poseidon.handle = nil
	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.bls12_381_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
// set from this instance: InputRate = width - 1 and OutputRate = width.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	spongeCfg := core.GetDefaultSpongeConfig()
	stateWidth := poseidon.width
	spongeCfg.InputRate = stateWidth - 1
	spongeCfg.OutputRate = stateWidth
	return spongeCfg
}

View File

@@ -7,29 +7,19 @@ import (
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bls12_381 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bls12381/poseidon"
"fmt"
"github.com/stretchr/testify/assert"
)
func formatOutput(x bls12_381.ScalarField) string {
r := x.GetLimbs()
return fmt.Sprintf("%08x%08x%08x%08x%08x%08x%08x%08x", r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0])
}
func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[bls12_381.ScalarField]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := bls12_381.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -38,18 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[bls12_381.ScalarField], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
expectedString := "48fe0b1331196f6cdb33a7c6e5af61b76fd388e1ef1d3d418be5147f0e4613d4" //This result is from https://github.com/triplewz/poseidon
outputString := formatOutput(output[0])
assert.Equal(t, outputString, expectedString, "Poseidon hash does not match expected result")
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t bn254_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t bn254_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t bn254_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
// Creates a Poseidon instance from explicitly supplied parameters and stores
// the instance pointer in *poseidon.
// NOTE: partial_rounds comes BEFORE full_rounds_half in this signature; both
// are unsigned int, so a swapped call still compiles.
cudaError_t bn254_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
// Creates a Poseidon instance for the given arity without caller-supplied
// constants (presumably from built-in constants; confirm against the
// implementation) and stores the instance pointer in *poseidon.
cudaError_t bn254_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
// Hashes number_of_states inputs with the given instance, reading from inputs
// and writing to output. The units of input_block_len / output_len are not
// visible here (presumably field elements per state; TODO confirm).
cudaError_t bn254_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
// Destroys an instance obtained from the create/load functions above.
cudaError_t bn254_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -3,55 +3,85 @@ package poseidon
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
// PoseidonHandler aliases the opaque C struct PoseidonInst declared in poseidon.h.
type PoseidonHandler = C.struct_PoseidonInst
// Poseidon pairs a native Poseidon instance with the state width it was built for.
type Poseidon struct {
// width is set to arity + 1 by Create and Load.
width uint32
// handle is the instance pointer written by the native create/load call; it is
// passed to the native hash and delete calls.
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
// Create builds a Poseidon instance from caller-supplied parameters.
//
// scalars is forwarded as the native round_constants argument; mdsMatrix,
// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
// arguments, and domainTag is passed by pointer to a local copy.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bn254.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var poseidon *PoseidonHandler

	cArity := (C.uint)(arity)
	cAlpha := (C.uint)(alpha)
	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
	cPartialRounds := (C.uint)(partialRounds)
	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))

	// poseidon.h declares partial_rounds BEFORE full_rounds_half. Both are
	// C.uint, so the compiler cannot catch a swap; keep this order in sync
	// with the header.
	__ret := C.bn254_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
	err := core.FromCudaError((cr.CudaError)(__ret))
	if err.IcicleErrorCode != core.IcicleSuccess {
		return nil, err
	}

	p := Poseidon{handle: poseidon, width: arity + 1}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(&p, func(p *Poseidon) {
		p.Delete()
	})
	return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity,
// without caller-supplied constants.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.bn254_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)
	icicleErr := core.FromCudaError((cr.CudaError)(status))
	if icicleErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, icicleErr
	}

	instance := &Poseidon{width: arity + 1, handle: handle}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(instance, func(inst *Poseidon) {
		inst.Delete()
	})
	return instance, icicleErr
}
__ret := C.bn254_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native hash_many call for numberOfStates states, reading
// from inputs and writing to output.
//
// Before the native call it runs core.SpongeInputCheck on the inputs (using
// cfg.InputRate) and core.SpongeOutputsCheck on the output (using the
// instance width). The return value is the native status converted to a
// core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.bn254_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.bn254_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance held by this wrapper.
//
// It is safe to call more than once. Create and Load register a finalizer
// that also calls Delete, so an explicit Delete followed by garbage
// collection would otherwise hand the same pointer to the native delete call
// twice. After the first call the handle is cleared and later calls return
// success without touching native code.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Already released (or never created): nothing to free.
		return core.FromCudaError(cr.CudaSuccess)
	}
	__ret := C.bn254_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle whatever the status, so the pointer is never reused.
	poseidon.handle = nil
	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.bn254_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
// set from this instance: InputRate = width - 1 and OutputRate = width.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	spongeCfg := core.GetDefaultSpongeConfig()
	stateWidth := poseidon.width
	spongeCfg.InputRate = stateWidth - 1
	spongeCfg.OutputRate = stateWidth
	return spongeCfg
}

View File

@@ -7,6 +7,7 @@ import (
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/poseidon"
"github.com/stretchr/testify/assert"
)
func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[bn254.ScalarField]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := bn254.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[bn254.ScalarField], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t bw6_761_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t bw6_761_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t bw6_761_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
// Creates a Poseidon instance from explicitly supplied parameters and stores
// the instance pointer in *poseidon.
// NOTE: partial_rounds comes BEFORE full_rounds_half in this signature; both
// are unsigned int, so a swapped call still compiles.
cudaError_t bw6_761_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
// Creates a Poseidon instance for the given arity without caller-supplied
// constants (presumably from built-in constants; confirm against the
// implementation) and stores the instance pointer in *poseidon.
cudaError_t bw6_761_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
// Hashes number_of_states inputs with the given instance, reading from inputs
// and writing to output. The units of input_block_len / output_len are not
// visible here (presumably field elements per state; TODO confirm).
cudaError_t bw6_761_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
// Destroys an instance obtained from the create/load functions above.
cudaError_t bw6_761_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -3,55 +3,85 @@ package poseidon
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bw6_761 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
// PoseidonHandler aliases the opaque C struct PoseidonInst declared in poseidon.h.
type PoseidonHandler = C.struct_PoseidonInst
// Poseidon pairs a native Poseidon instance with the state width it was built for.
type Poseidon struct {
// width is set to arity + 1 by Create and Load.
width uint32
// handle is the instance pointer written by the native create/load call; it is
// passed to the native hash and delete calls.
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
// Create builds a Poseidon instance from caller-supplied parameters.
//
// scalars is forwarded as the native round_constants argument; mdsMatrix,
// nonSparseMatrix and sparseMatrices are forwarded as the matching matrix
// arguments, and domainTag is passed by pointer to a local copy.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag bw6_761.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var poseidon *PoseidonHandler

	cArity := (C.uint)(arity)
	cAlpha := (C.uint)(alpha)
	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
	cPartialRounds := (C.uint)(partialRounds)
	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))

	// poseidon.h declares partial_rounds BEFORE full_rounds_half. Both are
	// C.uint, so the compiler cannot catch a swap; keep this order in sync
	// with the header.
	__ret := C.bw6_761_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
	err := core.FromCudaError((cr.CudaError)(__ret))
	if err.IcicleErrorCode != core.IcicleSuccess {
		return nil, err
	}

	p := Poseidon{handle: poseidon, width: arity + 1}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(&p, func(p *Poseidon) {
		p.Delete()
	})
	return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity,
// without caller-supplied constants.
//
// On success the returned *Poseidon has width arity+1 and carries a finalizer
// that calls Delete. On failure it returns nil and the converted CUDA error.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.bw6_761_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)
	icicleErr := core.FromCudaError((cr.CudaError)(status))
	if icicleErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, icicleErr
	}

	instance := &Poseidon{width: arity + 1, handle: handle}
	// Release the native instance when the Go wrapper is garbage collected.
	runtime.SetFinalizer(instance, func(inst *Poseidon) {
		inst.Delete()
	})
	return instance, icicleErr
}
__ret := C.bw6_761_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native hash_many call for numberOfStates states, reading
// from inputs and writing to output.
//
// Before the native call it runs core.SpongeInputCheck on the inputs (using
// cfg.InputRate) and core.SpongeOutputsCheck on the output (using the
// instance width). The return value is the native status converted to a
// core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.bw6_761_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.bw6_761_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance held by this wrapper.
//
// It is safe to call more than once. Create and Load register a finalizer
// that also calls Delete, so an explicit Delete followed by garbage
// collection would otherwise hand the same pointer to the native delete call
// twice. After the first call the handle is cleared and later calls return
// success without touching native code.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Already released (or never created): nothing to free.
		return core.FromCudaError(cr.CudaSuccess)
	}
	__ret := C.bw6_761_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle whatever the status, so the pointer is never reused.
	poseidon.handle = nil
	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.bw6_761_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns core.GetDefaultSpongeConfig() with the rates
// set from this instance: InputRate = width - 1 and OutputRate = width.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	spongeCfg := core.GetDefaultSpongeConfig()
	stateWidth := poseidon.width
	spongeCfg.InputRate = stateWidth - 1
	spongeCfg.OutputRate = stateWidth
	return spongeCfg
}

View File

@@ -7,6 +7,7 @@ import (
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bw6_761 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bw6761/poseidon"
"github.com/stretchr/testify/assert"
)
func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[bw6_761.ScalarField]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := bw6_761.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[bw6_761.ScalarField], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t grumpkin_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t grumpkin_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t grumpkin_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
cudaError_t grumpkin_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
cudaError_t grumpkin_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
cudaError_t grumpkin_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
cudaError_t grumpkin_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -3,55 +3,85 @@ package poseidon
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
grumpkin "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
type PoseidonHandler = C.struct_PoseidonInst
// Poseidon is the Go-side wrapper around a native Poseidon hasher instance.
// Values are obtained from Create or Load, which also register a finalizer
// that calls Delete.
type Poseidon struct {
// width is the state width; Create and Load set it to arity + 1.
width uint32
// handle points at the native PoseidonInst filled in by the C create/load call.
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
// Create builds a native Poseidon instance from caller-supplied parameters and
// wraps it in a *Poseidon.
//
// Parameters:
//   - arity: number of inputs per hash; the wrapper's state width is arity + 1.
//   - alpha, fullRoundsHalf, partialRounds: forwarded unchanged to the native
//     constructor.
//   - scalars: round constants (forwarded as round_constants).
//   - mdsMatrix, nonSparseMatrix, sparseMatrices: matrix constants, forwarded
//     as raw pointers.
//   - domainTag: passed by pointer to the native call.
//   - ctx: device context to create the instance on.
//
// On failure nil is returned together with the translated error. On success a
// finalizer is attached that calls Delete when the wrapper is collected.
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag grumpkin.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var poseidon *PoseidonHandler

	cArity := (C.uint)(arity)
	cAlpha := (C.uint)(alpha)
	cFullRoundsHalf := (C.uint)(fullRoundsHalf)
	cPartialRounds := (C.uint)(partialRounds)
	cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
	cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
	cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
	cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
	cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
	cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))

	// poseidon.h declares partial_rounds BEFORE full_rounds_half, which is the
	// opposite of this function's Go parameter order. Both are C.uint, so the
	// compiler cannot catch a swap; pass them in the header's order.
	__ret := C.grumpkin_poseidon_create_cuda(&poseidon, cArity, cAlpha, cPartialRounds, cFullRoundsHalf, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
	err := core.FromCudaError((cr.CudaError)(__ret))
	if err.IcicleErrorCode != core.IcicleSuccess {
		return nil, err
	}

	p := Poseidon{handle: poseidon, width: arity + 1}
	// Release the native instance when the wrapper becomes unreachable.
	runtime.SetFinalizer(&p, func(p *Poseidon) {
		p.Delete()
	})
	return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity on
// the supplied device context and wraps it in a *Poseidon.
//
// On failure nil is returned together with the translated error. On success
// the wrapper's state width is arity + 1 and a finalizer is attached that
// calls Delete when the wrapper is collected.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.grumpkin_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)

	loadErr := core.FromCudaError((cr.CudaError)(status))
	if loadErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, loadErr
	}

	instance := &Poseidon{handle: handle, width: arity + 1}
	runtime.SetFinalizer(instance, func(released *Poseidon) {
		released.Delete()
	})
	return instance, loadErr
}
__ret := C.grumpkin_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native Poseidon hash over numberOfStates inputs.
//
// Before the native call, the inputs are passed through core.SpongeInputCheck
// (with cfg.InputRate) and the output through core.SpongeOutputsCheck (with
// the instance's state width and the recursive flag set to false). The raw
// slice pointers, the three counts and cfg are then forwarded to the native
// hash_many function, and its status is translated into a core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.grumpkin_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.grumpkin_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance referenced by this wrapper.
//
// Create and Load register a finalizer that calls Delete, so a caller who also
// calls Delete explicitly would otherwise free the same native pointer twice.
// The handle is therefore cleared once the native delete has been issued, and
// calling Delete on an already released wrapper is a no-op that reports
// success.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Nothing left to release; 0 is cudaSuccess.
		return core.FromCudaError(cr.CudaError(0))
	}

	__ret := C.grumpkin_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle regardless of the returned status: the native object
	// must not be deleted a second time.
	poseidon.handle = nil

	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.grumpkin_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns the library's default sponge configuration
// with the rates adjusted to this instance: the output rate equals the state
// width and the input rate is one element smaller.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	stateWidth := poseidon.width

	spongeCfg := core.GetDefaultSpongeConfig()
	spongeCfg.OutputRate = stateWidth
	spongeCfg.InputRate = stateWidth - 1
	return spongeCfg
}

View File

@@ -7,6 +7,7 @@ import (
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
grumpkin "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/grumpkin/poseidon"
"github.com/stretchr/testify/assert"
)
func TestPoseidon(t *testing.T) {
@@ -14,14 +15,11 @@ func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[grumpkin.ScalarField]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := grumpkin.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -30,13 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[grumpkin.ScalarField], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -3,55 +3,85 @@ package {{.PackageName}}
// #cgo CFLAGS: -I./include/
// #include "poseidon.h"
import "C"
import (
"runtime"
"unsafe"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
{{.Field}} "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}"
)
func GetDefaultPoseidonConfig() core.PoseidonConfig {
return core.GetDefaultPoseidonConfig()
type PoseidonHandler = C.struct_PoseidonInst
// Poseidon is the Go-side wrapper around a native Poseidon hasher instance.
// Values are obtained from Create or Load, which also register a finalizer
// that calls Delete.
type Poseidon struct {
// width is the state width; Create and Load set it to arity + 1.
width uint32
// handle points at the native PoseidonInst filled in by the C create/load call.
handle *PoseidonHandler
}
func PoseidonHash[T any](scalars, results core.HostOrDeviceSlice, numberOfStates int, cfg *core.PoseidonConfig, constants *core.PoseidonConstants[T]) core.IcicleError {
scalarsPointer, resultsPointer, cfgPointer := core.PoseidonCheck(scalars, results, cfg, constants, numberOfStates)
func Create(arity uint32, alpha uint32, fullRoundsHalf uint32, partialRounds uint32, scalars core.HostOrDeviceSlice, mdsMatrix core.HostOrDeviceSlice, nonSparseMatrix core.HostOrDeviceSlice, sparseMatrices core.HostOrDeviceSlice, domainTag {{.Field}}.ScalarField, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
var poseidon *PoseidonHandler
cArity := (C.uint)(arity)
cAlpha := (C.uint)(alpha)
cFullRoundsHalf := (C.uint)(fullRoundsHalf)
cPartialRounds := (C.uint)(partialRounds)
cScalars := (*C.scalar_t)(scalars.AsUnsafePointer())
cMdsMatrix := (*C.scalar_t)(mdsMatrix.AsUnsafePointer())
cNonSparseMatrix := (*C.scalar_t)(nonSparseMatrix.AsUnsafePointer())
cSparseMatrices := (*C.scalar_t)(sparseMatrices.AsUnsafePointer())
cDomainTag := (*C.scalar_t)(unsafe.Pointer(&domainTag))
cCtx := (*C.DeviceContext)(unsafe.Pointer(ctx))
__ret := C.{{.Field}}_poseidon_create_cuda(&poseidon, cArity, cAlpha, cFullRoundsHalf, cPartialRounds, cScalars, cMdsMatrix, cNonSparseMatrix, cSparseMatrices, cDomainTag, cCtx)
err := core.FromCudaError((cr.CudaError)(__ret))
if err.IcicleErrorCode != core.IcicleSuccess {
return nil, err
}
p := Poseidon{handle: poseidon, width: arity + 1}
runtime.SetFinalizer(&p, func(p *Poseidon) {
p.Delete()
})
return &p, err
}
cScalars := (*C.scalar_t)(scalarsPointer)
cResults := (*C.scalar_t)(resultsPointer)
cNumberOfStates := (C.int)(numberOfStates)
cArity := (C.int)(constants.Arity)
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
cCfg := (*C.PoseidonConfig)(cfgPointer)
// Load asks the native library for a Poseidon instance of the given arity on
// the supplied device context and wraps it in a *Poseidon.
//
// On failure nil is returned together with the translated error. On success
// the wrapper's state width is arity + 1 and a finalizer is attached that
// calls Delete when the wrapper is collected.
func Load(arity uint32, ctx *cr.DeviceContext) (*Poseidon, core.IcicleError) {
	var handle *PoseidonHandler

	status := C.{{.Field}}_poseidon_load_cuda(
		&handle,
		(C.uint)(arity),
		(*C.DeviceContext)(unsafe.Pointer(ctx)),
	)

	loadErr := core.FromCudaError((cr.CudaError)(status))
	if loadErr.IcicleErrorCode != core.IcicleSuccess {
		return nil, loadErr
	}

	instance := &Poseidon{handle: handle, width: arity + 1}
	runtime.SetFinalizer(instance, func(released *Poseidon) {
		released.Delete()
	})
	return instance, loadErr
}
__ret := C.{{.Field}}_poseidon_hash_cuda(cScalars, cResults, cNumberOfStates, cArity, cConstants, cCfg)
// HashMany runs the native Poseidon hash over numberOfStates inputs.
//
// Before the native call, the inputs are passed through core.SpongeInputCheck
// (with cfg.InputRate) and the output through core.SpongeOutputsCheck (with
// the instance's state width and the recursive flag set to false). The raw
// slice pointers, the three counts and cfg are then forwarded to the native
// hash_many function, and its status is translated into a core.IcicleError.
func (poseidon *Poseidon) HashMany(inputs core.HostOrDeviceSlice, output core.HostOrDeviceSlice, numberOfStates uint32, inputBlockLen uint32, outputLen uint32, cfg *core.SpongeConfig) core.IcicleError {
	deviceCtx := &cfg.Ctx
	core.SpongeInputCheck(inputs, numberOfStates, inputBlockLen, cfg.InputRate, deviceCtx)
	core.SpongeOutputsCheck(output, numberOfStates, outputLen, poseidon.width, false, deviceCtx)

	status := C.{{.Field}}_poseidon_hash_many_cuda(
		poseidon.handle,
		(*C.scalar_t)(inputs.AsUnsafePointer()),
		(*C.scalar_t)(output.AsUnsafePointer()),
		(C.uint)(numberOfStates),
		(C.uint)(inputBlockLen),
		(C.uint)(outputLen),
		(*C.SpongeConfig)(unsafe.Pointer(cfg)),
	)
	return core.FromCudaError((cr.CudaError)(status))
}
func CreateOptimizedPoseidonConstants[T any](arity, fullRoundsHalfs, partialRounds int, constants core.HostOrDeviceSlice, ctx cr.DeviceContext, poseidonConstants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cFullRoundsHalfs := (C.int)(fullRoundsHalfs)
cPartialRounds := (C.int)(partialRounds)
cConstants := (*C.scalar_t)(constants.AsUnsafePointer())
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cPoseidonConstants := (*C.PoseidonConstants)(unsafe.Pointer(poseidonConstants))
__ret := C.{{.Field}}_create_optimized_poseidon_constants_cuda(cArity, cFullRoundsHalfs, cPartialRounds, cConstants, cCtx, cPoseidonConstants)
// Delete releases the native Poseidon instance referenced by this wrapper.
//
// Create and Load register a finalizer that calls Delete, so a caller who also
// calls Delete explicitly would otherwise free the same native pointer twice.
// The handle is therefore cleared once the native delete has been issued, and
// calling Delete on an already released wrapper is a no-op that reports
// success.
func (poseidon *Poseidon) Delete() core.IcicleError {
	if poseidon.handle == nil {
		// Nothing left to release; 0 is cudaSuccess.
		return core.FromCudaError(cr.CudaError(0))
	}

	__ret := C.{{.Field}}_poseidon_delete_cuda(poseidon.handle)
	// Clear the handle regardless of the returned status: the native object
	// must not be deleted a second time.
	poseidon.handle = nil

	err := (cr.CudaError)(__ret)
	return core.FromCudaError(err)
}
func InitOptimizedPoseidonConstantsCuda[T any](arity int, ctx cr.DeviceContext, constants *core.PoseidonConstants[T]) core.IcicleError {
cArity := (C.int)(arity)
cCtx := (*C.DeviceContext)(unsafe.Pointer(&ctx))
cConstants := (*C.PoseidonConstants)(unsafe.Pointer(constants))
__ret := C.{{.Field}}_init_optimized_poseidon_constants_cuda(cArity, cCtx, cConstants)
err := (cr.CudaError)(__ret)
return core.FromCudaError(err)
// GetDefaultSpongeConfig returns the library's default sponge configuration
// with the rates adjusted to this instance: the output rate equals the state
// width and the input rate is one element smaller.
func (poseidon *Poseidon) GetDefaultSpongeConfig() core.SpongeConfig {
	stateWidth := poseidon.width

	spongeCfg := core.GetDefaultSpongeConfig()
	spongeCfg.OutputRate = stateWidth
	spongeCfg.InputRate = stateWidth - 1
	return spongeCfg
}

View File

@@ -9,14 +9,40 @@ extern "C" {
#endif
typedef struct scalar_t scalar_t;
typedef struct PoseidonConfig PoseidonConfig;
typedef struct DeviceContext DeviceContext;
typedef struct PoseidonConstants PoseidonConstants;
typedef struct TreeBuilderConfig TreeBuilderConfig;
typedef struct PoseidonInst PoseidonInst;
typedef struct SpongeConfig SpongeConfig;
cudaError_t {{.Field}}_poseidon_hash_cuda(const scalar_t* input, scalar_t* output, int number_of_states, int arity, PoseidonConstants* constants, PoseidonConfig* config);
cudaError_t {{.Field}}_create_optimized_poseidon_constants_cuda(int arity, int full_rounds_halfs, int partial_rounds, const scalar_t* constants, DeviceContext* ctx, PoseidonConstants* poseidon_constants);
cudaError_t {{.Field}}_init_optimized_poseidon_constants_cuda(int arity, DeviceContext* ctx, PoseidonConstants* constants);
cudaError_t {{.Field}}_poseidon_create_cuda(
PoseidonInst** poseidon,
unsigned int arity,
unsigned int alpha,
unsigned int partial_rounds,
unsigned int full_rounds_half,
const scalar_t* round_constants,
const scalar_t* mds_matrix,
const scalar_t* non_sparse_matrix,
const scalar_t* sparse_matrices,
const scalar_t* domain_tag,
DeviceContext* ctx);
cudaError_t {{.Field}}_poseidon_load_cuda(
PoseidonInst** poseidon,
unsigned int arity,
DeviceContext* ctx);
cudaError_t {{.Field}}_poseidon_hash_many_cuda(
const PoseidonInst* poseidon,
const scalar_t* inputs,
scalar_t* output,
unsigned int number_of_states,
unsigned int input_block_len,
unsigned int output_len,
SpongeConfig* cfg);
cudaError_t {{.Field}}_poseidon_delete_cuda(PoseidonInst* poseidon);
#ifdef __cplusplus
}

View File

@@ -2,37 +2,24 @@ package tests
import (
"testing"
core "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
{{.Field}} "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}"
poseidon "github.com/ingonyama-zk/icicle/v2/wrappers/golang/{{.BaseImportPath}}/poseidon"
{{if eq .Field "bls12_381"}}
"fmt"
"github.com/stretchr/testify/assert"
{{end}}
)
{{if eq .Field "bls12_381"}}
func formatOutput(x {{.Field}}.{{.FieldPrefix}}Field) string {
r := x.GetLimbs()
return fmt.Sprintf("%08x%08x%08x%08x%08x%08x%08x%08x", r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0])
}
{{end}}
func TestPoseidon(t *testing.T) {
arity := 2
numberOfStates := 1
cfg := poseidon.GetDefaultPoseidonConfig()
cfg.IsAsync = true
stream, _ := cr.CreateStream()
cfg.Ctx.Stream = &stream
ctx, _ := cr.GetDefaultDeviceContext()
p, err := poseidon.Load(uint32(arity), &ctx)
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
var constants core.PoseidonConstants[{{.Field}}.{{.FieldPrefix}}Field]
poseidon.InitOptimizedPoseidonConstantsCuda(arity, cfg.Ctx, &constants) //generate constants
cfg := p.GetDefaultSpongeConfig()
scalars := {{.Field}}.GenerateScalars(numberOfStates * arity)
scalars[0] = scalars[0].Zero()
@@ -41,19 +28,13 @@ func TestPoseidon(t *testing.T) {
scalarsCopy := core.HostSliceFromElements(scalars[:numberOfStates*arity])
var deviceInput core.DeviceSlice
scalarsCopy.CopyToDeviceAsync(&deviceInput, stream, true)
scalarsCopy.CopyToDevice(&deviceInput, true)
var deviceOutput core.DeviceSlice
deviceOutput.MallocAsync(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement(), stream)
deviceOutput.Malloc(numberOfStates*scalarsCopy.SizeOfElement(), scalarsCopy.SizeOfElement())
poseidon.PoseidonHash(deviceInput, deviceOutput, numberOfStates, &cfg, &constants) //run Hash function
err = p.HashMany(deviceInput, deviceOutput, uint32(numberOfStates), 1, 1, &cfg) //run Hash function
assert.Equal(t, core.IcicleSuccess, err.IcicleErrorCode)
output := make(core.HostSlice[{{.Field}}.{{.FieldPrefix}}Field], numberOfStates)
output.CopyFromDeviceAsync(&deviceOutput, stream)
{{if eq .Field "bls12_381"}}
expectedString := "48fe0b1331196f6cdb33a7c6e5af61b76fd388e1ef1d3d418be5147f0e4613d4" //This result is from https://github.com/triplewz/poseidon
outputString := formatOutput(output[0])
assert.Equal(t, outputString, expectedString, "Poseidon hash does not match expected result")
{{end}}
output := make(core.HostSlice[{{.Field}}.ScalarField], numberOfStates)
output.CopyFromDevice(&deviceOutput)
}

View File

@@ -0,0 +1,136 @@
use std::ffi::c_void;
use icicle_cuda_runtime::{
device::check_device,
device_context::{DeviceContext, DEFAULT_DEVICE_ID},
memory::HostOrDeviceSlice,
};
use crate::ntt::IcicleResult;
/// Configuration passed to sponge-based hash functions.
///
/// The struct is `#[repr(C)]` and is handed to the native library by reference,
/// so the field order and types must stay in sync with the native definition.
#[repr(C)]
#[derive(Debug, Clone)]
pub struct SpongeConfig<'a> {
/// Details related to the device such as its id and stream id. See [`DeviceContext`].
pub ctx: DeviceContext<'a>,
/// Crate-private flag; the Poseidon `hash_many` wrapper overwrites it with
/// `inputs.is_on_device()` before calling into the native library.
pub(crate) are_inputs_on_device: bool,
/// Crate-private flag; the Poseidon `hash_many` wrapper overwrites it with
/// `output.is_on_device()` before calling into the native library.
pub(crate) are_outputs_on_device: bool,
/// Input rate of the sponge. Defaults to 0; `Poseidon::default_config` sets it to `width - 1`.
pub input_rate: u32,
/// Output rate of the sponge. Defaults to 0; `Poseidon::default_config` sets it to `width`.
pub output_rate: u32,
/// Defaults to 0.
/// NOTE(review): the meaning of this offset is not visible from the Rust side; confirm against the native `SpongeConfig`.
pub offset: u32,
/// Defaults to `false`.
/// NOTE(review): the comment previously attached to this field described input alignment, which
/// appears to belong to `aligned`; the exact semantics of `recursive_squeeze` should be confirmed
/// against the native `SpongeConfig`.
pub recursive_squeeze: bool,
/// If true - input should be already aligned for poseidon permutation.
/// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
/// not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
/// NOTE(review): this field previously carried the text "If true, hash results will also be copied
/// in the input pointer in aligned format", which documented the old `loop_state` option; confirm
/// which description is intended here.
pub aligned: bool,
/// Whether to run the sponge operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
/// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
pub is_async: bool,
}
impl<'a> Default for SpongeConfig<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
}
}
impl<'a> SpongeConfig<'a> {
    /// Builds a configuration for `device_id` with every flag cleared and
    /// every rate/offset set to zero; the device context is the default one
    /// for that device.
    pub(crate) fn default_for_device(device_id: usize) -> Self {
        let ctx = DeviceContext::default_for_device(device_id);
        Self {
            ctx,
            are_inputs_on_device: false,
            are_outputs_on_device: false,
            input_rate: 0,
            output_rate: 0,
            offset: 0,
            recursive_squeeze: false,
            aligned: false,
            is_async: false,
        }
    }
}
/// Common interface of sponge-based hashers that map `PreImage` elements to `Image` elements.
pub trait SpongeHash<PreImage, Image> {
/// Hashes `number_of_states` inputs in one call.
///
/// * `inputs` - host or device slice holding the preimages.
/// * `output` - host or device slice receiving the results.
/// * `number_of_states` - how many inputs are hashed.
/// * `input_block_len` - number of elements taken from `inputs` per state.
/// * `output_len` - number of elements written to `output` per state.
/// * `cfg` - sponge configuration.
///
/// The Poseidon implementation requires `inputs` to hold at least
/// `input_block_len * number_of_states` elements and `output` at least
/// `output_len * number_of_states`, and panics otherwise.
fn hash_many(
&self,
inputs: &(impl HostOrDeviceSlice<PreImage> + ?Sized),
output: &mut (impl HostOrDeviceSlice<Image> + ?Sized),
number_of_states: usize,
input_block_len: usize,
output_len: usize,
cfg: &SpongeConfig,
) -> IcicleResult<()>;
/// Returns a configuration with defaults suitable for this hasher instance.
fn default_config<'a>(&self) -> SpongeConfig<'a>;
/// Returns the raw pointer to the underlying native hasher object.
fn get_handle(&self) -> *const c_void;
}
/// Validates the input side of a sponge hash call.
///
/// Checks, in order, that `input_block_len` does not exceed `input_rate`, that
/// `inputs` holds at least `input_block_len * number_of_states` elements, and
/// that a device-resident `inputs` lives on the device named by `ctx`. It then
/// calls `check_device` for the context's device id.
///
/// # Panics
///
/// Panics if any of the checks above fails, or if the required element count
/// does not fit in `usize`.
pub(crate) fn sponge_check_input<T>(
    inputs: &(impl HostOrDeviceSlice<T> + ?Sized),
    number_of_states: usize,
    input_block_len: usize,
    input_rate: usize,
    ctx: &DeviceContext,
) {
    if input_block_len > input_rate {
        panic!(
            "input block len ({}) can't be greater than input rate ({})",
            input_block_len, input_rate
        );
    }

    // A wrapping multiplication would yield a small "expected" size and let an
    // undersized buffer through, so overflow is treated as a validation error.
    let inputs_size_expected = input_block_len
        .checked_mul(number_of_states)
        .unwrap_or_else(|| {
            panic!(
                "input block len ({}) times number of states ({}) overflows usize",
                input_block_len, number_of_states
            )
        });
    if inputs.len() < inputs_size_expected {
        panic!(
            "inputs len is {}; but needs to be at least {}",
            inputs.len(),
            inputs_size_expected,
        );
    }

    let ctx_device_id = ctx.device_id;
    // Host slices report no device id and are accepted for any context.
    if let Some(device_id) = inputs.device_id() {
        assert_eq!(
            device_id, ctx_device_id,
            "Device ids in inputs and context are different"
        );
    }
    check_device(ctx_device_id);
}
/// Validates the output side of a sponge hash call.
///
/// The required number of elements is `width * number_of_states` when
/// `recursive` is true and `output_len * number_of_states` otherwise. The
/// function checks that `outputs` is at least that long and that a
/// device-resident `outputs` lives on the device named by `ctx`. It then calls
/// `check_device` for the context's device id.
///
/// # Panics
///
/// Panics if any of the checks above fails, or if the required element count
/// does not fit in `usize`.
pub(crate) fn sponge_check_outputs<T>(
    outputs: &(impl HostOrDeviceSlice<T> + ?Sized),
    number_of_states: usize,
    output_len: usize,
    width: usize,
    recursive: bool,
    ctx: &DeviceContext,
) {
    let per_state_len = if recursive { width } else { output_len };

    // A wrapping multiplication would yield a small "expected" size and let an
    // undersized buffer through, so overflow is treated as a validation error.
    let outputs_size_expected = per_state_len
        .checked_mul(number_of_states)
        .unwrap_or_else(|| {
            panic!(
                "per-state output len ({}) times number of states ({}) overflows usize",
                per_state_len, number_of_states
            )
        });
    if outputs.len() < outputs_size_expected {
        panic!(
            "outputs len is {}; but needs to be at least {}",
            outputs.len(),
            outputs_size_expected,
        );
    }

    let ctx_device_id = ctx.device_id;
    // Host slices report no device id and are accepted for any context.
    if let Some(device_id) = outputs.device_id() {
        assert_eq!(
            device_id, ctx_device_id,
            "Device ids in outputs and context are different"
        );
    }
    check_device(ctx_device_id);
}

View File

@@ -1,7 +1,10 @@
use std::ffi::c_void;
pub mod curve;
pub mod ecntt;
pub mod error;
pub mod field;
pub mod hash;
pub mod msm;
pub mod ntt;
pub mod polynomials;
@@ -18,3 +21,11 @@ where
<Self::ScalarField as traits::FieldImpl>::Config: ntt::NTT<Self::ScalarField, Self::ScalarField>,
{
}
/// C-layout description of a 2D buffer: an untyped pointer plus its dimensions.
///
/// The pointer is raw, so this struct neither owns the data nor ties it to a
/// lifetime; whoever builds a `Matrix` must keep the underlying buffer alive
/// while the value is in use.
#[repr(C)]
#[derive(Debug)]
pub struct Matrix {
/// Untyped pointer to the first element.
/// NOTE(review): element type and memory layout (e.g. row-major) are not visible here; confirm against the native side.
pub values: *const c_void,
/// Width of the matrix.
pub width: usize,
/// Height of the matrix.
pub height: usize,
}

View File

@@ -1,212 +1,157 @@
#[doc(hidden)]
pub mod tests;
use icicle_cuda_runtime::{
device::check_device,
device_context::{DeviceContext, DEFAULT_DEVICE_ID},
memory::{DeviceSlice, HostOrDeviceSlice},
use std::{ffi::c_void, marker::PhantomData};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice};
use crate::{
error::IcicleResult,
hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
traits::FieldImpl,
};
use crate::{error::IcicleResult, traits::FieldImpl};
#[repr(C)]
pub struct PoseidonConstants<'a, F: FieldImpl> {
arity: u32,
partial_rounds: u32,
full_rounds_half: u32,
/// These should be pointers to data allocated on device
round_constants: &'a DeviceSlice<F>,
mds_matrix: &'a DeviceSlice<F>,
non_sparse_matrix: &'a DeviceSlice<F>,
sparse_matrices: &'a DeviceSlice<F>,
/// Domain tag is the first element in the Poseidon state.
/// For the Merkle tree mode it should equal 2^arity - 1
domain_tag: F,
/// Opaque pointer to a native Poseidon instance, as produced by the
/// `PoseidonImpl::create` / `PoseidonImpl::load` backends.
pub type PoseidonHandle = *const c_void;
/// Safe owner of a native Poseidon hasher over the field `F`.
///
/// The bounds are repeated on the struct because its `Drop` implementation
/// needs `PoseidonImpl<F>` to release the native instance.
pub struct Poseidon<F>
where
F: FieldImpl,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
/// State width; the constructors set it to `arity + 1`.
width: usize,
/// Native instance handle; released in `Drop`.
handle: PoseidonHandle,
/// Ties the otherwise unused type parameter `F` to the struct.
phantom: PhantomData<F>,
}
/// Struct that encodes Poseidon parameters to be passed into the [poseidon_hash_many](poseidon_hash_many) function.
#[repr(C)]
#[derive(Debug, Clone)]
pub struct PoseidonConfig<'a> {
/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
pub ctx: DeviceContext<'a>,
are_inputs_on_device: bool,
are_outputs_on_device: bool,
/// If true, input is considered to be a states vector, holding the preimages
/// in aligned or not aligned format. Memory under the input pointer will be used for states
/// If false, fresh states memory will be allocated and input will be copied into it
pub input_is_a_state: bool,
/// If true - input should be already aligned for poseidon permutation.
/// Aligned format: [0, A, B, 0, C, D, ...] (as you might get by using loop_state)
/// not aligned format: [A, B, 0, C, D, 0, ...] (as you might get from cudaMemcpy2D)
pub aligned: bool,
/// If true, hash results will also be copied in the input pointer in aligned format
pub loop_state: bool,
/// Whether to run Poseidon asynchronously. If set to `true`, Poseidon will be non-blocking
/// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
/// If set to `false`, Poseidon will block the current CPU thread.
pub is_async: bool,
}
impl<'a> Default for PoseidonConfig<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
impl<F> Poseidon<F>
where
F: FieldImpl,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
pub fn load(arity: usize, ctx: &DeviceContext) -> IcicleResult<Self> {
<<F as FieldImpl>::Config as PoseidonImpl<F>>::load(arity as u32, ctx).and_then(|handle| {
Ok(Self {
width: arity + 1,
handle,
phantom: PhantomData,
})
})
}
}
impl<'a> PoseidonConfig<'a> {
pub fn default_for_device(device_id: usize) -> Self {
Self {
ctx: DeviceContext::default_for_device(device_id),
are_inputs_on_device: false,
are_outputs_on_device: false,
input_is_a_state: false,
aligned: false,
loop_state: false,
is_async: false,
}
}
}
pub trait Poseidon<F: FieldImpl> {
fn create_optimized_constants<'a>(
arity: u32,
pub fn new(
arity: usize,
alpha: u32,
full_rounds_half: u32,
partial_rounds: u32,
constants: &mut [F],
round_constants: &[F],
mds_matrix: &[F],
non_sparse_matrix: &[F],
sparse_matrices: &[F],
domain_tag: F,
ctx: &DeviceContext,
) -> IcicleResult<PoseidonConstants<'a, F>>;
fn load_optimized_constants<'a>(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonConstants<'a, F>>;
fn poseidon_unchecked(
input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
) -> IcicleResult<Self> {
<<F as FieldImpl>::Config as PoseidonImpl<F>>::create(
arity as u32,
alpha,
full_rounds_half,
partial_rounds,
round_constants,
mds_matrix,
non_sparse_matrix,
sparse_matrices,
domain_tag,
ctx,
)
.and_then(|handle| {
Ok(Self {
width: arity + 1,
handle,
phantom: PhantomData,
})
})
}
}
impl<F> SpongeHash<F, F> for Poseidon<F>
where
F: FieldImpl,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
fn get_handle(&self) -> *const c_void {
self.handle
}
fn hash_many(
&self,
inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: usize,
input_block_len: usize,
output_len: usize,
cfg: &SpongeConfig,
) -> IcicleResult<()> {
sponge_check_input(inputs, number_of_states, input_block_len, self.width - 1, &cfg.ctx);
sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);
let mut local_cfg = cfg.clone();
local_cfg.are_inputs_on_device = inputs.is_on_device();
local_cfg.are_outputs_on_device = output.is_on_device();
<<F as FieldImpl>::Config as PoseidonImpl<F>>::hash_many(
inputs,
output,
number_of_states as u32,
input_block_len as u32,
output_len as u32,
self.handle,
&local_cfg,
)
}
fn default_config<'a>(&self) -> SpongeConfig<'a> {
let mut cfg = SpongeConfig::default();
cfg.input_rate = self.width as u32 - 1;
cfg.output_rate = self.width as u32;
cfg
}
}
impl<F> Drop for Poseidon<F>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: PoseidonImpl<F>,
{
    /// Releases the native Poseidon instance.
    ///
    /// # Panics
    ///
    /// Panics if the backend reports an error while deleting the instance,
    /// unless the thread is already unwinding from another panic.
    fn drop(&mut self) {
        let outcome = <<F as FieldImpl>::Config as PoseidonImpl<F>>::delete(self.handle);
        if let Err(err) = outcome {
            // A second panic raised while the thread is already unwinding
            // aborts the whole process, so the error is only escalated when no
            // other panic is in flight.
            if !std::thread::panicking() {
                panic!("failed to delete Poseidon instance: {:?}", err);
            }
        }
    }
}
pub trait PoseidonImpl<F: FieldImpl> {
fn create(
arity: u32,
alpha: u32,
full_rounds_half: u32,
partial_rounds: u32,
round_constants: &[F],
mds_matrix: &[F],
non_sparse_matrix: &[F],
sparse_matrices: &[F],
domain_tag: F,
ctx: &DeviceContext,
) -> IcicleResult<PoseidonHandle>;
fn load(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonHandle>;
fn hash_many(
inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
arity: u32,
constants: &PoseidonConstants<F>,
config: &PoseidonConfig,
input_block_len: u32,
output_len: u32,
poseidon: PoseidonHandle,
cfg: &SpongeConfig,
) -> IcicleResult<()>;
}
/// Loads pre-calculated poseidon constants on the GPU.
pub fn load_optimized_poseidon_constants<'a, F>(
arity: u32,
ctx: &DeviceContext,
) -> IcicleResult<PoseidonConstants<'a, F>>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon<F>,
{
<<F as FieldImpl>::Config as Poseidon<F>>::load_optimized_constants(arity, ctx)
}
/// Creates new instance of poseidon constants on the GPU.
pub fn create_optimized_poseidon_constants<'a, F>(
arity: u32,
ctx: &DeviceContext,
full_rounds_half: u32,
partial_rounds: u32,
constants: &mut [F],
) -> IcicleResult<PoseidonConstants<'a, F>>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon<F>,
{
<<F as FieldImpl>::Config as Poseidon<F>>::create_optimized_constants(
arity,
full_rounds_half,
partial_rounds,
constants,
ctx,
)
}
/// Computes the poseidon hashes for multiple preimages.
///
/// # Arguments
///
/// * `input` - a pointer to the input data. May point to a vector of preimages or a vector of states filled with preimages.
///
/// * `output` - a pointer to the output data. Must be at least of size [number_of_states](number_of_states)
///
/// * `number_of_states` - number of input blocks of size `arity`
///
/// * `arity` - the arity of the hash function (the size of 1 preimage)
///
/// * `constants` - Poseidon constants.
///
/// * `config` - config used to specify extra arguments of the Poseidon.
pub fn poseidon_hash_many<F>(
input: &mut (impl HostOrDeviceSlice<F> + ?Sized),
output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
arity: u32,
constants: &PoseidonConstants<F>,
config: &PoseidonConfig,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon<F>,
{
let input_len_required = if config.input_is_a_state {
number_of_states * (arity + 1)
} else {
number_of_states * arity
};
if input.len() < input_len_required as usize {
panic!(
"input len is {}; but needs to be at least {}",
input.len(),
input_len_required
);
}
if output.len() < number_of_states as usize {
panic!(
"output len is {}; but needs to be at least {}",
output.len(),
number_of_states
);
}
let ctx_device_id = config
.ctx
.device_id;
if let Some(device_id) = input.device_id() {
assert_eq!(
device_id, ctx_device_id,
"Device ids in input and context are different"
);
}
if let Some(device_id) = output.device_id() {
assert_eq!(
device_id, ctx_device_id,
"Device ids in output and context are different"
);
}
check_device(ctx_device_id);
let mut local_cfg = config.clone();
local_cfg.are_inputs_on_device = input.is_on_device();
local_cfg.are_outputs_on_device = output.is_on_device();
<<F as FieldImpl>::Config as Poseidon<F>>::poseidon_unchecked(
input,
output,
number_of_states,
arity,
constants,
&local_cfg,
)
fn delete(poseidon: PoseidonHandle) -> IcicleResult<()>;
}
#[macro_export]
@@ -218,91 +163,110 @@ macro_rules! impl_poseidon {
$field_config:ident
) => {
mod $field_prefix_ident {
use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, PoseidonConfig, PoseidonConstants};
use crate::poseidon::{$field, $field_config, CudaError, DeviceContext, PoseidonHandle, SpongeConfig};
extern "C" {
#[link_name = concat!($field_prefix, "_create_optimized_poseidon_constants_cuda")]
pub(crate) fn _create_optimized_constants(
#[link_name = concat!($field_prefix, "_poseidon_create_cuda")]
pub(crate) fn create(
poseidon: *mut PoseidonHandle,
arity: u32,
alpha: u32,
full_rounds_half: u32,
partial_rounds: u32,
constants: *mut $field,
round_constants: *const $field,
mds_matrix: *const $field,
non_sparse_matrix: *const $field,
sparse_matrices: *const $field,
domain_tag: $field,
ctx: &DeviceContext,
poseidon_constants: *mut PoseidonConstants<$field>,
) -> CudaError;
#[link_name = concat!($field_prefix, "_init_optimized_poseidon_constants_cuda")]
pub(crate) fn _load_optimized_constants(
arity: u32,
ctx: &DeviceContext,
constants: *mut PoseidonConstants<$field>,
) -> CudaError;
#[link_name = concat!($field_prefix, "_poseidon_load_cuda")]
pub(crate) fn load(poseidon: *mut PoseidonHandle, arity: u32, ctx: &DeviceContext) -> CudaError;
#[link_name = concat!($field_prefix, "_poseidon_hash_cuda")]
#[link_name = concat!($field_prefix, "_poseidon_delete_cuda")]
pub(crate) fn delete(poseidon: PoseidonHandle) -> CudaError;
#[link_name = concat!($field_prefix, "_poseidon_hash_many_cuda")]
pub(crate) fn hash_many(
input: *mut $field,
poseidon: PoseidonHandle,
inputs: *const $field,
output: *mut $field,
number_of_states: u32,
arity: u32,
constants: &PoseidonConstants<$field>,
config: &PoseidonConfig,
input_block_len: u32,
output_len: u32,
cfg: &SpongeConfig,
) -> CudaError;
}
}
impl Poseidon<$field> for $field_config {
fn create_optimized_constants<'a>(
impl PoseidonImpl<$field> for $field_config {
fn create(
arity: u32,
alpha: u32,
full_rounds_half: u32,
partial_rounds: u32,
constants: &mut [$field],
round_constants: &[$field],
mds_matrix: &[$field],
non_sparse_matrix: &[$field],
sparse_matrices: &[$field],
domain_tag: $field,
ctx: &DeviceContext,
) -> IcicleResult<PoseidonConstants<'a, $field>> {
) -> IcicleResult<PoseidonHandle> {
unsafe {
let mut poseidon_constants = MaybeUninit::<PoseidonConstants<'a, $field>>::uninit();
let err = $field_prefix_ident::_create_optimized_constants(
let mut poseidon = MaybeUninit::<PoseidonHandle>::uninit();
$field_prefix_ident::create(
poseidon.as_mut_ptr(),
arity,
alpha,
full_rounds_half,
partial_rounds,
constants as *mut _ as *mut $field,
round_constants as *const _ as *const $field,
mds_matrix as *const _ as *const $field,
non_sparse_matrix as *const _ as *const $field,
sparse_matrices as *const _ as *const $field,
domain_tag,
ctx,
poseidon_constants.as_mut_ptr(),
)
.wrap();
err.and(Ok(poseidon_constants.assume_init()))
.wrap()
.and(Ok(poseidon.assume_init()))
}
}
fn load_optimized_constants<'a>(
arity: u32,
ctx: &DeviceContext,
) -> IcicleResult<PoseidonConstants<'a, $field>> {
fn load(arity: u32, ctx: &DeviceContext) -> IcicleResult<PoseidonHandle> {
unsafe {
let mut constants = MaybeUninit::<PoseidonConstants<'a, $field>>::uninit();
let err = $field_prefix_ident::_load_optimized_constants(arity, ctx, constants.as_mut_ptr()).wrap();
err.and(Ok(constants.assume_init()))
let mut poseidon = MaybeUninit::<PoseidonHandle>::uninit();
$field_prefix_ident::load(poseidon.as_mut_ptr(), arity, ctx)
.wrap()
.and(Ok(poseidon.assume_init()))
}
}
fn poseidon_unchecked(
input: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
fn hash_many(
inputs: &(impl HostOrDeviceSlice<$field> + ?Sized),
output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
number_of_states: u32,
arity: u32,
constants: &PoseidonConstants<$field>,
config: &PoseidonConfig,
input_block_len: u32,
output_len: u32,
poseidon: PoseidonHandle,
cfg: &SpongeConfig,
) -> IcicleResult<()> {
unsafe {
$field_prefix_ident::hash_many(
input.as_mut_ptr(),
poseidon,
inputs.as_ptr(),
output.as_mut_ptr(),
number_of_states,
arity,
constants,
config,
input_block_len,
output_len,
cfg,
)
.wrap()
}
}
fn delete(poseidon: PoseidonHandle) -> IcicleResult<()> {
unsafe { $field_prefix_ident::delete(poseidon).wrap() }
}
}
};
}
@@ -318,18 +282,3 @@ macro_rules! impl_poseidon_tests {
}
};
}
/// Generates a `test_poseidon_custom_config` unit test for one field.
///
/// The generated test only forwards the macro arguments to
/// `check_poseidon_custom_config::<$field>`:
///
/// * `$field` - field type the check is instantiated with.
/// * `$field_bytes` - forwarded as the `field_bytes` argument.
/// * `$field_prefix` - forwarded as the `field_prefix` argument.
/// * `$partial_rounds` - forwarded as the `partial_rounds` argument.
///
/// `check_poseidon_custom_config` must be in scope at the expansion site.
#[macro_export]
macro_rules! impl_poseidon_custom_config_test {
(
$field:ident,
$field_bytes:literal,
$field_prefix:literal,
$partial_rounds:literal
) => {
#[test]
fn test_poseidon_custom_config() {
check_poseidon_custom_config::<$field>($field_bytes, $field_prefix, $partial_rounds)
}
};
}

View File

@@ -1,105 +1,48 @@
use crate::hash::SpongeHash;
use crate::traits::FieldImpl;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::{HostOrDeviceSlice, HostSlice};
use std::io::Read;
use std::path::PathBuf;
use std::{env, fs::File};
use super::{Poseidon, PoseidonImpl};
use super::{
create_optimized_poseidon_constants, load_optimized_poseidon_constants, poseidon_hash_many, Poseidon,
PoseidonConfig, PoseidonConstants,
};
pub fn init_poseidon<'a, F: FieldImpl>(arity: u32) -> PoseidonConstants<'a, F>
pub fn init_poseidon<F: FieldImpl>(arity: usize) -> Poseidon<F>
where
<F as FieldImpl>::Config: Poseidon<F>,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
let ctx = DeviceContext::default();
load_optimized_poseidon_constants::<F>(arity, &ctx).unwrap()
Poseidon::load(arity, &ctx).unwrap()
}
pub fn _check_poseidon_hash_many<F: FieldImpl>(constants: PoseidonConstants<F>) -> (F, F)
pub fn _check_poseidon_hash_many<F: FieldImpl>(poseidon: Poseidon<F>)
where
<F as FieldImpl>::Config: Poseidon<F>,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
let test_size = 1 << 10;
let arity = 2u32;
let mut inputs = vec![F::one(); test_size * arity as usize];
let arity = poseidon.width - 1;
let mut inputs = vec![F::one(); test_size * arity];
let mut outputs = vec![F::zero(); test_size];
let input_slice = HostSlice::from_mut_slice(&mut inputs);
let output_slice = HostSlice::from_mut_slice(&mut outputs);
let config = PoseidonConfig::default();
poseidon_hash_many::<F>(
input_slice,
output_slice,
test_size as u32,
arity as u32,
&constants,
&config,
)
.unwrap();
let cfg = poseidon.default_config();
poseidon
.hash_many(input_slice, output_slice, test_size, arity, 1, &cfg)
.unwrap();
let a1 = output_slice[0];
let a2 = output_slice[output_slice.len() - 2];
let a2 = output_slice[output_slice.len() - 1];
println!("first: {:?}, last: {:?}", a1, a2);
assert_eq!(a1, a2);
(a1, a2)
}
pub fn check_poseidon_hash_many<F: FieldImpl>()
where
<F as FieldImpl>::Config: Poseidon<F>,
<F as FieldImpl>::Config: PoseidonImpl<F>,
{
for arity in [2, 4] {
let constants = init_poseidon::<F>(arity as u32);
for arity in [2, 4, 8, 11] {
let poseidon = init_poseidon::<F>(arity);
_check_poseidon_hash_many(constants);
_check_poseidon_hash_many(poseidon);
}
}
/// Checks that constants built from a binary file hash identically to the preloaded ones.
///
/// Steps, in order:
/// 1. obtain the preloaded constants for arity 2 through `init_poseidon`;
/// 2. read `<CARGO_MANIFEST_DIR>/tests/<field_prefix>_constants.bin` and decode it in
///    `field_bytes`-sized little-endian chunks into field elements;
/// 3. build a second set of constants from those elements with
///    `create_optimized_poseidon_constants` (4 full rounds per half, `partial_rounds` partial rounds);
/// 4. run `_check_poseidon_hash_many` on both and compare the returned pairs.
///
/// # Panics
///
/// Panics if the file cannot be opened or read, if constant creation fails, or if the
/// two runs return different values.
pub fn check_poseidon_custom_config<F: FieldImpl>(field_bytes: usize, field_prefix: &str, partial_rounds: u32)
where
    <F as FieldImpl>::Config: Poseidon<F>,
{
    let arity = 2u32;
    let full_rounds_half = 4;

    let preloaded = init_poseidon::<F>(arity);
    let ctx = DeviceContext::default();

    // <crate root>/tests/<prefix>_constants.bin
    let mut constants_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    constants_path.push("tests");
    constants_path.push(format!("{}_constants.bin", field_prefix));

    let mut raw_bytes = Vec::new();
    let mut constants_file = File::open(constants_path).unwrap();
    constants_file
        .read_to_end(&mut raw_bytes)
        .unwrap();

    // One field element per `field_bytes`-sized chunk of the file.
    let mut elements: Vec<F> = raw_bytes
        .chunks(field_bytes)
        .map(|chunk| F::from_bytes_le(chunk))
        .collect();

    let custom = create_optimized_poseidon_constants::<F>(
        arity,
        &ctx,
        full_rounds_half,
        partial_rounds,
        &mut elements,
    )
    .unwrap();

    let expected = _check_poseidon_hash_many(preloaded);
    let actual = _check_poseidon_hash_many(custom);
    assert_eq!(expected.0, actual.0);
    assert_eq!(expected.1, actual.1);
}

View File

@@ -1,107 +1,66 @@
#[doc(hidden)]
pub mod tests;
use icicle_cuda_runtime::{
device::check_device,
device_context::{DeviceContext, DEFAULT_DEVICE_ID},
memory::{DeviceSlice, HostOrDeviceSlice},
use std::{ffi::c_void, marker::PhantomData};
use icicle_cuda_runtime::{device_context::DeviceContext, memory::HostOrDeviceSlice};
use crate::{
error::IcicleResult,
hash::{sponge_check_input, sponge_check_outputs, SpongeConfig, SpongeHash},
traits::FieldImpl,
};
use crate::{error::IcicleResult, traits::FieldImpl};
#[repr(C)]
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Copy)]
pub enum DiffusionStrategy {
Default,
Montgomery,
}
#[repr(C)]
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Copy)]
pub enum MdsType {
Default,
Plonky,
}
#[repr(C)]
#[derive(Debug, Clone)]
pub enum PoseidonMode {
Compression,
Permutation,
pub type Poseidon2Handle = *const c_void;
/// Handle-based wrapper around a Poseidon2 instance owned by the backend.
///
/// The struct keeps only the opaque `handle` produced by `Poseidon2Impl::load` /
/// `Poseidon2Impl::create`; the `Drop` implementation hands that handle back to
/// `Poseidon2Impl::delete`.
pub struct Poseidon2<F>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
// Width the instance was constructed with; also used for output checks and the default rates.
width: usize,
// Opaque backend handle; passed to every backend call made through this wrapper.
handle: Poseidon2Handle,
// Binds the wrapper to the field type `F` without storing a value of it.
phantom: PhantomData<F>,
}
#[repr(C)]
pub struct Poseidon2Constants<'a, F: FieldImpl> {
width: u32,
alpha: u32,
internal_rounds: u32,
external_rounds: u32,
round_constants: &'a DeviceSlice<F>,
inernal_matrix_diag: &'a DeviceSlice<F>,
pub mds_type: MdsType,
pub diffusion: DiffusionStrategy,
}
impl<F: FieldImpl> std::fmt::Debug for Poseidon2Constants<'_, F> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}, {}, {}, {}",
self.width, self.alpha, self.internal_rounds, self.external_rounds
)
impl<F> Poseidon2<F>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
pub fn load(
width: usize,
rate: usize,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Self> {
<<F as FieldImpl>::Config as Poseidon2Impl<F>>::load(width as u32, rate as u32, mds_type, diffusion, ctx)
.and_then(|handle| {
Ok(Self {
width,
handle,
phantom: PhantomData,
})
})
}
}
/// Struct that encodes Poseidon parameters to be passed into the [poseidon_hash_many](poseidon_hash_many) function.
#[repr(C)]
#[derive(Debug, Clone)]
pub struct Poseidon2Config<'a> {
/// Details related to the device such as its id and stream id. See [DeviceContext](@ref device_context::DeviceContext).
pub ctx: DeviceContext<'a>,
are_states_on_device: bool,
are_outputs_on_device: bool,
pub mode: PoseidonMode,
pub output_index: u32,
/// Whether to run Poseidon asynchronously. If set to `true`, Poseidon will be non-blocking
/// and you'd need to synchronize it explicitly by running `cudaStreamSynchronize` or `cudaDeviceSynchronize`.
/// If set to `false`, Poseidon will block the current CPU thread.
pub is_async: bool,
}
impl<'a> Default for Poseidon2Config<'a> {
fn default() -> Self {
Self::default_for_device(DEFAULT_DEVICE_ID)
}
}
impl<'a> Poseidon2Config<'a> {
pub fn default_for_device(device_id: usize) -> Self {
Self {
ctx: DeviceContext::default_for_device(device_id),
are_states_on_device: false,
are_outputs_on_device: false,
mode: PoseidonMode::Compression,
output_index: 1,
is_async: false,
}
}
}
pub trait Poseidon2<F: FieldImpl> {
fn create_constants<'a>(
width: u32,
pub fn new(
width: usize,
rate: usize,
alpha: u32,
internal_rounds: u32,
external_rounds: u32,
@@ -110,191 +69,122 @@ pub trait Poseidon2<F: FieldImpl> {
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Constants<'a, F>>;
fn load_constants<'a>(
) -> IcicleResult<Self> {
<<F as FieldImpl>::Config as Poseidon2Impl<F>>::create(
width as u32,
rate as u32,
alpha,
internal_rounds,
external_rounds,
round_constants,
internal_matrix_diag,
mds_type,
diffusion,
ctx,
)
.and_then(|handle| {
Ok(Self {
width,
handle,
phantom: PhantomData,
})
})
}
}
/// `SpongeHash` implementation that routes every call through the stored backend handle.
impl<F> SpongeHash<F, F> for Poseidon2<F>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: Poseidon2Impl<F>,
{
    /// Returns the raw backend handle held by this instance.
    fn get_handle(&self) -> *const c_void {
        self.handle
    }

    /// Hashes `number_of_states` input blocks through the backend.
    ///
    /// The arguments are first validated with `sponge_check_input` and
    /// `sponge_check_outputs`. The backend then receives a copy of `cfg` whose
    /// `are_inputs_on_device` / `are_outputs_on_device` flags are overwritten with what
    /// the slices themselves report, so the values set by the caller are ignored.
    fn hash_many(
        &self,
        inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
        output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
        number_of_states: usize,
        input_block_len: usize,
        output_len: usize,
        cfg: &SpongeConfig,
    ) -> IcicleResult<()> {
        let input_rate = cfg.input_rate as usize;
        sponge_check_input(inputs, number_of_states, input_block_len, input_rate, &cfg.ctx);
        sponge_check_outputs(output, number_of_states, output_len, self.width, false, &cfg.ctx);

        // Residency is taken from the slices, not from the caller's config.
        let mut backend_cfg = cfg.clone();
        backend_cfg.are_inputs_on_device = inputs.is_on_device();
        backend_cfg.are_outputs_on_device = output.is_on_device();

        let states = number_of_states as u32;
        let block_len = input_block_len as u32;
        let out_len = output_len as u32;
        <<F as FieldImpl>::Config as Poseidon2Impl<F>>::hash_many(
            inputs,
            output,
            states,
            block_len,
            out_len,
            self.handle,
            &backend_cfg,
        )
    }

    /// Returns `SpongeConfig::default()` with both rates set to this instance's width.
    fn default_config<'a>(&self) -> SpongeConfig<'a> {
        let rate = self.width as u32;
        let mut cfg = SpongeConfig::default();
        cfg.input_rate = rate;
        cfg.output_rate = rate;
        cfg
    }
}
/// Releases the backend Poseidon2 instance when the wrapper goes out of scope.
impl<F> Drop for Poseidon2<F>
where
    F: FieldImpl,
    <F as FieldImpl>::Config: Poseidon2Impl<F>,
{
    /// Passes the stored handle to `Poseidon2Impl::delete`.
    ///
    /// # Panics
    ///
    /// Panics if the backend reports an error, unless the current thread is already
    /// unwinding from another panic.
    fn drop(&mut self) {
        let result = <<F as FieldImpl>::Config as Poseidon2Impl<F>>::delete(self.handle);
        // A second panic raised from `drop` during unwinding aborts the whole process,
        // which would hide the original failure; only surface the error when it is safe.
        if !std::thread::panicking() {
            result.unwrap();
        }
    }
}
pub trait Poseidon2Impl<F: FieldImpl> {
fn create(
width: u32,
rate: u32,
alpha: u32,
internal_rounds: u32,
external_rounds: u32,
round_constants: &[F],
internal_matrix_diag: &[F],
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Constants<'a, F>>;
fn poseidon_unchecked(
states: &(impl HostOrDeviceSlice<F> + ?Sized),
) -> IcicleResult<Poseidon2Handle>;
fn load(
width: u32,
rate: u32,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Handle>;
fn hash_many(
inputs: &(impl HostOrDeviceSlice<F> + ?Sized),
output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<F>,
config: &Poseidon2Config,
input_block_len: u32,
output_len: u32,
poseidon: Poseidon2Handle,
cfg: &SpongeConfig,
) -> IcicleResult<()>;
fn poseidon_unchecked_inplace(
states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<F>,
config: &Poseidon2Config,
) -> IcicleResult<()>;
fn release_constants(constants: &Poseidon2Constants<F>, ctx: &DeviceContext) -> IcicleResult<()>;
}
/// Loads pre-calculated Poseidon2 constants on the GPU.
///
/// Thin generic entry point: all arguments are forwarded unchanged to
/// `Poseidon2::load_constants` of the field's `Config` type, and its result is returned as is.
///
/// # Arguments
///
/// * `width` - width the constants are requested for.
///
/// * `mds_type` - MDS matrix variant to load.
///
/// * `diffusion` - diffusion strategy to load.
///
/// * `ctx` - device context the call is issued with.
pub fn load_poseidon2_constants<'a, F>(
width: u32,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Constants<'a, F>>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2<F>,
{
<<F as FieldImpl>::Config as Poseidon2<F>>::load_constants(width, mds_type, diffusion, ctx)
}
/// Creates a new instance of Poseidon2 constants on the GPU from caller-supplied values.
///
/// Thin generic entry point over `Poseidon2::create_constants` of the field's `Config` type.
/// Note the argument order: this function takes `ctx` third, while the trait method
/// receives it last.
///
/// # Arguments
///
/// * `width`, `alpha`, `internal_rounds`, `external_rounds` - numeric parameters, forwarded unchanged.
///
/// * `ctx` - device context the call is issued with.
///
/// * `round_constants`, `internal_matrix_diag` - constant values, forwarded unchanged.
///
/// * `mds_type`, `diffusion` - variant selectors, forwarded unchanged.
pub fn create_poseidon2_constants<'a, F>(
width: u32,
alpha: u32,
ctx: &DeviceContext,
internal_rounds: u32,
external_rounds: u32,
round_constants: &mut [F],
internal_matrix_diag: &mut [F],
mds_type: MdsType,
diffusion: DiffusionStrategy,
) -> IcicleResult<Poseidon2Constants<'a, F>>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2<F>,
{
<<F as FieldImpl>::Config as Poseidon2<F>>::create_constants(
width,
alpha,
internal_rounds,
external_rounds,
round_constants,
internal_matrix_diag,
mds_type,
diffusion,
// `ctx` moves from third position here to last position in the trait method.
ctx,
)
}
/// Validates the arguments of a Poseidon2 batch call before it is handed to the backend.
///
/// Checks, in order:
/// 1. `states` holds at least `number_of_states * width` elements;
/// 2. `output` holds at least `number_of_states` elements;
/// 3. any device id reported by `states` or `output` equals the one in `config.ctx`;
/// 4. `check_device` accepts the context's device id.
///
/// # Panics
///
/// Panics when a buffer is too short or a device id differs from the context's.
fn poseidon_checks<F>(
    states: &(impl HostOrDeviceSlice<F> + ?Sized),
    output: &(impl HostOrDeviceSlice<F> + ?Sized),
    number_of_states: u32,
    width: u32,
    config: &Poseidon2Config,
) where
    F: FieldImpl,
    <F as FieldImpl>::Config: Poseidon2<F>,
{
    // Widen before multiplying: the product of two `u32` values always fits in `u64`,
    // whereas a `u32` product can overflow and let an undersized buffer pass the check.
    let required_states_len = u64::from(number_of_states) * u64::from(width);
    if (states.len() as u64) < required_states_len {
        panic!(
            "input len is {}; but needs to be at least {}",
            states.len(),
            required_states_len
        );
    }
    if (output.len() as u64) < u64::from(number_of_states) {
        panic!(
            "output len is {}; but needs to be at least {}",
            output.len(),
            number_of_states
        );
    }

    let ctx_device_id = config
        .ctx
        .device_id;
    if let Some(device_id) = states.device_id() {
        assert_eq!(
            device_id, ctx_device_id,
            "Device ids in input and context are different"
        );
    }
    if let Some(device_id) = output.device_id() {
        assert_eq!(
            device_id, ctx_device_id,
            "Device ids in output and context are different"
        );
    }
    check_device(ctx_device_id);
}
/// Computes the Poseidon2 hashes for multiple states.
///
/// The arguments are validated with `poseidon_checks`, then forwarded to
/// `Poseidon2::poseidon_unchecked` together with a copy of `config` whose
/// `are_states_on_device` / `are_outputs_on_device` flags are taken from the slices themselves.
///
/// # Arguments
///
/// * `states` - the input data. Must hold at least `number_of_states * width` elements.
///
/// * `output` - the output data. Must hold at least `number_of_states` elements.
///
/// * `number_of_states` - number of input states of size `width`
///
/// * `width` - the number of elements in one state
///
/// * `constants` - Poseidon2 constants.
///
/// * `config` - config used to specify extra arguments of the Poseidon2.
///
/// # Panics
///
/// Panics (inside `poseidon_checks`) if a buffer is too short or if a slice lives on a
/// device other than the one named in `config.ctx`.
pub fn poseidon2_hash_many<F>(
states: &(impl HostOrDeviceSlice<F> + ?Sized),
output: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<F>,
config: &Poseidon2Config,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2<F>,
{
poseidon_checks(states, output, number_of_states, width, config);
// The caller's config is left untouched; residency flags are set on a private copy.
let mut local_cfg = config.clone();
local_cfg.are_states_on_device = states.is_on_device();
local_cfg.are_outputs_on_device = output.is_on_device();
<<F as FieldImpl>::Config as Poseidon2<F>>::poseidon_unchecked(
states,
output,
number_of_states,
width,
constants,
&local_cfg,
)
}
/// In-place variant of the Poseidon2 batch call: `states` is both the input and the output buffer.
///
/// The arguments are validated with `poseidon_checks` (with `states` passed as both the input
/// and the output slice), then forwarded to `Poseidon2::poseidon_unchecked_inplace` together
/// with a copy of `config` whose two residency flags are both taken from `states`.
///
/// # Arguments
///
/// * `states` - the data to process. Must hold at least `number_of_states * width` elements.
///
/// * `number_of_states` - number of states of size `width`
///
/// * `width` - the number of elements in one state
///
/// * `constants` - Poseidon2 constants.
///
/// * `config` - config used to specify extra arguments of the Poseidon2.
///
/// # Panics
///
/// Panics (inside `poseidon_checks`) if `states` is too short or lives on a device other than
/// the one named in `config.ctx`.
pub fn poseidon2_hash_many_inplace<F>(
states: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<F>,
config: &Poseidon2Config,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2<F>,
{
poseidon_checks(states, states, number_of_states, width, config);
let mut local_cfg = config.clone();
// Input and output are the same buffer, so both flags come from `states`.
local_cfg.are_states_on_device = states.is_on_device();
local_cfg.are_outputs_on_device = states.is_on_device();
<<F as FieldImpl>::Config as Poseidon2<F>>::poseidon_unchecked_inplace(
states,
number_of_states,
width,
constants,
&local_cfg,
)
}
pub fn release_poseidon2_constants<'a, F>(constants: &Poseidon2Constants<F>, ctx: &DeviceContext) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: Poseidon2<F>,
{
<<F as FieldImpl>::Config as Poseidon2<F>>::release_constants(constants, ctx)
fn delete(poseidon: Poseidon2Handle) -> IcicleResult<()>;
}
#[macro_export]
@@ -307,140 +197,125 @@ macro_rules! impl_poseidon2 {
) => {
mod $field_prefix_ident {
use crate::poseidon2::{
$field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Config,
Poseidon2Constants,
$field, $field_config, CudaError, DeviceContext, DiffusionStrategy, MdsType, Poseidon2Handle,
SpongeConfig,
};
use icicle_core::error::IcicleError;
extern "C" {
#[link_name = concat!($field_prefix, "_create_poseidon2_constants_cuda")]
pub(crate) fn _create_constants(
#[link_name = concat!($field_prefix, "_poseidon2_create_cuda")]
pub(crate) fn create(
poseidon: *mut Poseidon2Handle,
width: u32,
rate: u32,
alpha: u32,
internal_rounds: u32,
external_rounds: u32,
constants: *mut $field,
internal_matrix_diag: *mut $field,
constants: *const $field,
internal_matrix_diag: *const $field,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
poseidon_constants: *mut Poseidon2Constants<$field>,
) -> CudaError;
#[link_name = concat!($field_prefix, "_init_poseidon2_constants_cuda")]
pub(crate) fn _load_constants(
#[link_name = concat!($field_prefix, "_poseidon2_load_cuda")]
pub(crate) fn load(
poseidon: *mut Poseidon2Handle,
width: u32,
rate: u32,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
constants: *mut Poseidon2Constants<$field>,
) -> CudaError;
#[link_name = concat!($field_prefix, "_release_poseidon2_constants_cuda")]
pub(crate) fn _release_constants(
constants: &Poseidon2Constants<$field>,
ctx: &DeviceContext,
) -> CudaError;
#[link_name = concat!($field_prefix, "_poseidon2_delete_cuda")]
pub(crate) fn delete(poseidon: Poseidon2Handle) -> CudaError;
#[link_name = concat!($field_prefix, "_poseidon2_hash_cuda")]
#[link_name = concat!($field_prefix, "_poseidon2_hash_many_cuda")]
pub(crate) fn hash_many(
states: *const $field,
poseidon: Poseidon2Handle,
inputs: *const $field,
output: *mut $field,
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<$field>,
config: &Poseidon2Config,
input_block_len: u32,
output_len: u32,
cfg: &SpongeConfig,
) -> CudaError;
}
}
impl Poseidon2<$field> for $field_config {
fn create_constants<'a>(
impl Poseidon2Impl<$field> for $field_config {
fn create(
width: u32,
rate: u32,
alpha: u32,
internal_rounds: u32,
external_rounds: u32,
round_constants: &mut [$field],
internal_matrix_diag: &mut [$field],
round_constants: &[$field],
internal_matrix_diag: &[$field],
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Constants<'a, $field>> {
) -> IcicleResult<Poseidon2Handle> {
unsafe {
let mut poseidon_constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
let err = $field_prefix_ident::_create_constants(
let mut poseidon = MaybeUninit::<Poseidon2Handle>::uninit();
$field_prefix_ident::create(
poseidon.as_mut_ptr(),
width,
rate,
alpha,
internal_rounds,
external_rounds,
round_constants as *mut _ as *mut $field,
internal_matrix_diag as *mut _ as *mut $field,
round_constants as *const _ as *const $field,
internal_matrix_diag as *const _ as *const $field,
mds_type,
diffusion,
ctx,
poseidon_constants.as_mut_ptr(),
)
.wrap();
err.and(Ok(poseidon_constants.assume_init()))
.wrap()
.and(Ok(poseidon.assume_init()))
}
}
fn load_constants<'a>(
fn load(
width: u32,
rate: u32,
mds_type: MdsType,
diffusion: DiffusionStrategy,
ctx: &DeviceContext,
) -> IcicleResult<Poseidon2Constants<'a, $field>> {
) -> IcicleResult<Poseidon2Handle> {
unsafe {
let mut constants = MaybeUninit::<Poseidon2Constants<'a, $field>>::uninit();
let err =
$field_prefix_ident::_load_constants(width, mds_type, diffusion, ctx, constants.as_mut_ptr())
.wrap();
err.and(Ok(constants.assume_init()))
let mut poseidon = MaybeUninit::<Poseidon2Handle>::uninit();
$field_prefix_ident::load(poseidon.as_mut_ptr(), width, rate, mds_type, diffusion, ctx)
.wrap()
.and(Ok(poseidon.assume_init()))
}
}
fn poseidon_unchecked(
states: &(impl HostOrDeviceSlice<$field> + ?Sized),
fn hash_many(
inputs: &(impl HostOrDeviceSlice<$field> + ?Sized),
output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<$field>,
config: &Poseidon2Config,
input_block_len: u32,
output_len: u32,
poseidon: Poseidon2Handle,
cfg: &SpongeConfig,
) -> IcicleResult<()> {
unsafe {
$field_prefix_ident::hash_many(
states.as_ptr(),
poseidon,
inputs.as_ptr(),
output.as_mut_ptr(),
number_of_states,
width,
constants,
config,
input_block_len,
output_len,
cfg,
)
.wrap()
}
}
fn poseidon_unchecked_inplace(
states: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
number_of_states: u32,
width: u32,
constants: &Poseidon2Constants<$field>,
config: &Poseidon2Config,
) -> IcicleResult<()> {
unsafe {
$field_prefix_ident::hash_many(
states.as_ptr(),
states.as_mut_ptr(),
number_of_states,
width,
constants,
config,
)
.wrap()
}
}
fn release_constants<'a>(constants: &Poseidon2Constants<$field>, ctx: &DeviceContext) -> IcicleResult<()> {
unsafe { $field_prefix_ident::_release_constants(constants, ctx).wrap() }
fn delete(poseidon: Poseidon2Handle) -> IcicleResult<()> {
unsafe { $field_prefix_ident::delete(poseidon).wrap() }
}
}
};
@@ -466,42 +341,41 @@ pub mod bench {
};
use crate::{
hash::SpongeHash,
ntt::FieldImpl,
poseidon2::{load_poseidon2_constants, DiffusionStrategy, MdsType},
poseidon2::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Impl},
traits::GenerateRandom,
vec_ops::VecOps,
};
use super::{poseidon2_hash_many, Poseidon2, Poseidon2Config, Poseidon2Constants};
#[allow(unused)]
fn poseidon2_for_bench<'a, F: FieldImpl>(
fn poseidon2_for_bench<F: FieldImpl>(
poseidon: &Poseidon2<F>,
states: &(impl HostOrDeviceSlice<F> + ?Sized),
poseidon2_result: &mut (impl HostOrDeviceSlice<F> + ?Sized),
number_of_states: usize,
width: usize,
constants: &Poseidon2Constants<'a, F>,
config: &Poseidon2Config,
ctx: &DeviceContext,
_seed: u32,
) where
<F as FieldImpl>::Config: Poseidon2<F> + GenerateRandom<F>,
<F as FieldImpl>::Config: VecOps<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F> + GenerateRandom<F>,
{
poseidon2_hash_many(
states,
poseidon2_result,
number_of_states as u32,
width as u32,
constants,
config,
)
.unwrap();
let cfg = poseidon.default_config();
poseidon
.hash_many(
states,
poseidon2_result,
number_of_states,
poseidon.width,
poseidon.width,
&cfg,
)
.unwrap();
}
#[allow(unused)]
pub fn benchmark_poseidon2<F: FieldImpl>(c: &mut Criterion)
where
<F as FieldImpl>::Config: Poseidon2<F> + GenerateRandom<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F> + GenerateRandom<F>,
<F as FieldImpl>::Config: VecOps<F>,
{
use criterion::SamplingMode;
@@ -519,7 +393,7 @@ pub mod bench {
.parse::<u32>()
.unwrap_or(MAX_LOG2);
for test_size_log2 in 13u32..max_log2 + 1 {
for test_size_log2 in 18u32..max_log2 + 1 {
for t in [2, 3, 4, 8, 16, 20, 24] {
let number_of_states = 1 << test_size_log2;
let full_size = t * number_of_states;
@@ -531,31 +405,27 @@ pub mod bench {
let permutation_result_slice = HostSlice::from_mut_slice(&mut permutation_result);
let ctx = DeviceContext::default();
let config = Poseidon2Config::default();
for mds in [MdsType::Default, MdsType::Plonky] {
for diffusion in [DiffusionStrategy::Default, DiffusionStrategy::Montgomery] {
let constants =
load_poseidon2_constants(t as u32, mds.clone(), diffusion.clone(), &ctx).unwrap();
let bench_descr = format!(
"Mds::{:?}; Diffusion::{:?}; Number of states: {}; Width: {}",
mds, diffusion, number_of_states, t
);
group.bench_function(&bench_descr, |b| {
b.iter(|| {
poseidon2_for_bench::<F>(
input,
permutation_result_slice,
number_of_states,
t,
&constants,
&config,
black_box(1),
)
})
});
// }
}
for (mds, diffusion) in [
(MdsType::Default, DiffusionStrategy::Default),
(MdsType::Plonky, DiffusionStrategy::Montgomery),
] {
let poseidon = Poseidon2::<F>::load(t, t, mds, diffusion, &ctx).unwrap();
let bench_descr = format!(
"TestSize: 2**{}, Mds::{:?}, Diffusion::{:?}, Width: {}",
test_size_log2, mds, diffusion, t
);
group.bench_function(&bench_descr, |b| {
b.iter(|| {
poseidon2_for_bench::<F>(
&poseidon,
input,
permutation_result_slice,
number_of_states,
&ctx,
black_box(1),
)
})
});
}
}
}

View File

@@ -1,27 +1,21 @@
use crate::poseidon2::{MdsType, PoseidonMode};
use crate::hash::SpongeHash;
use crate::traits::FieldImpl;
use icicle_cuda_runtime::device_context::DeviceContext;
use icicle_cuda_runtime::memory::{HostOrDeviceSlice, HostSlice};
use super::{
load_poseidon2_constants, poseidon2_hash_many, DiffusionStrategy, Poseidon2, Poseidon2Config, Poseidon2Constants,
};
use super::{DiffusionStrategy, MdsType, Poseidon2, Poseidon2Impl};
pub fn init_poseidon<'a, F: FieldImpl>(
width: u32,
mds_type: MdsType,
diffusion: DiffusionStrategy,
) -> Poseidon2Constants<'a, F>
pub fn init_poseidon<F: FieldImpl>(width: usize, mds_type: MdsType, diffusion: DiffusionStrategy) -> Poseidon2<F>
where
<F as FieldImpl>::Config: Poseidon2<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
let ctx = DeviceContext::default();
load_poseidon2_constants::<F>(width, mds_type, diffusion, &ctx).unwrap()
Poseidon2::load(width, width, mds_type, diffusion, &ctx).unwrap()
}
fn _check_poseidon_hash_many<F: FieldImpl>(width: u32, constants: Poseidon2Constants<F>) -> (F, F)
fn _check_poseidon_hash_many<F: FieldImpl>(width: usize, poseidon: &Poseidon2<F>) -> (F, F)
where
<F as FieldImpl>::Config: Poseidon2<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
let test_size = 1 << 10;
let mut inputs = vec![F::one(); test_size * width as usize];
@@ -30,16 +24,10 @@ where
let input_slice = HostSlice::from_mut_slice(&mut inputs);
let output_slice = HostSlice::from_mut_slice(&mut outputs);
let config = Poseidon2Config::default();
poseidon2_hash_many::<F>(
input_slice,
output_slice,
test_size as u32,
width as u32,
&constants,
&config,
)
.unwrap();
let cfg = poseidon.default_config();
poseidon
.hash_many(input_slice, output_slice, test_size, width, 1, &cfg)
.unwrap();
let a1 = output_slice[0];
let a2 = output_slice[output_slice.len() - 2];
@@ -49,21 +37,22 @@ where
(a1, a2)
}
pub fn check_poseidon_hash_many<'a, F: FieldImpl + 'a>()
pub fn check_poseidon_hash_many<F: FieldImpl>()
where
<F as FieldImpl>::Config: Poseidon2<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
let widths = [2, 3, 4, 8, 12, 16, 20, 24];
let ctx = DeviceContext::default();
for width in widths {
let constants = init_poseidon::<'a, F>(width as u32, MdsType::Default, DiffusionStrategy::Default);
let poseidon = Poseidon2::<F>::load(width, width, MdsType::Default, DiffusionStrategy::Default, &ctx).unwrap();
_check_poseidon_hash_many(width, constants);
_check_poseidon_hash_many(width, &poseidon);
}
}
pub fn check_poseidon_kats<'a, F: FieldImpl>(width: usize, kats: &[F], constants: &Poseidon2Constants<'a, F>)
pub fn check_poseidon_kats<F: FieldImpl>(width: usize, kats: &[F], poseidon: &Poseidon2<F>)
where
<F as FieldImpl>::Config: Poseidon2<F>,
<F as FieldImpl>::Config: Poseidon2Impl<F>,
{
assert_eq!(width, kats.len());
@@ -83,17 +72,11 @@ where
let input_slice = HostSlice::from_mut_slice(&mut inputs);
let output_slice = HostSlice::from_mut_slice(&mut outputs);
let mut config = Poseidon2Config::default();
config.mode = PoseidonMode::Permutation;
poseidon2_hash_many::<F>(
input_slice,
output_slice,
batch_size as u32,
width as u32,
&constants,
&config,
)
.unwrap();
let cfg = poseidon.default_config();
poseidon
.hash_many(input_slice, output_slice, batch_size, width, width, &cfg)
.unwrap();
for (i, val) in output_slice
.iter()

View File

@@ -0,0 +1,79 @@
use icicle_cuda_runtime::memory::HostSlice;
use crate::{error::IcicleResult, ntt::FieldImpl};
use crate::{hash::SpongeHash, Matrix};
use super::TreeBuilderConfig;
/// Commitment over a batch of matrices, parameterised by two sponge hashes.
///
/// Type parameters:
/// * `F` - field the digests are expressed in.
/// * `Compression` - type of the hash passed as `compression`.
/// * `Hasher` - type of the hash passed as `hasher`.
///
/// Note that the generic parameters are declared `Compression, Hasher`, while
/// `mmcs_commit` takes its arguments in the opposite order (`hasher`, then `compression`).
pub trait FieldMmcs<F, Compression, Hasher>
where
F: FieldImpl,
Compression: SpongeHash<F, F>,
Hasher: SpongeHash<F, F>,
{
/// Commits to `leaves`, writing the resulting digests into `digests`.
///
/// * `leaves` - the matrices to commit to.
/// * `digests` - host buffer receiving the digests; required size is not visible here — TODO confirm against the backend.
/// * `hasher` - first hash handed to the backend.
/// * `compression` - second hash handed to the backend.
/// * `config` - tree builder configuration forwarded to the backend.
fn mmcs_commit(
leaves: Vec<Matrix>,
digests: &mut HostSlice<F>,
hasher: &Hasher,
compression: &Compression,
config: &TreeBuilderConfig,
) -> IcicleResult<()>;
}
/// Declares the `<prefix>_mmcs_commit_cuda` binding for one field and implements
/// `FieldMmcs` on a new unit struct that calls it.
///
/// * `$field_prefix` - string prefix of the exported C symbol.
/// * `$field_prefix_ident` - name of the private module that holds the `extern "C"` item.
/// * `$field` - field element type.
/// * `$field_config` - accepted for parity with the sibling `impl_*` macros; not used in the expansion.
/// * `$mmcs` - name of the unit struct that receives the `FieldMmcs` implementation.
///
/// `Matrix`, `TreeBuilderConfig`, `FieldMmcs`, `SpongeHash`, `HostSlice`, `IcicleResult` and
/// `c_void` must be in scope at the expansion site.
#[macro_export]
macro_rules! impl_mmcs {
    (
        $field_prefix:literal,
        $field_prefix_ident:ident,
        $field:ident,
        $field_config:ident,
        $mmcs:ident
    ) => {
        mod $field_prefix_ident {
            use super::*;
            use icicle_cuda_runtime::error::CudaError;

            extern "C" {
                #[link_name = concat!($field_prefix, "_mmcs_commit_cuda")]
                pub(crate) fn mmcs_commit_cuda(
                    leaves: *const Matrix,
                    number_of_inputs: u32,
                    digests: *mut $field,
                    hasher: *const c_void,
                    compression: *const c_void,
                    config: &TreeBuilderConfig,
                ) -> CudaError;
            }
        }

        struct $mmcs;

        impl<Compression, Hasher> FieldMmcs<$field, Compression, Hasher> for $mmcs
        where
            Compression: SpongeHash<$field, $field>,
            Hasher: SpongeHash<$field, $field>,
        {
            fn mmcs_commit(
                leaves: Vec<Matrix>,
                digests: &mut HostSlice<$field>,
                hasher: &Hasher,
                compression: &Compression,
                config: &TreeBuilderConfig,
            ) -> IcicleResult<()> {
                // SAFETY: `leaves` and `digests` are live for the whole call, so the pointers
                // derived from them stay valid; the validity of the two hash handles is the
                // responsibility of the `SpongeHash` implementors.
                unsafe {
                    $field_prefix_ident::mmcs_commit_cuda(
                        leaves.as_ptr(),
                        leaves.len() as u32,
                        digests.as_mut_ptr(),
                        // Both handles are `*const c_void`, so the compiler cannot catch a swap:
                        // the order must follow the extern declaration (hasher, then compression).
                        hasher.get_handle(),
                        compression.get_handle(),
                        config,
                    )
                    .wrap()
                }
            }
        }
    };
}

Some files were not shown because too many files have changed in this diff Show More