#include "../../utils/error_handler.cuh"
|
|
#include "poseidon.cuh"
|
|
|
|
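// One thread per state element. Element 0 of each state receives the domain
// tag (domain separation), elements 1..t-1 receive the input block shifted by
// one position, and every element then gets its pre-round constant added.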
template <typename S>
__global__ void
prepare_poseidon_states(S* states, size_t number_of_states, S domain_tag, const PoseidonConfiguration<S> config)
{
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  int state_number = idx / config.t;
  if (state_number >= number_of_states) { return; }
  int element_number = idx % config.t;

  S prepared_element;

  // Domain separation
  if (element_number == 0) {
    prepared_element = domain_tag;
  } else {
    prepared_element = states[state_number * config.t + element_number - 1];
  }

  // Add pre-round constant
  prepared_element = prepared_element + config.round_constants[element_number];

  // Store element in state
  states[idx] = prepared_element;
}

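// Computes element^5 (the Poseidon S-box for alpha = 5) with two squarings
// and one multiplication.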
template <typename S>
__device__ __forceinline__ S sbox_alpha_five(S element)
{
  S result = S::sqr(element);
  result = S::sqr(result);
  return result * element;
}

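// Multiplies a state row-vector by a t x t matrix. Each thread holds one
// element; the whole block stages its elements in shared memory, then each
// thread computes one entry of the resulting vector for its own state.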
template <typename S>
__device__ S vecs_mul_matrix(S element, S* matrix, int element_number, int vec_number, int size, S* shared_states)
{
  shared_states[threadIdx.x] = element;
  __syncthreads();

  element = S::zero();
  for (int i = 0; i < size; i++) {
    element = element + (shared_states[vec_number * size + i] * matrix[i * size + element_number]);
  }
  __syncthreads();
  return element;
}

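// A single full round for one state element: apply the S-box, optionally add
// the round constant, then multiply the state by either the MDS matrix or the
// non-sparse matrix.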
template <typename S>
__device__ S full_round(
  S element,
  size_t rc_offset,
  int local_state_number,
  int element_number,
  bool multiply_by_mds,
  bool add_round_constant,
  S* shared_states,
  const PoseidonConfiguration<S> config)
{
  element = sbox_alpha_five(element);
  if (add_round_constant) { element = element + config.round_constants[rc_offset + element_number]; }

  // Multiply the state by the selected matrix (MDS or non-sparse)
  S* matrix = multiply_by_mds ? config.mds_matrix : config.non_sparse_matrix;
  return vecs_mul_matrix(element, matrix, element_number, local_state_number, config.t, shared_states);
}

// Execute full rounds
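// Each thread processes one state element; the launch must provide
// hashes_per_block * t * sizeof(S) bytes of dynamic shared memory for the
// matrix multiplications (see hash_blocks below).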
template <typename S>
__global__ void full_rounds(
  S* states, size_t number_of_states, size_t rc_offset, bool first_half, const PoseidonConfiguration<S> config)
{
  extern __shared__ S shared_states[];

  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  int state_number = idx / config.t;
  if (state_number >= number_of_states) { return; }
  int local_state_number = threadIdx.x / config.t;
  int element_number = idx % config.t;

  for (int i = 0; i < config.full_rounds_half - 1; i++) {
    states[idx] =
      full_round(states[idx], rc_offset, local_state_number, element_number, true, true, shared_states, config);
    rc_offset += config.t;
  }

  states[idx] = full_round(
    states[idx], rc_offset, local_state_number, element_number, !first_half, first_half, shared_states, config);
}

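// A single partial round, executed by one thread over the whole state: apply
// the S-box to element 0 only, add its round constant, then apply the sparse
// matrix (its first row produces the new state[0]; its stored column scales
// the S-boxed element into the remaining elements).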
template <typename S>
__device__ void partial_round(S* state, size_t rc_offset, int round_number, const PoseidonConfiguration<S> config)
{
  S element = state[0];
  element = sbox_alpha_five(element);
  element = element + config.round_constants[rc_offset];

  S* sparse_matrix = &config.sparse_matrices[(config.t * 2 - 1) * round_number];

  state[0] = element * sparse_matrix[0];
  for (int i = 1; i < config.t; i++) {
    state[0] = state[0] + (state[i] * sparse_matrix[i]);
  }

  for (int i = 1; i < config.t; i++) {
    state[i] = state[i] + (element * sparse_matrix[config.t + i - 1]);
  }
}

// Execute partial rounds
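// One thread processes one whole state (a single hash preimage).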
template <typename S>
__global__ void
partial_rounds(S* states, size_t number_of_states, size_t rc_offset, const PoseidonConfiguration<S> config)
{
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (idx >= number_of_states) { return; }

  S* state = &states[idx * config.t];

  for (int i = 0; i < config.partial_rounds; i++) {
    partial_round(state, rc_offset, i, config);
    rc_offset++;
  }
}

// Copies the hash result (element 1 of each state) from the states to the output
template <typename S>
__global__ void get_hash_results(S* states, size_t number_of_states, S* out, int t)
{
  int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (idx >= number_of_states) { return; }

  out[idx] = states[idx * t + 1];
}

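// Host entry point. `inp` holds blocks * (t - 1) field elements on the host
// (one arity-sized preimage per block); `out` receives one digest per block.
// The pipeline: pad/copy the inputs, apply domain separation and pre-round
// constants, run the first half of the full rounds, the partial rounds, the
// second half of the full rounds, then extract element 1 of each state.
//
// Usage sketch (illustrative only; assumes a Poseidon<S> instance `poseidon`
// of arity 2, i.e. t = 3, constructed elsewhere, and host buffers sized as
// described above):
//
//   std::vector<S> preimages(2 * blocks), digests(blocks);
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   poseidon.hash_blocks(preimages.data(), blocks, digests.data(), HashType::MerkleTree, stream);
//   cudaStreamSynchronize(stream);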
template <typename S>
__host__ cudaError_t
Poseidon<S>::hash_blocks(const S* inp, size_t blocks, S* out, HashType hash_type, cudaStream_t stream)
{
  CHK_INIT_IF_RETURN();

  S* states;

  // allocate memory for {blocks} states of {t} scalars each
  CHK_IF_RETURN(cudaMallocAsync(&states, blocks * this->t * sizeof(S), stream));

  // This is where the input matrix of size Arity x NumberOfBlocks is
  // padded and copied to device in a T x NumberOfBlocks matrix
  CHK_IF_RETURN(cudaMemcpy2DAsync(
    states, this->t * sizeof(S),       // Device pointer and device pitch
    inp, (this->t - 1) * sizeof(S),    // Host pointer and pitch
    (this->t - 1) * sizeof(S), blocks, // Size of the source matrix (Arity x NumberOfBlocks)
    cudaMemcpyHostToDevice, stream));
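
  // After this copy each device row holds the (t - 1) input elements in its
  // first slots; prepare_poseidon_states then shifts them right by one and
  // writes the domain tag into slot 0.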

  size_t rc_offset = 0;

  // One thread works on exactly one state element; each state has {t}
  // elements and there are {blocks} states in total.
  int number_of_threads = (256 / this->t) * this->t;
  int hashes_per_block = number_of_threads / this->t;
  int total_number_of_threads = blocks * this->t;
  int number_of_blocks =
    total_number_of_threads / number_of_threads + static_cast<bool>(total_number_of_threads % number_of_threads);

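  // For example, with t = 3 this gives number_of_threads = 255 (the largest
  // multiple of t not exceeding 256) and hashes_per_block = 85 states per
  // thread block.
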
  // The partial rounds operate on the whole state, so we define the
  // parallelism params for processing a single hash preimage per thread
  int singlehash_block_size = 128;
  int number_of_singlehash_blocks = blocks / singlehash_block_size + static_cast<bool>(blocks % singlehash_block_size);

  // Pick the domain_tag accordingly
  S domain_tag;
  switch (hash_type) {
  case HashType::ConstInputLen:
    domain_tag = this->const_input_no_pad_domain_tag;
    break;

  case HashType::MerkleTree:
    domain_tag = this->tree_domain_tag;
  }

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  auto start_time = std::chrono::high_resolution_clock::now();
#endif

  // Domain separation and adding pre-round constants
  prepare_poseidon_states<<<number_of_blocks, number_of_threads, 0, stream>>>(states, blocks, domain_tag, this->config);
  rc_offset += this->t;

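  // rc_offset tracks how many round constants the kernels have consumed so
  // far; it is advanced after each launch so the next kernel starts reading
  // constants at the right offset.
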
#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  std::cout << "Domain separation: " << rc_offset << std::endl;
  // print_buffer_from_cuda<S>(states, blocks * this->t);

  auto end_time = std::chrono::high_resolution_clock::now();
  auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
  start_time = std::chrono::high_resolution_clock::now();
#endif

  // Execute the first half of the full rounds
  full_rounds<<<number_of_blocks, number_of_threads, this->t * sizeof(S) * hashes_per_block, stream>>>(
    states, blocks, rc_offset, true, this->config);
  rc_offset += this->t * this->config.full_rounds_half;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  std::cout << "Full rounds 1. RCOFFSET: " << rc_offset << std::endl;
  // print_buffer_from_cuda<S>(states, blocks * this->t);

  end_time = std::chrono::high_resolution_clock::now();
  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
  start_time = std::chrono::high_resolution_clock::now();
#endif

  // Execute the partial rounds
  partial_rounds<<<number_of_singlehash_blocks, singlehash_block_size, 0, stream>>>(
    states, blocks, rc_offset, this->config);
  rc_offset += this->config.partial_rounds;

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  std::cout << "Partial rounds. RCOFFSET: " << rc_offset << std::endl;
  // print_buffer_from_cuda<S>(states, blocks * this->t);

  end_time = std::chrono::high_resolution_clock::now();
  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
  start_time = std::chrono::high_resolution_clock::now();
#endif

  // Execute the second half of the full rounds
  full_rounds<<<number_of_blocks, number_of_threads, this->t * sizeof(S) * hashes_per_block, stream>>>(
    states, blocks, rc_offset, false, this->config);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  std::cout << "Full rounds 2. RCOFFSET: " << rc_offset << std::endl;
  // print_buffer_from_cuda<S>(states, blocks * this->t);
  end_time = std::chrono::high_resolution_clock::now();
  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
  start_time = std::chrono::high_resolution_clock::now();
#endif

  // Get the output: one digest per state
  S* out_device;
  CHK_IF_RETURN(cudaMalloc(&out_device, blocks * sizeof(S)));
  get_hash_results<<<number_of_singlehash_blocks, singlehash_block_size, 0, stream>>>(
    states, blocks, out_device, this->config.t);

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaStreamSynchronize(stream));
  std::cout << "Get hash results" << std::endl;
  end_time = std::chrono::high_resolution_clock::now();
  elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
  std::cout << "Elapsed time: " << elapsed_time.count() << " ms" << std::endl;
#endif
  // Copy the digests back to the host and release device buffers
  CHK_IF_RETURN(cudaMemcpyAsync(out, out_device, blocks * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CHK_IF_RETURN(cudaFreeAsync(out_device, stream));
  CHK_IF_RETURN(cudaFreeAsync(states, stream));

#if !defined(__CUDA_ARCH__) && defined(DEBUG)
  CHK_IF_RETURN(cudaDeviceReset());
#endif
  return CHK_LAST();
}