mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-10 07:57:56 -05:00
Add Keccak hash function (#435)
This PR adds support for Keccak-256 and Keccak-512. It only adds them in c++. There is no way of adding rust or golang wrappers rn as it requires having an `icicle-common` create / mod
This commit is contained in:
2
icicle/appUtils/keccak/Makefile
Normal file
2
icicle/appUtils/keccak/Makefile
Normal file
@@ -0,0 +1,2 @@
|
||||
test_keccak: test.cu keccak.cu
|
||||
nvcc -o test_keccak -I. -I../.. test.cu
|
||||
275
icicle/appUtils/keccak/keccak.cu
Normal file
275
icicle/appUtils/keccak/keccak.cu
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "keccak.cuh"
|
||||
|
||||
namespace keccak {
|
||||
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
|
||||
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
|
||||
{ \
|
||||
t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
|
||||
}
|
||||
|
||||
#define THETA( \
|
||||
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
|
||||
s43, s44) \
|
||||
{ \
|
||||
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
|
||||
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
|
||||
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
|
||||
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
|
||||
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
|
||||
s00 ^= t0; \
|
||||
s01 ^= t0; \
|
||||
s02 ^= t0; \
|
||||
s03 ^= t0; \
|
||||
s04 ^= t0; \
|
||||
\
|
||||
s10 ^= t1; \
|
||||
s11 ^= t1; \
|
||||
s12 ^= t1; \
|
||||
s13 ^= t1; \
|
||||
s14 ^= t1; \
|
||||
\
|
||||
s20 ^= t2; \
|
||||
s21 ^= t2; \
|
||||
s22 ^= t2; \
|
||||
s23 ^= t2; \
|
||||
s24 ^= t2; \
|
||||
\
|
||||
s30 ^= t3; \
|
||||
s31 ^= t3; \
|
||||
s32 ^= t3; \
|
||||
s33 ^= t3; \
|
||||
s34 ^= t3; \
|
||||
\
|
||||
s40 ^= t4; \
|
||||
s41 ^= t4; \
|
||||
s42 ^= t4; \
|
||||
s43 ^= t4; \
|
||||
s44 ^= t4; \
|
||||
}
|
||||
|
||||
#define RHOPI( \
|
||||
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
|
||||
s43, s44) \
|
||||
{ \
|
||||
t0 = ROTL64(s10, (uint64_t)1); \
|
||||
s10 = ROTL64(s11, (uint64_t)44); \
|
||||
s11 = ROTL64(s41, (uint64_t)20); \
|
||||
s41 = ROTL64(s24, (uint64_t)61); \
|
||||
s24 = ROTL64(s42, (uint64_t)39); \
|
||||
s42 = ROTL64(s04, (uint64_t)18); \
|
||||
s04 = ROTL64(s20, (uint64_t)62); \
|
||||
s20 = ROTL64(s22, (uint64_t)43); \
|
||||
s22 = ROTL64(s32, (uint64_t)25); \
|
||||
s32 = ROTL64(s43, (uint64_t)8); \
|
||||
s43 = ROTL64(s34, (uint64_t)56); \
|
||||
s34 = ROTL64(s03, (uint64_t)41); \
|
||||
s03 = ROTL64(s40, (uint64_t)27); \
|
||||
s40 = ROTL64(s44, (uint64_t)14); \
|
||||
s44 = ROTL64(s14, (uint64_t)2); \
|
||||
s14 = ROTL64(s31, (uint64_t)55); \
|
||||
s31 = ROTL64(s13, (uint64_t)45); \
|
||||
s13 = ROTL64(s01, (uint64_t)36); \
|
||||
s01 = ROTL64(s30, (uint64_t)28); \
|
||||
s30 = ROTL64(s33, (uint64_t)21); \
|
||||
s33 = ROTL64(s23, (uint64_t)15); \
|
||||
s23 = ROTL64(s12, (uint64_t)10); \
|
||||
s12 = ROTL64(s21, (uint64_t)6); \
|
||||
s21 = ROTL64(s02, (uint64_t)3); \
|
||||
s02 = t0; \
|
||||
}
|
||||
|
||||
#define KHI( \
|
||||
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
|
||||
s43, s44) \
|
||||
{ \
|
||||
t0 = s00 ^ (~s10 & s20); \
|
||||
t1 = s10 ^ (~s20 & s30); \
|
||||
t2 = s20 ^ (~s30 & s40); \
|
||||
t3 = s30 ^ (~s40 & s00); \
|
||||
t4 = s40 ^ (~s00 & s10); \
|
||||
s00 = t0; \
|
||||
s10 = t1; \
|
||||
s20 = t2; \
|
||||
s30 = t3; \
|
||||
s40 = t4; \
|
||||
\
|
||||
t0 = s01 ^ (~s11 & s21); \
|
||||
t1 = s11 ^ (~s21 & s31); \
|
||||
t2 = s21 ^ (~s31 & s41); \
|
||||
t3 = s31 ^ (~s41 & s01); \
|
||||
t4 = s41 ^ (~s01 & s11); \
|
||||
s01 = t0; \
|
||||
s11 = t1; \
|
||||
s21 = t2; \
|
||||
s31 = t3; \
|
||||
s41 = t4; \
|
||||
\
|
||||
t0 = s02 ^ (~s12 & s22); \
|
||||
t1 = s12 ^ (~s22 & s32); \
|
||||
t2 = s22 ^ (~s32 & s42); \
|
||||
t3 = s32 ^ (~s42 & s02); \
|
||||
t4 = s42 ^ (~s02 & s12); \
|
||||
s02 = t0; \
|
||||
s12 = t1; \
|
||||
s22 = t2; \
|
||||
s32 = t3; \
|
||||
s42 = t4; \
|
||||
\
|
||||
t0 = s03 ^ (~s13 & s23); \
|
||||
t1 = s13 ^ (~s23 & s33); \
|
||||
t2 = s23 ^ (~s33 & s43); \
|
||||
t3 = s33 ^ (~s43 & s03); \
|
||||
t4 = s43 ^ (~s03 & s13); \
|
||||
s03 = t0; \
|
||||
s13 = t1; \
|
||||
s23 = t2; \
|
||||
s33 = t3; \
|
||||
s43 = t4; \
|
||||
\
|
||||
t0 = s04 ^ (~s14 & s24); \
|
||||
t1 = s14 ^ (~s24 & s34); \
|
||||
t2 = s24 ^ (~s34 & s44); \
|
||||
t3 = s34 ^ (~s44 & s04); \
|
||||
t4 = s44 ^ (~s04 & s14); \
|
||||
s04 = t0; \
|
||||
s14 = t1; \
|
||||
s24 = t2; \
|
||||
s34 = t3; \
|
||||
s44 = t4; \
|
||||
}
|
||||
|
||||
#define IOTA(element, rc) \
|
||||
{ \
|
||||
element ^= rc; \
|
||||
}
|
||||
|
||||
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
|
||||
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
|
||||
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
|
||||
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
|
||||
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
|
||||
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
|
||||
|
||||
__device__ void keccakf(uint64_t s[25])
|
||||
{
|
||||
uint64_t t0, t1, t2, t3, t4;
|
||||
|
||||
for (int i = 0; i < 24; i++) {
|
||||
THETA(
|
||||
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
|
||||
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
|
||||
RHOPI(
|
||||
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
|
||||
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
|
||||
KHI(
|
||||
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
|
||||
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
|
||||
IOTA(s[0], RC[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <int C, int D>
|
||||
__global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
|
||||
{
|
||||
int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (bid >= number_of_blocks) { return; }
|
||||
|
||||
const int r_bits = 1600 - C;
|
||||
const int r_bytes = r_bits / 8;
|
||||
const int d_bytes = D / 8;
|
||||
|
||||
uint8_t* b_input = input + bid * input_block_size;
|
||||
uint8_t* b_output = output + bid * d_bytes;
|
||||
uint64_t state[25] = {}; // Initialize with zeroes
|
||||
|
||||
int input_len = input_block_size;
|
||||
|
||||
// absorb
|
||||
while (input_len >= r_bytes) {
|
||||
// #pragma unroll
|
||||
for (int i = 0; i < r_bytes; i += 8) {
|
||||
state[i / 8] ^= *(uint64_t*)(b_input + i);
|
||||
}
|
||||
keccakf(state);
|
||||
b_input += r_bytes;
|
||||
input_len -= r_bytes;
|
||||
}
|
||||
|
||||
// last block (if any)
|
||||
uint8_t last_block[r_bytes];
|
||||
for (int i = 0; i < input_len; i++) {
|
||||
last_block[i] = b_input[i];
|
||||
}
|
||||
|
||||
// pad 10*1
|
||||
last_block[input_len] = 1;
|
||||
for (int i = 0; i < r_bytes - input_len - 1; i++) {
|
||||
last_block[input_len + i + 1] = 0;
|
||||
}
|
||||
// last bit
|
||||
last_block[r_bytes - 1] |= 0x80;
|
||||
|
||||
// #pragma unroll
|
||||
for (int i = 0; i < r_bytes; i += 8) {
|
||||
state[i / 8] ^= *(uint64_t*)(last_block + i);
|
||||
}
|
||||
keccakf(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < d_bytes; i += 8) {
|
||||
*(uint64_t*)(b_output + i) = state[i / 8];
|
||||
}
|
||||
}
|
||||
|
||||
template <int C, int D>
|
||||
cudaError_t
|
||||
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
|
||||
{
|
||||
CHK_INIT_IF_RETURN();
|
||||
cudaStream_t& stream = config.ctx.stream;
|
||||
|
||||
uint8_t* input_device;
|
||||
if (config.are_inputs_on_device) {
|
||||
input_device = input;
|
||||
} else {
|
||||
CHK_IF_RETURN(cudaMallocAsync(&input_device, number_of_blocks * input_block_size, stream));
|
||||
CHK_IF_RETURN(
|
||||
cudaMemcpyAsync(input_device, input, number_of_blocks * input_block_size, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
uint8_t* output_device;
|
||||
if (config.are_outputs_on_device) {
|
||||
output_device = output;
|
||||
} else {
|
||||
CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_blocks * (D / 8), stream));
|
||||
}
|
||||
|
||||
int number_of_threads = 1024;
|
||||
int number_of_gpu_blocks = (number_of_blocks - 1) / number_of_threads + 1;
|
||||
keccak_hash_blocks<C, D><<<number_of_gpu_blocks, number_of_threads, 0, stream>>>(
|
||||
input_device, input_block_size, number_of_blocks, output_device);
|
||||
|
||||
if (!config.are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(input_device, stream));
|
||||
|
||||
if (!config.are_outputs_on_device) {
|
||||
CHK_IF_RETURN(cudaMemcpyAsync(output, output_device, number_of_blocks * (D / 8), cudaMemcpyDeviceToHost, stream));
|
||||
CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
|
||||
}
|
||||
|
||||
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
|
||||
return CHK_LAST();
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
Keccak256(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
|
||||
{
|
||||
return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
|
||||
}
|
||||
|
||||
extern "C" cudaError_t
|
||||
Keccak512(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
|
||||
{
|
||||
return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
|
||||
}
|
||||
} // namespace keccak
|
||||
56
icicle/appUtils/keccak/keccak.cuh
Normal file
56
icicle/appUtils/keccak/keccak.cuh
Normal file
@@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
#ifndef KECCAK_H
|
||||
#define KECCAK_H
|
||||
|
||||
#include <cstdint>
|
||||
#include "utils/device_context.cuh"
|
||||
#include "utils/error_handler.cuh"
|
||||
|
||||
namespace keccak {
|
||||
/**
|
||||
* @struct KeccakConfig
|
||||
* Struct that encodes various Keccak parameters.
|
||||
*/
|
||||
struct KeccakConfig {
|
||||
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
|
||||
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
|
||||
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
|
||||
bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
|
||||
* non-blocking and you'd need to synchronize it explicitly by running
|
||||
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
|
||||
* function will block the current CPU thread. */
|
||||
};
|
||||
|
||||
KeccakConfig default_keccak_config()
|
||||
{
|
||||
device_context::DeviceContext ctx = device_context::get_default_device_context();
|
||||
KeccakConfig config = {
|
||||
ctx, // ctx
|
||||
false, // are_inputes_on_device
|
||||
false, // are_outputs_on_device
|
||||
false, // is_async
|
||||
};
|
||||
return config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the keccak hash over a sequence of preimages.
|
||||
* Takes {number_of_blocks * input_block_size} u64s of input and computes {number_of_blocks} outputs, each of size {D
|
||||
* / 64} u64
|
||||
* @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
|
||||
* @tparam D - number of bits of output. Only multiples of 64 are supported.
|
||||
* @param input a pointer to the input data. May be allocated on device or on host, regulated
|
||||
* by the config. Must be of size [input_block_size](@ref input_block_size) * [number_of_blocks](@ref
|
||||
* number_of_blocks)}.
|
||||
* @param input_block_size - size of each input block in bytes. Should be divisible by 8.
|
||||
* @param number_of_blocks number of input and output blocks. One GPU thread processes one block
|
||||
* @param output a pointer to the output data. May be allocated on device or on host, regulated
|
||||
* by the config. Must be of size [output_block_size](@ref output_block_size) * [number_of_blocks](@ref
|
||||
* number_of_blocks)}
|
||||
*/
|
||||
template <int C, int D>
|
||||
cudaError_t
|
||||
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
|
||||
} // namespace keccak
|
||||
|
||||
#endif
|
||||
67
icicle/appUtils/keccak/test.cu
Normal file
67
icicle/appUtils/keccak/test.cu
Normal file
@@ -0,0 +1,67 @@
|
||||
#include "utils/device_context.cuh"
|
||||
#include "keccak.cu"
|
||||
|
||||
// #define DEBUG
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
using namespace keccak;
|
||||
|
||||
#define D 256
|
||||
|
||||
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
|
||||
#define END_TIMER(timer, msg) \
|
||||
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
|
||||
|
||||
void uint8ToHexString(const uint8_t* values, int size)
|
||||
{
|
||||
std::stringstream ss;
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
ss << std::hex << std::setw(2) << std::setfill('0') << (int)values[i];
|
||||
}
|
||||
|
||||
std::string hexString = ss.str();
|
||||
std::cout << hexString << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
|
||||
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
|
||||
|
||||
START_TIMER(allocation_timer);
|
||||
// Prepare input data of [0, 1, 2 ... (number_of_blocks * input_block_size) - 1]
|
||||
int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
|
||||
int input_block_size = argc > 2 ? atoi(argv[2]) : 136;
|
||||
|
||||
uint8_t* in_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * input_block_size));
|
||||
for (uint64_t i = 0; i < number_of_blocks * input_block_size; i++) {
|
||||
in_ptr[i] = (uint8_t)i;
|
||||
}
|
||||
|
||||
END_TIMER(allocation_timer, "Allocate mem and fill input");
|
||||
|
||||
uint8_t* out_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * (D / 8)));
|
||||
|
||||
START_TIMER(keccak_timer);
|
||||
KeccakConfig config = default_keccak_config();
|
||||
Keccak256(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
|
||||
END_TIMER(keccak_timer, "Keccak")
|
||||
|
||||
for (int i = 0; i < number_of_blocks; i++) {
|
||||
#ifdef DEBUG
|
||||
uint8ToHexString(out_ptr + i * (D / 8), D / 8);
|
||||
#endif
|
||||
}
|
||||
|
||||
free(in_ptr);
|
||||
free(out_ptr);
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user