Add Keccak hash function (#435)

This PR adds support for Keccak-256 and Keccak-512. It only adds them in
c++. There is no way of adding rust or golang wrappers rn as it requires
having an `icicle-common` create / mod
This commit is contained in:
ChickenLover
2024-03-21 03:30:19 +07:00
committed by GitHub
parent 7293058246
commit d4f39efea3
4 changed files with 400 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
test_keccak: test.cu keccak.cu
nvcc -o test_keccak -I. -I../.. test.cu

View File

@@ -0,0 +1,275 @@
#include "keccak.cuh"
namespace keccak {
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = ROTL64((d0 ^ d1 ^ d2 ^ d3 ^ d4), 1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}
#define THETA( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; \
s01 ^= t0; \
s02 ^= t0; \
s03 ^= t0; \
s04 ^= t0; \
\
s10 ^= t1; \
s11 ^= t1; \
s12 ^= t1; \
s13 ^= t1; \
s14 ^= t1; \
\
s20 ^= t2; \
s21 ^= t2; \
s22 ^= t2; \
s23 ^= t2; \
s24 ^= t2; \
\
s30 ^= t3; \
s31 ^= t3; \
s32 ^= t3; \
s33 ^= t3; \
s34 ^= t3; \
\
s40 ^= t4; \
s41 ^= t4; \
s42 ^= t4; \
s43 ^= t4; \
s44 ^= t4; \
}
#define RHOPI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = ROTL64(s10, (uint64_t)1); \
s10 = ROTL64(s11, (uint64_t)44); \
s11 = ROTL64(s41, (uint64_t)20); \
s41 = ROTL64(s24, (uint64_t)61); \
s24 = ROTL64(s42, (uint64_t)39); \
s42 = ROTL64(s04, (uint64_t)18); \
s04 = ROTL64(s20, (uint64_t)62); \
s20 = ROTL64(s22, (uint64_t)43); \
s22 = ROTL64(s32, (uint64_t)25); \
s32 = ROTL64(s43, (uint64_t)8); \
s43 = ROTL64(s34, (uint64_t)56); \
s34 = ROTL64(s03, (uint64_t)41); \
s03 = ROTL64(s40, (uint64_t)27); \
s40 = ROTL64(s44, (uint64_t)14); \
s44 = ROTL64(s14, (uint64_t)2); \
s14 = ROTL64(s31, (uint64_t)55); \
s31 = ROTL64(s13, (uint64_t)45); \
s13 = ROTL64(s01, (uint64_t)36); \
s01 = ROTL64(s30, (uint64_t)28); \
s30 = ROTL64(s33, (uint64_t)21); \
s33 = ROTL64(s23, (uint64_t)15); \
s23 = ROTL64(s12, (uint64_t)10); \
s12 = ROTL64(s21, (uint64_t)6); \
s21 = ROTL64(s02, (uint64_t)3); \
s02 = t0; \
}
#define KHI( \
s00, s01, s02, s03, s04, s10, s11, s12, s13, s14, s20, s21, s22, s23, s24, s30, s31, s32, s33, s34, s40, s41, s42, \
s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; \
s10 = t1; \
s20 = t2; \
s30 = t3; \
s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; \
s11 = t1; \
s21 = t2; \
s31 = t3; \
s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; \
s12 = t1; \
s22 = t2; \
s32 = t3; \
s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; \
s13 = t1; \
s23 = t2; \
s33 = t3; \
s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; \
s14 = t1; \
s24 = t2; \
s34 = t3; \
s44 = t4; \
}
#define IOTA(element, rc) \
{ \
element ^= rc; \
}
__device__ const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
__device__ void keccakf(uint64_t s[25])
{
uint64_t t0, t1, t2, t3, t4;
for (int i = 0; i < 24; i++) {
THETA(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
RHOPI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
KHI(
s[0], s[5], s[10], s[15], s[20], s[1], s[6], s[11], s[16], s[21], s[2], s[7], s[12], s[17], s[22], s[3], s[8],
s[13], s[18], s[23], s[4], s[9], s[14], s[19], s[24]);
IOTA(s[0], RC[i]);
}
}
template <int C, int D>
__global__ void keccak_hash_blocks(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output)
{
int bid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (bid >= number_of_blocks) { return; }
const int r_bits = 1600 - C;
const int r_bytes = r_bits / 8;
const int d_bytes = D / 8;
uint8_t* b_input = input + bid * input_block_size;
uint8_t* b_output = output + bid * d_bytes;
uint64_t state[25] = {}; // Initialize with zeroes
int input_len = input_block_size;
// absorb
while (input_len >= r_bytes) {
// #pragma unroll
for (int i = 0; i < r_bytes; i += 8) {
state[i / 8] ^= *(uint64_t*)(b_input + i);
}
keccakf(state);
b_input += r_bytes;
input_len -= r_bytes;
}
// last block (if any)
uint8_t last_block[r_bytes];
for (int i = 0; i < input_len; i++) {
last_block[i] = b_input[i];
}
// pad 10*1
last_block[input_len] = 1;
for (int i = 0; i < r_bytes - input_len - 1; i++) {
last_block[input_len + i + 1] = 0;
}
// last bit
last_block[r_bytes - 1] |= 0x80;
// #pragma unroll
for (int i = 0; i < r_bytes; i += 8) {
state[i / 8] ^= *(uint64_t*)(last_block + i);
}
keccakf(state);
#pragma unroll
for (int i = 0; i < d_bytes; i += 8) {
*(uint64_t*)(b_output + i) = state[i / 8];
}
}
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
CHK_INIT_IF_RETURN();
cudaStream_t& stream = config.ctx.stream;
uint8_t* input_device;
if (config.are_inputs_on_device) {
input_device = input;
} else {
CHK_IF_RETURN(cudaMallocAsync(&input_device, number_of_blocks * input_block_size, stream));
CHK_IF_RETURN(
cudaMemcpyAsync(input_device, input, number_of_blocks * input_block_size, cudaMemcpyHostToDevice, stream));
}
uint8_t* output_device;
if (config.are_outputs_on_device) {
output_device = output;
} else {
CHK_IF_RETURN(cudaMallocAsync(&output_device, number_of_blocks * (D / 8), stream));
}
int number_of_threads = 1024;
int number_of_gpu_blocks = (number_of_blocks - 1) / number_of_threads + 1;
keccak_hash_blocks<C, D><<<number_of_gpu_blocks, number_of_threads, 0, stream>>>(
input_device, input_block_size, number_of_blocks, output_device);
if (!config.are_inputs_on_device) CHK_IF_RETURN(cudaFreeAsync(input_device, stream));
if (!config.are_outputs_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(output, output_device, number_of_blocks * (D / 8), cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(output_device, stream));
}
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(stream));
return CHK_LAST();
}
extern "C" cudaError_t
Keccak256(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
return keccak_hash<512, 256>(input, input_block_size, number_of_blocks, output, config);
}
extern "C" cudaError_t
Keccak512(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config)
{
return keccak_hash<1024, 512>(input, input_block_size, number_of_blocks, output, config);
}
} // namespace keccak

View File

@@ -0,0 +1,56 @@
#pragma once
#ifndef KECCAK_H
#define KECCAK_H
#include <cstdint>
#include "utils/device_context.cuh"
#include "utils/error_handler.cuh"
namespace keccak {
/**
* @struct KeccakConfig
* Struct that encodes various Keccak parameters.
*/
struct KeccakConfig {
device_context::DeviceContext ctx; /**< Details related to the device such as its id and stream id. */
bool are_inputs_on_device; /**< True if inputs are on device and false if they're on host. Default value: false. */
bool are_outputs_on_device; /**< If true, output is preserved on device, otherwise on host. Default value: false. */
bool is_async; /**< Whether to run the Keccak asynchronously. If set to `true`, the keccak_hash function will be
* non-blocking and you'd need to synchronize it explicitly by running
* `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, keccak_hash
* function will block the current CPU thread. */
};
KeccakConfig default_keccak_config()
{
device_context::DeviceContext ctx = device_context::get_default_device_context();
KeccakConfig config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // is_async
};
return config;
}
/**
* Compute the keccak hash over a sequence of preimages.
* Takes {number_of_blocks * input_block_size} u64s of input and computes {number_of_blocks} outputs, each of size {D
* / 64} u64
* @tparam C - number of bits of capacity (c = b - r = 1600 - r). Only multiples of 64 are supported.
* @tparam D - number of bits of output. Only multiples of 64 are supported.
* @param input a pointer to the input data. May be allocated on device or on host, regulated
* by the config. Must be of size [input_block_size](@ref input_block_size) * [number_of_blocks](@ref
* number_of_blocks)}.
* @param input_block_size - size of each input block in bytes. Should be divisible by 8.
* @param number_of_blocks number of input and output blocks. One GPU thread processes one block
* @param output a pointer to the output data. May be allocated on device or on host, regulated
* by the config. Must be of size [output_block_size](@ref output_block_size) * [number_of_blocks](@ref
* number_of_blocks)}
*/
template <int C, int D>
cudaError_t
keccak_hash(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, KeccakConfig config);
} // namespace keccak
#endif

View File

@@ -0,0 +1,67 @@
#include "utils/device_context.cuh"
#include "keccak.cu"
// #define DEBUG
#ifndef __CUDA_ARCH__
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iomanip>
using namespace keccak;
#define D 256
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
void uint8ToHexString(const uint8_t* values, int size)
{
std::stringstream ss;
for (int i = 0; i < size; ++i) {
ss << std::hex << std::setw(2) << std::setfill('0') << (int)values[i];
}
std::string hexString = ss.str();
std::cout << hexString << std::endl;
}
int main(int argc, char* argv[])
{
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
START_TIMER(allocation_timer);
// Prepare input data of [0, 1, 2 ... (number_of_blocks * input_block_size) - 1]
int number_of_blocks = argc > 1 ? 1 << atoi(argv[1]) : 1024;
int input_block_size = argc > 2 ? atoi(argv[2]) : 136;
uint8_t* in_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * input_block_size));
for (uint64_t i = 0; i < number_of_blocks * input_block_size; i++) {
in_ptr[i] = (uint8_t)i;
}
END_TIMER(allocation_timer, "Allocate mem and fill input");
uint8_t* out_ptr = static_cast<uint8_t*>(malloc(number_of_blocks * (D / 8)));
START_TIMER(keccak_timer);
KeccakConfig config = default_keccak_config();
Keccak256(in_ptr, input_block_size, number_of_blocks, out_ptr, config);
END_TIMER(keccak_timer, "Keccak")
for (int i = 0; i < number_of_blocks; i++) {
#ifdef DEBUG
uint8ToHexString(out_ptr + i * (D / 8), D / 8);
#endif
}
free(in_ptr);
free(out_ptr);
}
#endif