blake2s working for single hash using Hasher

This commit is contained in:
aviadingo
2024-08-04 16:17:09 +03:00
parent 256f8fa1d9
commit 0180185c25
5 changed files with 197 additions and 12 deletions

View File

@@ -24,6 +24,31 @@
using namespace hash;
namespace blake2s{
#define BLAKE2S_ROUNDS 10
#define BLAKE2S_BLOCK_LENGTH 64
#define BLAKE2S_CHAIN_SIZE 8
#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
#define BLAKE2S_STATE_SIZE 16
#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
class Blake2s : public Hasher<BYTE, BYTE>
{
public:
cudaError_t run_hash_many_kernel(
const BYTE* input,
BYTE* output,
WORD number_of_states,
WORD input_len,
WORD output_len,
const device_context::DeviceContext& ctx) const override;
Blake2s()
: Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0)
{
}
};
extern "C" {
void mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch);
}

View File

@@ -1,2 +1,5 @@
test_blake2s: test_blake2s.cu blake2s.cu
nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
test_blake2s_hasher: test_blake2s_hasher.cu blake2s.cu
nvcc -o test_blake2s_hasher -I. -I../../../include test_blake2s_hasher.cu -g

View File

@@ -11,12 +11,7 @@
using namespace hash;
namespace blake2s {
#define BLAKE2S_ROUNDS 10
#define BLAKE2S_BLOCK_LENGTH 64
#define BLAKE2S_CHAIN_SIZE 8
#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
#define BLAKE2S_STATE_SIZE 16
#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
typedef struct {
WORD digestlen;
@@ -82,7 +77,7 @@ __constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }
};
__device__ uint32_t cuda_blake2s_leuint32(BYTE *in) {
__device__ uint32_t cuda_blake2s_leuint32(const BYTE *in) {
uint32_t a;
memcpy(&a, in, 4);
return a;
@@ -122,7 +117,7 @@ __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t *ctx)
// ctx->state[14] = BLAKE2S_IVS[7];
}
__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t *ctx, BYTE *in, WORD inoffset) {
__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t *ctx, const BYTE *in, WORD inoffset) {
cuda_blake2s_init_state(ctx);
uint32_t m[16] = { 0 };
for (int j = 0; j < 16; j++)
@@ -167,7 +162,7 @@ __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keyle
ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
}
__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t *ctx, BYTE *in, LONG inlen) {
__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t *ctx, const BYTE *in, LONG inlen) {
if (inlen == 0)
return;
@@ -225,12 +220,12 @@ __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t *ctx, BYTE *out) {
}
}
__global__ void kernel_blake2s_hash(BYTE *indata, WORD inlen, BYTE *outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) {
__global__ void kernel_blake2s_hash(const BYTE *indata, WORD inlen, BYTE *outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) {
WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
if (thread >= n_batch) {
return;
}
BYTE *in = indata + thread * inlen;
const BYTE *in = indata + thread * inlen;
BYTE *out = outdata + thread * BLAKE2S_BLOCK_SIZE;
CUDA_BLAKE2S_CTX ctx = c_CTX;
cuda_blake2s_update(&ctx, in, inlen);
@@ -265,4 +260,36 @@ void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, B
cudaFree(cuda_outdata);
}
}
cudaError_t Blake2s::run_hash_many_kernel(
const BYTE* input,
BYTE* output,
WORD number_of_states,
WORD input_len,
WORD output_len,
const device_context::DeviceContext& ctx) const
{
BYTE key[32] = ""; // Null key
WORD keylen = strlen((char *)key);
const WORD BLAKE2S_BLOCK_SIZE = (output_len >> 3);
CUDA_BLAKE2S_CTX blake_ctx;
cpu_blake2s_init(&blake_ctx, key, keylen, output_len);
cudaMemcpyToSymbol(c_CTX, &blake_ctx, sizeof(CUDA_BLAKE2S_CTX), 0, cudaMemcpyHostToDevice);
WORD thread = 256;
WORD block = (number_of_states + thread - 1) / thread;
kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
// cudaDeviceSynchronize();
// cudaError_t error = cudaGetLastError();
// if (error != cudaSuccess) {
// printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error));
// }
CHK_IF_RETURN(cudaPeekAtLastError());
return CHK_LAST();
}
} // namespace blake2s

View File

@@ -0,0 +1,19 @@
#include "utils/utils.h"
#include "gpu-utils/error_handler.cuh"
#include "hash/hash.cuh"
#include "hash/blake2s/blake2s.cuh"
#include "blake2s.cu"
#include "../../merkle-tree/merkle.cu"
#include "merkle-tree/merkle.cuh"
namespace blake2s {
extern "C" cudaError_t
blake2s_cuda(BYTE * input, BYTE * output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config)
{
return Blake2s().hash_many(
input, output, number_of_blocks, input_block_size, output_block_size, config);
}
} // namespace blake2s

View File

@@ -0,0 +1,111 @@
#include <chrono>
#include "gpu-utils/device_context.cuh"
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iomanip>
#include "extern.cu"
using namespace blake2s;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) \
printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
extern "C" {
void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch);
}
void print_hash(BYTE *hash, WORD len) {
printf("Hash Len: %d \n", len);
printf("BLAKE2S hash:\n");
for (WORD i = 0; i < len; i++) {
printf("%02x", hash[i]);
}
printf("\n");
}
BYTE *read_file(const char *filename, size_t *filesize) {
FILE *file = fopen(filename, "rb");
if (!file) {
perror("Failed to open file");
exit(EXIT_FAILURE);
}
fseek(file, 0, SEEK_END);
*filesize = ftell(file);
fseek(file, 0, SEEK_SET);
BYTE *buffer = (BYTE *)malloc(*filesize);
if (!buffer) {
perror("Failed to allocate memory");
fclose(file);
exit(EXIT_FAILURE);
}
size_t bytesRead = fread(buffer, 1, *filesize, file);
if (bytesRead != *filesize) {
perror("Failed to read file");
free(buffer);
fclose(file);
exit(EXIT_FAILURE);
}
fclose(file);
return buffer;
}
int main(int argc, char **argv) {
using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
BYTE *input;
size_t inlen;
const char *input_filename;
const char *default_input = "aaaaaaaaaaa";
if (argc < 2) {
// Use default input if no file is provided
input = (BYTE *)default_input;
inlen = strlen(default_input);
} else {
input_filename = argv[1];
input = read_file(input_filename, &inlen);
}
// Test parameters
BYTE key[32] = ""; // Example key
WORD keylen = strlen((char *)key);
WORD n_outbit = 256; // Output length in bits
WORD n_batch = 1; // Number of hashes to compute in parallel
// Allocate memory for the output
WORD outlen = n_outbit / 8;
BYTE *output = (BYTE *)malloc(outlen * n_batch);
if (!output) {
perror("Failed to allocate memory for output");
if (argc >= 2) free(input); // Free file buffer if it was allocated
return EXIT_FAILURE;
}
printf("Key len: %d \n", keylen);
// Perform the hashing
START_TIMER(blake_timer)
HashConfig config = default_hash_config();
blake2s_cuda(input, output, n_batch, inlen, n_outbit, config);
END_TIMER(blake_timer, "Blake Timer")
// Print the result
print_hash(output, outlen);
// Clean up
free(output);
if (argc >= 2) free(input); // Free file buffer if it was allocated
return 0;
}