blake2s working for single hash using Hasher

2026-01-09 15:37:58 -05:00 · 2024-08-04 16:17:09 +03:00
parent 256f8fa1d9
commit 0180185c25
5 changed files with 197 additions and 12 deletions
--- a/icicle/include/hash/blake2s/blake2s.cuh
+++ b/icicle/include/hash/blake2s/blake2s.cuh
@@ -24,6 +24,31 @@
 using namespace hash;

 namespace blake2s{
+    #define BLAKE2S_ROUNDS 10
+    #define BLAKE2S_BLOCK_LENGTH 64
+    #define BLAKE2S_CHAIN_SIZE 8
+    #define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
+    #define BLAKE2S_STATE_SIZE 16
+    #define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
+
+    class Blake2s : public Hasher<BYTE, BYTE>
+    {
+    public:
+      cudaError_t run_hash_many_kernel(
+        const BYTE* input,
+        BYTE*       output,
+        WORD        number_of_states,
+        WORD        input_len,
+        WORD        output_len,
+        const device_context::DeviceContext& ctx) const override;
+        
+  
+      Blake2s()
+          : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0)
+      {
+      }
+    };
+
 extern "C" {
 void mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE * in, WORD inlen, BYTE * out, WORD n_outbit, WORD n_batch);
 }
--- a/icicle/src/hash/blake2s/Makefile
+++ b/icicle/src/hash/blake2s/Makefile
@@ -1,2 +1,5 @@
 test_blake2s: test_blake2s.cu blake2s.cu 
-	nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
+	nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
+
+test_blake2s_hasher: test_blake2s_hasher.cu blake2s.cu 
+	nvcc -o test_blake2s_hasher -I. -I../../../include test_blake2s_hasher.cu -g
--- a/icicle/src/hash/blake2s/blake2s.cu
+++ b/icicle/src/hash/blake2s/blake2s.cu
@@ -11,12 +11,7 @@
 using namespace hash;

 namespace blake2s {
-#define BLAKE2S_ROUNDS 10
-#define BLAKE2S_BLOCK_LENGTH 64
-#define BLAKE2S_CHAIN_SIZE 8
-#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
-#define BLAKE2S_STATE_SIZE 16
-#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
+

 typedef struct {
    WORD digestlen;
@@ -82,7 +77,7 @@ __constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
    { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }
 };

-__device__ uint32_t cuda_blake2s_leuint32(BYTE *in) {
+__device__ uint32_t cuda_blake2s_leuint32(const BYTE *in) {
    uint32_t a;
    memcpy(&a, in, 4);
    return a;
@@ -122,7 +117,7 @@ __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t *ctx)
    // ctx->state[14] = BLAKE2S_IVS[7];
 }

-__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t *ctx, BYTE *in, WORD inoffset) {
+__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t *ctx, const BYTE *in, WORD inoffset) {
    cuda_blake2s_init_state(ctx);
    uint32_t m[16] = { 0 };
    for (int j = 0; j < 16; j++)
@@ -167,7 +162,7 @@ __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t *ctx, BYTE *key, WORD keyle
    ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
 }

-__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t *ctx, BYTE *in, LONG inlen) {
+__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t *ctx, const BYTE *in, LONG inlen) {
    if (inlen == 0)
        return;

@@ -225,12 +220,12 @@ __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t *ctx, BYTE *out) {
    }
 }

-__global__ void kernel_blake2s_hash(BYTE *indata, WORD inlen, BYTE *outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) {
+__global__ void kernel_blake2s_hash(const BYTE *indata, WORD inlen, BYTE *outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) {
    WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread >= n_batch) {
        return;
    }
-    BYTE *in = indata + thread * inlen;
+    const BYTE *in = indata + thread * inlen;
    BYTE *out = outdata + thread * BLAKE2S_BLOCK_SIZE;
    CUDA_BLAKE2S_CTX ctx = c_CTX;
    cuda_blake2s_update(&ctx, in, inlen);
@@ -265,4 +260,36 @@ void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, B
    cudaFree(cuda_outdata);
 }
 }
+
+cudaError_t Blake2s::run_hash_many_kernel(
+    const BYTE* input,
+    BYTE*       output,
+    WORD        number_of_states,
+    WORD        input_len,
+    WORD        output_len,
+    const device_context::DeviceContext& ctx) const
+    {
+        BYTE key[32] = "";  // Null key
+        WORD keylen = strlen((char *)key);
+        const WORD BLAKE2S_BLOCK_SIZE = (output_len >> 3);
+        CUDA_BLAKE2S_CTX blake_ctx;
+        cpu_blake2s_init(&blake_ctx, key, keylen, output_len);
+        cudaMemcpyToSymbol(c_CTX, &blake_ctx, sizeof(CUDA_BLAKE2S_CTX), 0, cudaMemcpyHostToDevice);
+    
+        WORD thread = 256;
+        WORD block = (number_of_states + thread - 1) / thread;
+
+        kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
+        // cudaDeviceSynchronize();
+        // cudaError_t error = cudaGetLastError();
+        // if (error != cudaSuccess) {
+        //     printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error));
+        // }
+
+        CHK_IF_RETURN(cudaPeekAtLastError());
+        return CHK_LAST();
+
+    }
+
+
 } // namespace blake2s
--- a/icicle/src/hash/blake2s/extern.cu
+++ b/icicle/src/hash/blake2s/extern.cu
@@ -0,0 +1,19 @@
+#include "utils/utils.h"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
+
+#include "hash/blake2s/blake2s.cuh"
+#include "blake2s.cu"
+#include "../../merkle-tree/merkle.cu"
+#include "merkle-tree/merkle.cuh"
+
+namespace blake2s {
+  extern "C" cudaError_t
+  blake2s_cuda(BYTE * input, BYTE * output,  WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config)
+  {
+    return Blake2s().hash_many(
+      input, output, number_of_blocks, input_block_size, output_block_size, config);
+  }
+
+} // namespace blake2s
--- a/icicle/src/hash/blake2s/test_blake2s_hasher.cu
+++ b/icicle/src/hash/blake2s/test_blake2s_hasher.cu
@@ -0,0 +1,111 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include "extern.cu"
+
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+extern "C" {
+void mcm_cuda_blake2s_hash_batch(BYTE *key, WORD keylen, BYTE *in, WORD inlen, BYTE *out, WORD n_outbit, WORD n_batch);
+}
+
+void print_hash(BYTE *hash, WORD len) {
+    printf("Hash Len: %d \n", len);
+    printf("BLAKE2S hash:\n");
+    for (WORD i = 0; i < len; i++) {
+        printf("%02x", hash[i]);
+    }
+    printf("\n");
+}
+
+BYTE *read_file(const char *filename, size_t *filesize) {
+    FILE *file = fopen(filename, "rb");
+    if (!file) {
+        perror("Failed to open file");
+        exit(EXIT_FAILURE);
+    }
+
+    fseek(file, 0, SEEK_END);
+    *filesize = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    BYTE *buffer = (BYTE *)malloc(*filesize);
+    if (!buffer) {
+        perror("Failed to allocate memory");
+        fclose(file);
+        exit(EXIT_FAILURE);
+    }
+
+    size_t bytesRead = fread(buffer, 1, *filesize, file);
+    if (bytesRead != *filesize) {
+        perror("Failed to read file");
+        free(buffer);
+        fclose(file);
+        exit(EXIT_FAILURE);
+    }
+
+    fclose(file);
+    return buffer;
+}
+
+int main(int argc, char **argv) {
+    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+    using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+    BYTE *input;
+    size_t inlen;
+    const char *input_filename;
+    const char *default_input = "aaaaaaaaaaa";
+
+    if (argc < 2) {
+        // Use default input if no file is provided
+        input = (BYTE *)default_input;
+        inlen = strlen(default_input);
+    } else {
+        input_filename = argv[1];
+        input = read_file(input_filename, &inlen);
+    }
+
+    // Test parameters
+    BYTE key[32] = "";  // Example key
+    WORD keylen = strlen((char *)key);
+    WORD n_outbit = 256;  // Output length in bits
+    WORD n_batch = 1;  // Number of hashes to compute in parallel
+
+    // Allocate memory for the output
+    WORD outlen = n_outbit / 8;
+    BYTE *output = (BYTE *)malloc(outlen * n_batch);
+    if (!output) {
+        perror("Failed to allocate memory for output");
+        if (argc >= 2) free(input);  // Free file buffer if it was allocated
+        return EXIT_FAILURE;
+    }
+
+    printf("Key len: %d \n", keylen);
+    
+    // Perform the hashing
+    START_TIMER(blake_timer)
+    HashConfig config = default_hash_config();
+
+    blake2s_cuda(input, output, n_batch, inlen, n_outbit, config);
+    END_TIMER(blake_timer, "Blake Timer")
+    
+    // Print the result
+    print_hash(output, outlen);
+
+    // Clean up
+    free(output);
+    if (argc >= 2) free(input);  // Free file buffer if it was allocated
+    return 0;
+}
+