feat(cuda): implement lwe ciphertext / plaintext add in concrete-cuda

Agnes Leroy
2022-11-07 16:00:46 +01:00
committed by Agnès Leroy
parent b560ae6a72
commit 13e77b2d8c
4 changed files with 76 additions and 4 deletions


@@ -24,3 +24,25 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
      static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
      input_lwe_ciphertext_count);
}

// Adds a vector of 32-bit plaintexts to the bodies of a vector of 32-bit
// LWE ciphertexts
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_array_in, void *plaintext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
  host_addition_plaintext(v_stream, gpu_index,
                          static_cast<uint32_t *>(lwe_array_out),
                          static_cast<uint32_t *>(lwe_array_in),
                          static_cast<uint32_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}

// 64-bit variant of the entry point above
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *v_stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_array_in, void *plaintext_array_in,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
  host_addition_plaintext(v_stream, gpu_index,
                          static_cast<uint64_t *>(lwe_array_out),
                          static_cast<uint64_t *>(lwe_array_in),
                          static_cast<uint64_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
}
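
For context, here is a minimal host-side sketch of how the new 64-bit entry point might be driven. This is not part of the commit: the declaration is assumed to come from linear_algebra.h (the header included in the next file), and the device buffers are assumed to be already allocated and populated.

#include <cstdint>
#include <cuda_runtime.h>
#include "linear_algebra.h"

// Hypothetical driver: each LWE ciphertext is assumed to be stored as
// lwe_dim mask coefficients followed by one body, i.e. (lwe_dim + 1)
// uint64_t words per ciphertext.
void example_add_plaintexts(uint64_t *d_lwe_out, uint64_t *d_lwe_in,
                            uint64_t *d_plaintexts, uint32_t lwe_dim,
                            uint32_t ct_count) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // v_stream is a void * pointing at a cudaStream_t, per the wrappers above
  cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
      &stream, /*gpu_index=*/0, d_lwe_out, d_lwe_in, d_plaintexts, lwe_dim,
      ct_count);
  // host_addition_plaintext synchronizes the stream before returning, so
  // the stream can be destroyed immediately
  cudaStreamDestroy(stream);
}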


@@ -7,21 +7,38 @@
#include <helper_cuda.h>
#endif
#include "../include/helper_cuda.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {
  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = input_1[index] + input_2[index];
  }
}
template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
                                   uint32_t input_lwe_dimension,
                                   uint32_t num_entries) {
  int tid = threadIdx.x;
  int plaintext_index = blockIdx.x * blockDim.x + tid;
  if (plaintext_index < num_entries) {
    // One thread per ciphertext: target the body coefficient, which sits
    // after the input_lwe_dimension mask coefficients of each ciphertext
    int index =
        plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = lwe_input[index] + plaintext_input[plaintext_index];
  }
}
template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                            T *input_1, T *input_2,
@@ -45,4 +62,29 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
  cudaStreamSynchronize(*stream);
}

template <typename T>
__host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
                                      T *output, T *lwe_input,
                                      T *plaintext_input,
                                      uint32_t input_lwe_dimension,
                                      uint32_t input_lwe_ciphertext_count) {
  cudaSetDevice(gpu_index);
  // One thread per ciphertext
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  // Copy the input ciphertexts wholesale, then patch each body in the kernel
  checkCudaErrors(cudaMemcpyAsync(output, lwe_input,
                                  (input_lwe_dimension + 1) *
                                      input_lwe_ciphertext_count * sizeof(T),
                                  cudaMemcpyDeviceToDevice, *stream));
  plaintext_addition<<<grid, thds, 0, *stream>>>(
      output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
  cudaStreamSynchronize(*stream);
}
#endif // CUDA_ADD_H
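
The indexing convention above is easy to get wrong, so a CPU reference may help: ciphertext i occupies words [i * (lwe_dim + 1), (i + 1) * (lwe_dim + 1)) of the flat array, and only its last word (the body) receives the plaintext. A minimal sketch assuming that layout (function name hypothetical):

#include <cstdint>
#include <vector>

// CPU reference for host_addition_plaintext: copy the ciphertexts, then
// add plaintexts[i] to the body of ciphertext i. Unsigned arithmetic gives
// the same wrapping behaviour the kernel relies on.
void reference_add_plaintext(std::vector<uint64_t> &out,
                             const std::vector<uint64_t> &lwe_in,
                             const std::vector<uint64_t> &plaintexts,
                             uint32_t lwe_dim) {
  out = lwe_in; // mask coefficients are left untouched
  for (size_t i = 0; i < plaintexts.size(); ++i)
    out[i * (lwe_dim + 1) + lwe_dim] += plaintexts[i];
}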


@@ -14,8 +14,8 @@ template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = -input[index];
  }
}