diff --git a/include/linear_algebra.h b/include/linear_algebra.h
index 375a26d98..4fd7ec80d 100644
--- a/include/linear_algebra.h
+++ b/include/linear_algebra.h
@@ -27,6 +27,14 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                        void *lwe_array_in_2,
                                        uint32_t input_lwe_dimension,
                                        uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count);
 }
 
 #endif // CUDA_LINALG_H_
diff --git a/src/addition.cu b/src/addition.cu
index bac856364..fcaa669f0 100644
--- a/src/addition.cu
+++ b/src/addition.cu
@@ -24,3 +24,25 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                 static_cast<uint64_t *>(lwe_array_in_2),
                 input_lwe_dimension, input_lwe_ciphertext_count);
 }
+void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_addition_plaintext(v_stream, gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<uint32_t *>(lwe_array_in),
+                          static_cast<uint32_t *>(plaintext_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
+}
+void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
+    void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_addition_plaintext(v_stream, gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<uint64_t *>(lwe_array_in),
+                          static_cast<uint64_t *>(plaintext_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
+}
diff --git a/src/addition.cuh b/src/addition.cuh
index be4ea3d0e..90383d6e7 100644
--- a/src/addition.cuh
+++ b/src/addition.cuh
@@ -7,21 +7,38 @@
 #include <cuda_runtime.h>
 #endif
 
+#include "../include/helper_cuda.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
+#include <stdio.h>
 
 template <typename T>
 __global__ void addition(T *output, T *input_1, T *input_2,
                          uint32_t num_entries) {
 
   int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
     // Here we take advantage of the wrapping behaviour of uint
     output[index] = input_1[index] + input_2[index];
   }
 }
 
+template <typename T>
+__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
+                                   uint32_t input_lwe_dimension,
+                                   uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  int plaintext_index = blockIdx.x * blockDim.x + tid;
+  if (plaintext_index < num_entries) {
+    int index =
+        plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = lwe_input[index] + plaintext_input[plaintext_index];
+  }
+}
+
 template <typename T>
 __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                             T *input_1, T *input_2,
@@ -45,4 +62,29 @@ __host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
   cudaStreamSynchronize(*stream);
 }
 
+template <typename T>
+__host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
+                                      T *output, T *lwe_input,
+                                      T *plaintext_input,
+                                      uint32_t input_lwe_dimension,
+                                      uint32_t input_lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = input_lwe_ciphertext_count;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+
+  checkCudaErrors(cudaMemcpyAsync(output, lwe_input,
+                                  (input_lwe_dimension + 1) *
+                                      input_lwe_ciphertext_count * sizeof(T),
+                                  cudaMemcpyDeviceToDevice, *stream));
+  plaintext_addition<<<grid, thds, 0, *stream>>>(
+      output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
+
+  cudaStreamSynchronize(*stream);
+}
 #endif // CUDA_ADD_H
diff --git a/src/negation.cuh b/src/negation.cuh
index 21ef91c74..7ab0e5e51 100644
--- a/src/negation.cuh
+++ b/src/negation.cuh
@@ -14,8 +14,8 @@ template <typename T>
 __global__ void negation(T *output, T *input, uint32_t num_entries) {
 
   int tid = threadIdx.x;
-  if (tid < num_entries) {
-    int index = blockIdx.x * blockDim.x + tid;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
     // Here we take advantage of the wrapping behaviour of uint
     output[index] = -input[index];
   }