diff --git a/include/linear_algebra.h b/include/linear_algebra.h new file mode 100644 index 000000000..375a26d98 --- /dev/null +++ b/include/linear_algebra.h @@ -0,0 +1,32 @@ +#ifndef CUDA_LINALG_H_ +#define CUDA_LINALG_H_ + +#include + +extern "C" { + +void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in_1, + void *lwe_array_in_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in_1, + void *lwe_array_in_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +} + +#endif // CUDA_LINALG_H_ diff --git a/include/negation.h b/include/negation.h deleted file mode 100644 index 5be088a00..000000000 --- a/include/negation.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef CUDA_NEGATE_H_ -#define CUDA_NEGATE_H_ - -#include - -extern "C" { - -void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, - void *lwe_array_out, - void *lwe_array_in, - uint32_t input_lwe_dimension, - uint32_t input_lwe_ciphertext_count); -void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, - void *lwe_array_out, - void *lwe_array_in, - uint32_t input_lwe_dimension, - uint32_t input_lwe_ciphertext_count); -} - -#endif // CUDA_NEGATE_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33f7a144b..f7d0d1d1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h) + ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h) file(GLOB SOURCES "*.cu" "*.h" diff --git a/src/addition.cu b/src/addition.cu new file mode 100644 index 000000000..bac856364 --- /dev/null +++ b/src/addition.cu @@ -0,0 +1,26 @@ +#include "addition.cuh" + +void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in_1, + void *lwe_array_in_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + host_addition(v_stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), input_lwe_dimension, + input_lwe_ciphertext_count); +} +void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in_1, + void *lwe_array_in_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + host_addition(v_stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), input_lwe_dimension, + input_lwe_ciphertext_count); +} diff --git a/src/addition.cuh b/src/addition.cuh new file mode 100644 index 000000000..be4ea3d0e --- /dev/null +++ b/src/addition.cuh @@ -0,0 +1,48 @@ +#ifndef CUDA_ADD_H +#define CUDA_ADD_H + +#ifdef __CDT_PARSER__ +#undef __CUDA_RUNTIME_H__ +#include +#include +#endif + +#include "linear_algebra.h" +#include "utils/kernel_dimensions.cuh" + +template +__global__ void addition(T *output, T *input_1, T *input_2, + uint32_t num_entries) { + + int tid = threadIdx.x; + if (tid < num_entries) { + int index = blockIdx.x * blockDim.x + tid; + // Here we take advantage of the wrapping behaviour of uint + output[index] = input_1[index] + input_2[index]; + } +} + +template +__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output, + T *input_1, T *input_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + auto stream = static_cast(v_stream); + addition<<>>(output, input_1, input_2, num_entries); + + cudaStreamSynchronize(*stream); +} + +#endif // CUDA_ADD_H diff --git a/src/negation.cu b/src/negation.cu index 054465c1e..bb5ebcf6f 100644 --- a/src/negation.cu +++ b/src/negation.cu @@ -1,5 +1,4 @@ #include "negation.cuh" -#include "negation.h" void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out, diff --git a/src/negation.cuh b/src/negation.cuh index 17647af3d..21ef91c74 100644 --- a/src/negation.cuh +++ b/src/negation.cuh @@ -1,14 +1,14 @@ +#ifndef CUDA_NEGATE_H +#define CUDA_NEGATE_H + #ifdef __CDT_PARSER__ #undef __CUDA_RUNTIME_H__ #include #include #endif -#ifndef CUDA_NEGATE -#define CUDA_NEGATE - +#include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" -#include template __global__ void negation(T *output, T *input, uint32_t num_entries) { @@ -43,4 +43,4 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, cudaStreamSynchronize(*stream); } -#endif // CUDA_NEGATE +#endif // CUDA_NEGATE_H diff --git a/src/utils/kernel_dimensions.cuh b/src/utils/kernel_dimensions.cuh index 367ec6f33..f7ac8aa15 100644 --- a/src/utils/kernel_dimensions.cuh +++ b/src/utils/kernel_dimensions.cuh @@ -1,6 +1,4 @@ -#include "math_functions.h" - -__host__ __device__ int nextPow2(int x) { +inline int nextPow2(int x) { --x; x |= x >> 1; x |= x >> 2; @@ -10,10 +8,9 @@ __host__ __device__ int nextPow2(int x) { return ++x; } -__host__ __device__ void getNumBlocksAndThreads(const int n, - const int maxBlockSize, - int &blocks, int &threads) { +inline void getNumBlocksAndThreads(const int n, const int maxBlockSize, + int &blocks, int &threads) { threads = (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize; blocks = (n + threads - 1) / threads; -} +} \ No newline at end of file