diff --git a/include/negation.h b/include/negation.h new file mode 100644 index 000000000..5be088a00 --- /dev/null +++ b/include/negation.h @@ -0,0 +1,20 @@ +#ifndef CUDA_NEGATE_H_ +#define CUDA_NEGATE_H_ + +#include + +extern "C" { + +void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count); +} + +#endif // CUDA_NEGATE_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5260a879e..33f7a144b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ -set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h) +set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h + ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h + ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h) file(GLOB SOURCES "*.cu" "*.h" diff --git a/src/negation.cu b/src/negation.cu new file mode 100644 index 000000000..054465c1e --- /dev/null +++ b/src/negation.cu @@ -0,0 +1,23 @@ +#include "negation.cuh" +#include "negation.h" + +void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + host_negation(v_stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); +} +void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, + void *lwe_array_out, + void *lwe_array_in, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + host_negation(v_stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); +} diff --git a/src/negation.cuh b/src/negation.cuh new file mode 100644 index 000000000..17647af3d --- /dev/null +++ b/src/negation.cuh @@ -0,0 +1,46 @@ +#ifdef __CDT_PARSER__ +#undef __CUDA_RUNTIME_H__ +#include +#include +#endif + +#ifndef CUDA_NEGATE +#define CUDA_NEGATE + +#include "utils/kernel_dimensions.cuh" +#include + +template +__global__ void negation(T *output, T *input, uint32_t num_entries) { + + int tid = threadIdx.x; + if (tid < num_entries) { + int index = blockIdx.x * blockDim.x + tid; + // Here we take advantage of the wrapping behaviour of uint + output[index] = -input[index]; + } +} + +template +__host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output, + T *input, uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + auto stream = static_cast(v_stream); + negation<<>>(output, input, num_entries); + + cudaStreamSynchronize(*stream); +} + +#endif // CUDA_NEGATE diff --git a/src/utils/kernel_dimensions.cuh b/src/utils/kernel_dimensions.cuh index afd7e008d..367ec6f33 100644 --- a/src/utils/kernel_dimensions.cuh +++ b/src/utils/kernel_dimensions.cuh @@ -1,4 +1,6 @@ -int nextPow2(int x) { +#include "math_functions.h" + +__host__ __device__ int nextPow2(int x) { --x; x |= x >> 1; x |= x >> 2; @@ -8,8 +10,10 @@ int nextPow2(int x) { return ++x; } -void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks, - int &threads) { - threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize; +__host__ __device__ void getNumBlocksAndThreads(const int n, + const int maxBlockSize, + int &blocks, int &threads) { + threads = + (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize; blocks = (n + threads - 1) / threads; }