feat(cuda): implement LWE ciphertext addition on GPU

Agnes Leroy
2022-10-27 11:04:27 +02:00
committed by Agnès Leroy
parent bc66816341
commit d34f53b7ee
8 changed files with 116 additions and 34 deletions

include/linear_algebra.h (new file)

@@ -0,0 +1,32 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include <cstdint>

extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);

void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
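For context, a minimal host-side sketch of how these entry points might be driven. The sizes (lwe_dimension = 600, 10 ciphertexts), gpu_index 0, and the omission of error handling and data transfers are assumptions for illustration; the flat layout of ciphertext_count * (lwe_dimension + 1) words per array follows from how host_addition computes num_entries below.

#include <cstdint>
#include <cuda_runtime.h>
#include "linear_algebra.h"

int main() {
  uint32_t lwe_dimension = 600, ct_count = 10;
  size_t bytes = ct_count * (lwe_dimension + 1) * sizeof(uint64_t);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint64_t *d_in_1, *d_in_2, *d_out;
  cudaMalloc(&d_in_1, bytes);
  cudaMalloc(&d_in_2, bytes);
  cudaMalloc(&d_out, bytes);
  // ... copy the two ciphertext vectors into d_in_1 / d_in_2 ...

  // v_stream is a pointer to the stream: the wrappers dereference it
  cuda_add_lwe_ciphertext_vector_64(&stream, 0, d_out, d_in_1, d_in_2,
                                    lwe_dimension, ct_count);

  cudaFree(d_in_1);
  cudaFree(d_in_2);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
  return 0;
}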

include/negation.h (deleted)

@@ -1,20 +0,0 @@
#ifndef CUDA_NEGATE_H_
#define CUDA_NEGATE_H_

#include <cstdint>

extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_NEGATE_H_

CMakeLists.txt

@@ -1,6 +1,6 @@
 set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
             ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-            ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h)
+            ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h)
 file(GLOB SOURCES
     "*.cu"
     "*.h"

src/addition.cu (new file)

@@ -0,0 +1,26 @@
#include "addition.cuh"
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
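A note on the pattern: the entry points keep an untyped void * ABI behind extern "C" so they remain callable through a C FFI, while the shared logic lives in the host_addition template; the only difference between the two variants is the cast selecting the 32- or 64-bit torus word.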

src/addition.cuh (new file)

@@ -0,0 +1,48 @@
#ifndef CUDA_ADD_H
#define CUDA_ADD_H

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"

template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard on the global index (not the thread index within the block), so
  // that surplus threads in the last, partially filled block do not touch
  // out-of-bounds memory
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint: addition is
    // performed modulo 2^32 or 2^64, which is exactly torus addition
    output[index] = input_1[index] + input_2[index];
  }
}

template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                            T *input_1, T *input_2,
                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body,
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
  cudaStreamSynchronize(*stream);
}

#endif // CUDA_ADD_H
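The wrap-on-overflow comment is the crux of why the kernel is so simple: coefficients live in Z/2^32 or Z/2^64, so native unsigned addition already is the modular addition the scheme needs. A standalone sketch, with values chosen arbitrarily for illustration:

#include <cstdint>
#include <cstdio>

int main() {
  // Two coefficients near the top of the 64-bit torus
  uint64_t a = 0xF000000000000000ULL;
  uint64_t b = 0x2000000000000000ULL;
  // Unsigned overflow is well-defined and wraps modulo 2^64,
  // so the kernel needs no explicit reduction step
  printf("%016llx\n", (unsigned long long)(a + b)); // 1000000000000000
  return 0;
}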

src/negation.cu

@@ -1,5 +1,4 @@
#include "negation.cuh"
#include "negation.h"
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,

src/negation.cuh

@@ -1,14 +1,14 @@
+#ifndef CUDA_NEGATE_H
+#define CUDA_NEGATE_H

 #ifdef __CDT_PARSER__
 #undef __CUDA_RUNTIME_H__
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #endif

-#ifndef CUDA_NEGATE
-#define CUDA_NEGATE
+#include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <thread>

 template <typename T>
 __global__ void negation(T *output, T *input, uint32_t num_entries) {
@@ -43,4 +43,4 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
   cudaStreamSynchronize(*stream);
 }
-#endif // CUDA_NEGATE
+#endif // CUDA_NEGATE_H

src/utils/kernel_dimensions.cuh

@@ -1,6 +1,4 @@
#include "math_functions.h"
__host__ __device__ int nextPow2(int x) {
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
@@ -10,10 +8,9 @@ __host__ __device__ int nextPow2(int x) {
return ++x;
}
__host__ __device__ void getNumBlocksAndThreads(const int n,
const int maxBlockSize,
int &blocks, int &threads) {
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
}
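To make the launch-configuration arithmetic concrete, a small worked example with hypothetical sizes; it assumes this header is included and the file is compiled with nvcc, since the header relies on CUDA's max helper:

#include "utils/kernel_dimensions.cuh"

int main() {
  // 10 ciphertexts with lwe_dimension = 600 give 10 * 601 = 6010 entries.
  // Since 6010 >= 2 * 512, threads is clamped to maxBlockSize = 512 and
  // blocks = (6010 + 511) / 512 = 12: the grid launches 6144 threads and
  // the kernels' bounds checks idle the surplus 134.
  int blocks = 0, threads = 0;
  getNumBlocksAndThreads(6010, 512, blocks, threads);
  return (blocks == 12 && threads == 512) ? 0 : 1;
}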