mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
feat(cuda): implement LWE ciphertext addition on GPU
This commit is contained in:
32
include/linear_algebra.h
Normal file
32
include/linear_algebra.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include <cstdint>

// C-callable entry points for GPU linear algebra over vectors of LWE
// ciphertexts. Each `lwe_array_*` argument points to device memory holding
// `input_lwe_ciphertext_count` ciphertexts of (input_lwe_dimension + 1)
// torus words each (the mask plus one body element). `v_stream` is an
// opaque pointer to a cudaStream_t; `gpu_index` selects the CUDA device
// the work is issued on. The `_32` / `_64` suffix selects the torus word
// size: the arrays are reinterpreted as uint32_t / uint64_t respectively.
extern "C" {

// Negation of a vector of LWE ciphertexts, 32-bit torus.
// (Presumably elementwise over all mask+body words — see negation.cuh.)
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
// 64-bit torus variant of the negation above.
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
// Elementwise addition of two vectors of LWE ciphertexts, 32-bit torus:
// lwe_array_out[i] = lwe_array_in_1[i] + lwe_array_in_2[i], relying on
// native unsigned wrap-around arithmetic.
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
// 64-bit torus variant of the addition above.
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
|
||||
@@ -1,20 +0,0 @@
|
||||
// NOTE(review): this header is removed by this commit; its declarations are
// folded into linear_algebra.h together with the new addition entry points.
#ifndef CUDA_NEGATE_H_
#define CUDA_NEGATE_H_

#include <cstdint>

extern "C" {

// Negation of a vector of LWE ciphertexts on the GPU, 32-bit torus words.
// `v_stream` is an opaque cudaStream_t pointer; `gpu_index` selects the
// device. Arrays hold `input_lwe_ciphertext_count` ciphertexts of
// (input_lwe_dimension + 1) words each.
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
// 64-bit torus variant of the negation above.
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_NEGATE_H_
|
||||
@@ -1,6 +1,6 @@
|
||||
set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h)
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h)
|
||||
file(GLOB SOURCES
|
||||
"*.cu"
|
||||
"*.h"
|
||||
|
||||
26
src/addition.cu
Normal file
26
src/addition.cu
Normal file
@@ -0,0 +1,26 @@
|
||||
#include "addition.cuh"
|
||||
|
||||
// C entry point: adds two vectors of LWE ciphertexts on the GPU, treating
// each array as uint32_t torus words. Arrays hold
// `input_lwe_ciphertext_count` ciphertexts of (input_lwe_dimension + 1)
// words each; `v_stream` is an opaque cudaStream_t pointer and `gpu_index`
// selects the device.
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {
  // Reinterpret the opaque device pointers as 32-bit torus arrays before
  // dispatching to the templated host implementation.
  auto out = static_cast<uint32_t *>(lwe_array_out);
  auto lhs = static_cast<uint32_t *>(lwe_array_in_1);
  auto rhs = static_cast<uint32_t *>(lwe_array_in_2);
  host_addition(v_stream, gpu_index, out, lhs, rhs, input_lwe_dimension,
                input_lwe_ciphertext_count);
}
|
||||
// C entry point: adds two vectors of LWE ciphertexts on the GPU, treating
// each array as uint64_t torus words. Same contract as the 32-bit variant,
// with 64-bit words.
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {
  // Reinterpret the opaque device pointers as 64-bit torus arrays before
  // dispatching to the templated host implementation.
  auto out = static_cast<uint64_t *>(lwe_array_out);
  auto lhs = static_cast<uint64_t *>(lwe_array_in_1);
  auto rhs = static_cast<uint64_t *>(lwe_array_in_2);
  host_addition(v_stream, gpu_index, out, lhs, rhs, input_lwe_dimension,
                input_lwe_ciphertext_count);
}
|
||||
48
src/addition.cuh
Normal file
48
src/addition.cuh
Normal file
@@ -0,0 +1,48 @@
|
||||
#ifndef CUDA_ADD_H
|
||||
#define CUDA_ADD_H
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
// Elementwise addition kernel: output[i] = input_1[i] + input_2[i] for
// i in [0, num_entries). Expects a 1D grid of 1D blocks sized so that
// gridDim.x * blockDim.x >= num_entries.
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Bounds check must use the GLOBAL index, not threadIdx.x: the original
  // guard (threadIdx.x < num_entries) let threads in trailing blocks write
  // past the end of the arrays whenever the grid over-covers num_entries.
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = input_1[index] + input_2[index];
  }
}
|
||||
|
||||
// Host-side launcher for the addition kernel. Computes a 1D launch
// configuration covering every torus word of every ciphertext, enqueues the
// kernel on the caller's stream, and blocks until it completes.
// `v_stream` is an opaque pointer to a cudaStream_t; `gpu_index` selects
// the device the kernel runs on.
template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                            T *input_1, T *input_2,
                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);

  // Each ciphertext carries input_lwe_dimension mask elements plus one body
  // element, so the total number of words to process is count * (dim + 1).
  int lwe_size = input_lwe_dimension + 1;
  int num_entries = input_lwe_ciphertext_count * lwe_size;

  // One-dimensional grid of threads, capped at 512 threads per block.
  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  addition<<<dim3(num_blocks, 1, 1), dim3(num_threads, 1, 1), 0, *stream>>>(
      output, input_1, input_2, num_entries);

  // Synchronous by design: callers may read the result immediately.
  cudaStreamSynchronize(*stream);
}
|
||||
|
||||
#endif // CUDA_ADD_H
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "negation.cuh"
|
||||
#include "negation.h"
|
||||
|
||||
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
#ifndef CUDA_NEGATE_H
|
||||
#define CUDA_NEGATE_H
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
#ifndef CUDA_NEGATE
|
||||
#define CUDA_NEGATE
|
||||
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <thread>
|
||||
|
||||
template <typename T>
|
||||
__global__ void negation(T *output, T *input, uint32_t num_entries) {
|
||||
@@ -43,4 +43,4 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
|
||||
cudaStreamSynchronize(*stream);
|
||||
}
|
||||
|
||||
#endif // CUDA_NEGATE
|
||||
#endif // CUDA_NEGATE_H
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
#include "math_functions.h"
|
||||
|
||||
__host__ __device__ int nextPow2(int x) {
|
||||
inline int nextPow2(int x) {
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
@@ -10,10 +8,9 @@ __host__ __device__ int nextPow2(int x) {
|
||||
return ++x;
|
||||
}
|
||||
|
||||
__host__ __device__ void getNumBlocksAndThreads(const int n,
|
||||
const int maxBlockSize,
|
||||
int &blocks, int &threads) {
|
||||
// Computes a 1D launch configuration covering n work items:
// `threads` is the block size and `blocks` is ceil(n / threads).
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
                                   int &blocks, int &threads) {
  // For small n, shrink the block to the next power of two >= (n + 1) / 2,
  // floored at 128 threads; otherwise use the caller-provided cap.
  threads =
      (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
  // Ceiling division so the grid covers every item.
  blocks = (n + threads - 1) / threads;
}
|
||||
}
|
||||
Reference in New Issue
Block a user