feat(cuda): implement LWE ciphertext addition on GPU

Agnes Leroy
2022-10-27 11:04:27 +02:00
committed by Agnès Leroy
parent bc66816341
commit d34f53b7ee
8 changed files with 116 additions and 34 deletions

include/linear_algebra.h (new file)

@@ -0,0 +1,32 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include <cstdint>

extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);

void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
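For context, a minimal host-side sketch of how these entry points might be driven. The sizes (lwe_dimension = 600, 10 ciphertexts), gpu_index 0, and the omission of error handling and data transfers are assumptions for illustration; the flat layout of ciphertext_count * (lwe_dimension + 1) words per array follows from how host_addition computes num_entries below.

#include <cstdint>
#include <cuda_runtime.h>
#include "linear_algebra.h"

int main() {
  uint32_t lwe_dimension = 600, ct_count = 10;
  size_t bytes = ct_count * (lwe_dimension + 1) * sizeof(uint64_t);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint64_t *d_in_1, *d_in_2, *d_out;
  cudaMalloc(&d_in_1, bytes);
  cudaMalloc(&d_in_2, bytes);
  cudaMalloc(&d_out, bytes);
  // ... copy the two ciphertext vectors into d_in_1 / d_in_2 ...

  // v_stream is a pointer to the stream: the wrappers dereference it
  cuda_add_lwe_ciphertext_vector_64(&stream, 0, d_out, d_in_1, d_in_2,
                                    lwe_dimension, ct_count);

  cudaFree(d_in_1);
  cudaFree(d_in_2);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
  return 0;
}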

include/negation.h (deleted)

@@ -1,20 +0,0 @@
#ifndef CUDA_NEGATE_H_
#define CUDA_NEGATE_H_

#include <cstdint>

extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);

void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_NEGATE_H_

CMakeLists.txt

@@ -1,6 +1,6 @@
 set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
             ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-            ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h)
+            ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h)
 file(GLOB SOURCES
     "*.cu"
     "*.h"

src/addition.cu (new file)

@@ -0,0 +1,26 @@
#include "addition.cuh"
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
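A note on the pattern: the entry points keep an untyped void * ABI behind extern "C" so they remain callable through a C FFI, while the shared logic lives in the host_addition template; the only difference between the two variants is the cast selecting the 32- or 64-bit torus word.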

src/addition.cuh (new file)

@@ -0,0 +1,48 @@
#ifndef CUDA_ADD_H
#define CUDA_ADD_H

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"

template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
                         uint32_t num_entries) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard on the global index (not the thread index within the block), so
  // that surplus threads in the last, partially filled block do not touch
  // out-of-bounds memory
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint: addition is
    // performed modulo 2^32 or 2^64, which is exactly torus addition
    output[index] = input_1[index] + input_2[index];
  }
}

template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
                            T *input_1, T *input_2,
                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body,
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
  cudaStreamSynchronize(*stream);
}

#endif // CUDA_ADD_H
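The wrap-on-overflow comment is the crux of why the kernel is so simple: coefficients live in Z/2^32 or Z/2^64, so native unsigned addition already is the modular addition the scheme needs. A standalone sketch, with values chosen arbitrarily for illustration:

#include <cstdint>
#include <cstdio>

int main() {
  // Two coefficients near the top of the 64-bit torus
  uint64_t a = 0xF000000000000000ULL;
  uint64_t b = 0x2000000000000000ULL;
  // Unsigned overflow is well-defined and wraps modulo 2^64,
  // so the kernel needs no explicit reduction step
  printf("%016llx\n", (unsigned long long)(a + b)); // 1000000000000000
  return 0;
}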

src/negation.cu

@@ -1,5 +1,4 @@
#include "negation.cuh"
#include "negation.h"
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,

src/negation.cuh

@@ -1,14 +1,14 @@
+#ifndef CUDA_NEGATE_H
+#define CUDA_NEGATE_H

 #ifdef __CDT_PARSER__
 #undef __CUDA_RUNTIME_H__
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #endif

-#ifndef CUDA_NEGATE
-#define CUDA_NEGATE
+#include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <thread>

 template <typename T>
 __global__ void negation(T *output, T *input, uint32_t num_entries) {
@@ -43,4 +43,4 @@ __host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
   cudaStreamSynchronize(*stream);
 }
-#endif // CUDA_NEGATE
+#endif // CUDA_NEGATE_H

src/utils/kernel_dimensions.cuh

@@ -1,6 +1,4 @@
#include "math_functions.h"
__host__ __device__ int nextPow2(int x) {
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
@@ -10,10 +8,9 @@ __host__ __device__ int nextPow2(int x) {
return ++x;
}
__host__ __device__ void getNumBlocksAndThreads(const int n,
const int maxBlockSize,
int &blocks, int &threads) {
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
}
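To make the launch-configuration arithmetic concrete, a small worked example with hypothetical sizes; it assumes this header is included and the file is compiled with nvcc, since the header relies on CUDA's max helper:

#include "utils/kernel_dimensions.cuh"

int main() {
  // 10 ciphertexts with lwe_dimension = 600 give 10 * 601 = 6010 entries.
  // Since 6010 >= 2 * 512, threads is clamped to maxBlockSize = 512 and
  // blocks = (6010 + 511) / 512 = 12: the grid launches 6144 threads and
  // the kernels' bounds checks idle the surplus 134.
  int blocks = 0, threads = 0;
  getNumBlocksAndThreads(6010, 512, blocks, threads);
  return (blocks == 12 && threads == 512) ? 0 : 1;
}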