From 25f103f62d2df69cbcda8540b9712a869281c7e8 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 7 Nov 2022 12:45:25 -0300 Subject: [PATCH] feat(cuda): Refactor the low latency PBS to use asynchronous allocation. --- include/device.h | 4 ++-- src/bootstrap_amortized.cuh | 8 ++++---- src/bootstrap_low_latency.cuh | 11 +++++------ src/bootstrap_wop.cuh | 16 ++++++++-------- src/device.cu | 19 +++++++++++-------- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/include/device.h b/include/device.h index 4192815d4..f292a5c3e 100644 --- a/include/device.h +++ b/include/device.h @@ -7,7 +7,7 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index); void *cuda_malloc(uint64_t size, uint32_t gpu_index); -void *cuda_malloc_async(uint64_t size, void *v_stream); +void *cuda_malloc_async(uint64_t size, cudaStream_t stream); int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); @@ -28,7 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index); int cuda_drop(void *ptr, uint32_t gpu_index); -int cuda_drop_async(void *ptr, void *v_stream); +int cuda_drop_async(void *ptr, cudaStream_t stream); int cuda_get_max_shared_memory(uint32_t gpu_index); } diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index f67f44d12..641870f87 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -338,7 +338,7 @@ __host__ void host_bootstrap_amortized( // from one of three templates (no use, partial use or full use // of shared memory) if (max_shared_memory < SM_PART) { - d_mem = (char*) cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count, v_stream); + d_mem = (char*) cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count, *stream); device_bootstrap_amortized<<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, @@ -348,7 +348,7 @@ __host__ void host_bootstrap_amortized( cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART); 
cudaFuncSetCacheConfig(device_bootstrap_amortized, cudaFuncCachePreferShared); - d_mem = (char*) cuda_malloc_async(DM_PART * input_lwe_ciphertext_count, v_stream); + d_mem = (char*) cuda_malloc_async(DM_PART * input_lwe_ciphertext_count, *stream); device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, @@ -366,7 +366,7 @@ __host__ void host_bootstrap_amortized( checkCudaErrors(cudaFuncSetCacheConfig( device_bootstrap_amortized, cudaFuncCachePreferShared)); - d_mem = (char*) cuda_malloc_async(0, v_stream); + d_mem = (char*) cuda_malloc_async(0, *stream); device_bootstrap_amortized <<>>( @@ -377,7 +377,7 @@ __host__ void host_bootstrap_amortized( // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); - cuda_drop_async(d_mem, v_stream); + cuda_drop_async(d_mem, *stream); } template diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 9c6206079..f7c127853 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -10,6 +10,7 @@ #include "cooperative_groups.h" #include "../include/helper_cuda.h" +#include "device.h" #include "bootstrap.h" #include "complex/operations.cuh" #include "crypto/gadget.cuh" @@ -261,10 +262,8 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out, int buffer_size_per_gpu = level_count * num_samples * polynomial_size / 2 * sizeof(double2); - double2 *mask_buffer_fft; - double2 *body_buffer_fft; - checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu)); - checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu)); + double2 *mask_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream); + double2 *body_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream); int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp sizeof(Torus) * polynomial_size + // accumulator @@ -298,8 +297,8 @@ 
host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out, // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); - cudaFree(mask_buffer_fft); - cudaFree(body_buffer_fft); + cuda_drop_async(mask_buffer_fft, *stream); + cuda_drop_async(body_buffer_fft, *stream); } #endif // LOWLAT_PBS_H diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 159cc5d70..bca42d583 100644 --- a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -300,7 +300,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count; - double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * sizeof(double), v_stream); + double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * sizeof(double), *stream); batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, @@ -311,7 +311,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate global memory in case parameters are too large char *d_mem; if (max_shared_memory < memory_needed_per_block) { - d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), v_stream); + d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), *stream); } else { checkCudaErrors(cudaFuncSetAttribute( device_batch_cmux, @@ -324,8 +324,8 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate buffers int glwe_size = (glwe_dimension + 1) * polynomial_size; - Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); - Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); + Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream); + Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream); 
checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector, num_lut * glwe_size * sizeof(Torus), @@ -370,11 +370,11 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, checkCudaErrors(cudaStreamSynchronize(*stream)); // Free memory - cuda_drop_async(d_ggsw_fft_in, v_stream); - cuda_drop_async(d_buffer1, v_stream); - cuda_drop_async(d_buffer2, v_stream); + cuda_drop_async(d_ggsw_fft_in, *stream); + cuda_drop_async(d_buffer1, *stream); + cuda_drop_async(d_buffer2, *stream); if (max_shared_memory < memory_needed_per_block) - cuda_drop_async(d_mem, v_stream); + cuda_drop_async(d_mem, *stream); } // only works for big lwe for ks+bs case diff --git a/src/device.cu b/src/device.cu index 3e67e3cde..97fd70651 100644 --- a/src/device.cu +++ b/src/device.cu @@ -31,15 +31,14 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) { return ptr; } -/// -void *cuda_malloc_async(uint64_t size, void *v_stream) { +/// Allocates a size-byte array in device memory. Tries to do it asynchronously. +void *cuda_malloc_async(uint64_t size, cudaStream_t stream) { void *ptr; #if (CUDART_VERSION < 11020) checkCudaErrors(cudaMalloc((void **)&ptr, size)); #else - auto stream = static_cast(v_stream); - checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream)); + checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream)); #endif return ptr; } @@ -153,10 +152,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { return 0; } -/// Drop a cuda array -int cuda_drop_async(void *ptr, void *v_stream) { - auto stream = static_cast(v_stream); - checkCudaErrors(cudaFreeAsync(ptr, *stream)); +/// Drop a CUDA array. Tries to do it asynchronously. +int cuda_drop_async(void *ptr, cudaStream_t stream) { + + #if (CUDART_VERSION < 11020) + checkCudaErrors(cudaFree(ptr)); + #else + checkCudaErrors(cudaFreeAsync(ptr, stream)); + #endif return 0; }