From a11a009df6ffcb6073ddb5a2073de92af61ab55a Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Mon, 27 Feb 2023 10:53:47 +0100
Subject: [PATCH] chore(cuda): replace synchronous copies/allocs with async
 ones in the private cuda backend

---
 src/wop_bootstrap.cu  |  9 ++++++---
 src/wop_bootstrap.cuh | 26 --------------------------
 2 files changed, 6 insertions(+), 29 deletions(-)
diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu
index 069c767e6..e21b3945c 100644
--- a/src/wop_bootstrap.cu
+++ b/src/wop_bootstrap.cu
@@ -511,7 +511,9 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
  */
 void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
                           int8_t **wop_pbs_buffer) {
-  cleanup_wop_pbs(v_stream, gpu_index, wop_pbs_buffer);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  // Release the buffer via cuda_drop_async; *wop_pbs_buffer is not reset
+  cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
 }
 
 /*
@@ -521,6 +523,7 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
 void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
                                                      uint32_t gpu_index,
                                                      int8_t **cbs_vp_buffer) {
-  cleanup_circuit_bootstrap_vertical_packing(v_stream, gpu_index,
-                                             cbs_vp_buffer);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  // Free memory
+  cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
 }
diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh
index e073327ce..f88b8b463 100644
--- a/src/wop_bootstrap.cuh
+++ b/src/wop_bootstrap.cuh
@@ -104,21 +104,6 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
       level_count_cbs, mbr_size, tau, max_shared_memory, false);
 }
 
-/*
- * Cleanup functions free the necessary data on the GPU and on the CPU.
- * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
- * frees the data for the circuit bootstrap and vertical packing on GPU
- * contained in cbs_vp_buffer
- */
-__host__ void
-cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
-                                           int8_t **cbs_vp_buffer) {
-
-  auto stream = static_cast<cudaStream_t *>(v_stream);
-  // Free memory
-  cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
-}
-
 // number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
 // i.e. tau * p where tau is the number of LUTs (the original number of LWEs
 // before bit extraction) and p is the number of extracted bits
@@ -269,17 +254,6 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
       max_shared_memory, false);
 }
 
-/*
- * Cleanup functions free the necessary data on the GPU and on the CPU.
- * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
- * frees the data for the wop PBS on GPU in wop_pbs_buffer
- */
-__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index,
-                              int8_t **wop_pbs_buffer) {
-  auto stream = static_cast<cudaStream_t *>(v_stream);
-  cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
-}
-
 template <typename Torus, typename STorus, class params>
 __host__ void host_wop_pbs(
     void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,