chore(cuda): replace synchronous copies/allocs with async ones in the private cuda backend

2026-02-08 19:44:57 -05:00 · 2023-02-27 10:53:47 +01:00
parent e7e6c8fb53
commit a11a009df6
2 changed files with 6 additions and 29 deletions
--- a/src/wop_bootstrap.cu
+++ b/src/wop_bootstrap.cu
@@ -511,7 +511,9 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
 */
 void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
                          int8_t **wop_pbs_buffer) {
-  cleanup_wop_pbs(v_stream, gpu_index, wop_pbs_buffer);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  // Free the GPU memory held in wop_pbs_buffer (pointer is not reset)
+  cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
 }

 /*
@@ -521,6 +523,7 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
 void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
                                                     uint32_t gpu_index,
                                                     int8_t **cbs_vp_buffer) {
-  cleanup_circuit_bootstrap_vertical_packing(v_stream, gpu_index,
-                                             cbs_vp_buffer);
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  // Free the GPU memory held in cbs_vp_buffer (pointer is not reset)
+  cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
 }
--- a/src/wop_bootstrap.cuh
+++ b/src/wop_bootstrap.cuh
@@ -104,21 +104,6 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
      level_count_cbs, mbr_size, tau, max_shared_memory, false);
 }

-/*
- * Cleanup functions free the necessary data on the GPU and on the CPU.
- * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
- * frees the data for the circuit bootstrap and vertical packing on GPU
- * contained in cbs_vp_buffer
- */
-__host__ void
-cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
-                                           int8_t **cbs_vp_buffer) {
-
-  auto stream = static_cast<cudaStream_t *>(v_stream);
-  // Free memory
-  cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
-}
-
 // number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
 // i.e. tau * p where tau is the number of LUTs (the original number of LWEs
 // before bit extraction) and p is the number of extracted bits
@@ -269,17 +254,6 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
      max_shared_memory, false);
 }

-/*
- * Cleanup functions free the necessary data on the GPU and on the CPU.
- * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
- * frees the data for the wop PBS on GPU in wop_pbs_buffer
- */
-__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index,
-                              int8_t **wop_pbs_buffer) {
-  auto stream = static_cast<cudaStream_t *>(v_stream);
-  cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
-}
-
 template <typename Torus, typename STorus, class params>
 __host__ void host_wop_pbs(
    void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,