From a11a009df6ffcb6073ddb5a2073de92af61ab55a Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Mon, 27 Feb 2023 10:53:47 +0100 Subject: [PATCH] chore(cuda): replace synchronous copies/allocs with async ones in the private cuda backend --- src/wop_bootstrap.cu | 9 ++++++--- src/wop_bootstrap.cuh | 26 -------------------------- 2 files changed, 6 insertions(+), 29 deletions(-) diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu index 069c767e6..e21b3945c 100644 --- a/src/wop_bootstrap.cu +++ b/src/wop_bootstrap.cu @@ -511,7 +511,9 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, */ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer) { - cleanup_wop_pbs(v_stream, gpu_index, wop_pbs_buffer); + auto stream = static_cast(v_stream); + // Free memory + cuda_drop_async(*wop_pbs_buffer, stream, gpu_index); } /* @@ -521,6 +523,7 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index, void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer) { - cleanup_circuit_bootstrap_vertical_packing(v_stream, gpu_index, - cbs_vp_buffer); + auto stream = static_cast(v_stream); + // Free memory + cuda_drop_async(*cbs_vp_buffer, stream, gpu_index); } diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index e073327ce..f88b8b463 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -104,21 +104,6 @@ __host__ void scratch_circuit_bootstrap_vertical_packing( level_count_cbs, mbr_size, tau, max_shared_memory, false); } -/* - * Cleanup functions free the necessary data on the GPU and on the CPU. - * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus - * frees the data for the circuit bootstrap and vertical packing on GPU - * contained in cbs_vp_buffer - */ -__host__ void -cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index, - int8_t **cbs_vp_buffer) { - - auto stream = static_cast(v_stream); - // Free memory - cuda_drop_async(*cbs_vp_buffer, stream, gpu_index); -} - // number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP, // i.e. tau * p where tau is the number of LUTs (the original number of LWEs // before bit extraction) and p is the number of extracted bits @@ -269,17 +254,6 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer, max_shared_memory, false); } -/* - * Cleanup functions free the necessary data on the GPU and on the CPU. - * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus - * frees the data for the wop PBS on GPU in wop_pbs_buffer - */ -__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index, - int8_t **wop_pbs_buffer) { - auto stream = static_cast(v_stream); - cuda_drop_async(*wop_pbs_buffer, stream, gpu_index); -} - template __host__ void host_wop_pbs( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,