chore(cuda): replace synchronous copies/allocs with async ones in the private cuda backend

This commit is contained in:
Agnes Leroy
2023-02-27 10:53:47 +01:00
committed by Agnès Leroy
parent e7e6c8fb53
commit a11a009df6
2 changed files with 6 additions and 29 deletions

View File

@@ -511,7 +511,9 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
*/
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
                          int8_t **wop_pbs_buffer) {
  // Releases the wop PBS scratch buffer referenced by *wop_pbs_buffer.
  //  - v_stream: opaque handle, reinterpreted here as a cudaStream_t pointer
  //  - gpu_index: forwarded unchanged to cuda_drop_async
  //  - wop_pbs_buffer: address of the buffer pointer to release
  //
  // The buffer is dropped exactly once, directly through cuda_drop_async.
  // Delegating to cleanup_wop_pbs as well would hand the same pointer to
  // cuda_drop_async a second time, i.e. a double free.
  auto stream = static_cast<cudaStream_t *>(v_stream);
  // Free memory
  cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
}
/*
@@ -521,6 +523,7 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
                                                     uint32_t gpu_index,
                                                     int8_t **cbs_vp_buffer) {
  // Releases the circuit bootstrap + vertical packing scratch buffer
  // referenced by *cbs_vp_buffer.
  //  - v_stream: opaque handle, reinterpreted here as a cudaStream_t pointer
  //  - gpu_index: forwarded unchanged to cuda_drop_async
  //  - cbs_vp_buffer: address of the buffer pointer to release
  //
  // The buffer is dropped exactly once, directly through cuda_drop_async.
  // Also calling cleanup_circuit_bootstrap_vertical_packing would hand the
  // same pointer to cuda_drop_async a second time, i.e. a double free.
  auto stream = static_cast<cudaStream_t *>(v_stream);
  // Free memory
  cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
}

View File

@@ -104,21 +104,6 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
level_count_cbs, mbr_size, tau, max_shared_memory, false);
}
/*
 * Cleanup helper for the circuit bootstrap + vertical packing scratch data.
 *
 * By convention, CPU-resident data is named with an `h_` prefix; the buffer
 * handled here is the GPU-side one referenced by cbs_vp_buffer. It is passed
 * to cuda_drop_async together with the stream obtained from v_stream and the
 * given gpu_index.
 */
__host__ void
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
                                           int8_t **cbs_vp_buffer) {
  // The stream arrives type-erased; recover the typed pointer first.
  cudaStream_t *cuda_stream = static_cast<cudaStream_t *>(v_stream);
  int8_t *buffer_to_release = *cbs_vp_buffer;
  cuda_drop_async(buffer_to_release, cuda_stream, gpu_index);
}
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
// before bit extraction) and p is the number of extracted bits
@@ -269,17 +254,6 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
max_shared_memory, false);
}
/*
 * Cleanup helper for the wop PBS scratch data.
 *
 * By convention, CPU-resident data is named with an `h_` prefix; the buffer
 * handled here is the GPU-side one referenced by wop_pbs_buffer. It is passed
 * to cuda_drop_async together with the stream obtained from v_stream and the
 * given gpu_index.
 */
__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index,
                              int8_t **wop_pbs_buffer) {
  // The stream arrives type-erased; recover the typed pointer first.
  cudaStream_t *cuda_stream = static_cast<cudaStream_t *>(v_stream);
  int8_t *buffer_to_release = *wop_pbs_buffer;
  cuda_drop_async(buffer_to_release, cuda_stream, gpu_index);
}
template <typename Torus, typename STorus, class params>
__host__ void host_wop_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,