mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
chore(cuda): replace synchronous copies/allocs with async ones in the private cuda backend
This commit is contained in:
@@ -511,7 +511,9 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
*/
|
||||
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **wop_pbs_buffer) {
|
||||
cleanup_wop_pbs(v_stream, gpu_index, wop_pbs_buffer);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -521,6 +523,7 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
|
||||
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **cbs_vp_buffer) {
|
||||
cleanup_circuit_bootstrap_vertical_packing(v_stream, gpu_index,
|
||||
cbs_vp_buffer);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -104,21 +104,6 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
|
||||
level_count_cbs, mbr_size, tau, max_shared_memory, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleanup functions free the necessary data on the GPU and on the CPU.
|
||||
* Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
|
||||
* frees the data for the circuit bootstrap and vertical packing on GPU
|
||||
* contained in cbs_vp_buffer
|
||||
*/
|
||||
__host__ void
|
||||
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **cbs_vp_buffer) {
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
|
||||
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
|
||||
// before bit extraction) and p is the number of extracted bits
|
||||
@@ -269,17 +254,6 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
max_shared_memory, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleanup functions free the necessary data on the GPU and on the CPU.
|
||||
* Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
|
||||
* frees the data for the wop PBS on GPU in wop_pbs_buffer
|
||||
*/
|
||||
__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **wop_pbs_buffer) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__host__ void host_wop_pbs(
|
||||
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
|
||||
Reference in New Issue
Block a user