From cf222e9176a385ca32bf81fb6ec7d553a11e6390 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 7 Nov 2022 12:37:48 -0300 Subject: [PATCH] feat(cuda): encapsulate asynchronous allocation methods. --- include/device.h | 4 ++++ src/bootstrap_wop.cuh | 49 +++++++++---------------------------------- src/device.cu | 20 ++++++++++++++++++ 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/include/device.h b/include/device.h index 4e2d8f2cf..4192815d4 100644 --- a/include/device.h +++ b/include/device.h @@ -7,6 +7,8 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index); void *cuda_malloc(uint64_t size, uint32_t gpu_index); +void *cuda_malloc_async(uint64_t size, void *v_stream); + int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size, @@ -26,5 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index); int cuda_drop(void *ptr, uint32_t gpu_index); +int cuda_drop_async(void *ptr, void *v_stream); + int cuda_get_max_shared_memory(uint32_t gpu_index); } diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 04c40bb87..159cc5d70 100644 --- a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -4,6 +4,7 @@ #include "cooperative_groups.h" #include "../include/helper_cuda.h" +#include "device.h" #include "bootstrap.h" #include "bootstrap_low_latency.cuh" #include "complex/operations.cuh" @@ -296,17 +297,10 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, dim3 thds(polynomial_size / params::opt, 1, 1); ////////////////////// - double2 *d_ggsw_fft_in; int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count; -#if (CUDART_VERSION < 11020) - checkCudaErrors( - cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double))); -#else - checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in, - ggsw_size * sizeof(double), *stream)); -#endif + double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * 
sizeof(double), v_stream); batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, @@ -317,13 +311,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate global memory in case parameters are too large char *d_mem; if (max_shared_memory < memory_needed_per_block) { -#if (CUDART_VERSION < 11020) - checkCudaErrors( - cudaMalloc((void **)&d_mem, memory_needed_per_block * (1 << (r - 1)))); -#else - checkCudaErrors(cudaMallocAsync( - (void **)&d_mem, memory_needed_per_block * (1 << (r - 1)), *stream)); -#endif + d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), v_stream); } else { checkCudaErrors(cudaFuncSetAttribute( device_batch_cmux, @@ -335,19 +323,10 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate buffers int glwe_size = (glwe_dimension + 1) * polynomial_size; - Torus *d_buffer1, *d_buffer2; -#if (CUDART_VERSION < 11020) - checkCudaErrors( - cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus))); - checkCudaErrors( - cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus))); -#else - checkCudaErrors(cudaMallocAsync( - (void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream)); - checkCudaErrors(cudaMallocAsync( - (void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream)); -#endif + Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); + Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); + checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector, num_lut * glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream)); @@ -391,19 +370,11 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, checkCudaErrors(cudaStreamSynchronize(*stream)); // Free memory -#if (CUDART_VERSION < 11020) - checkCudaErrors(cudaFree(d_ggsw_fft_in)); - checkCudaErrors(cudaFree(d_buffer1)); - 
checkCudaErrors(cudaFree(d_buffer2)); + cuda_drop_async(d_ggsw_fft_in, v_stream); + cuda_drop_async(d_buffer1, v_stream); + cuda_drop_async(d_buffer2, v_stream); if (max_shared_memory < memory_needed_per_block) - checkCudaErrors(cudaFree(d_mem)); -#else - checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream)); - checkCudaErrors(cudaFreeAsync(d_buffer1, *stream)); - checkCudaErrors(cudaFreeAsync(d_buffer2, *stream)); - if (max_shared_memory < memory_needed_per_block) - checkCudaErrors(cudaFreeAsync(d_mem, *stream)); -#endif + cuda_drop_async(d_mem, v_stream); } // only works for big lwe for ks+bs case diff --git a/src/device.cu b/src/device.cu index c3f8dd11b..3e67e3cde 100644 --- a/src/device.cu +++ b/src/device.cu @@ -31,6 +31,19 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) { return ptr; } +/// Allocate an array on the device asynchronously on the given stream +/// (falls back to synchronous cudaMalloc before CUDA 11.2) +void *cuda_malloc_async(uint64_t size, void *v_stream) { + void *ptr; + + #if (CUDART_VERSION < 11020) + checkCudaErrors(cudaMalloc((void **)&ptr, size)); + #else + auto stream = static_cast<cudaStream_t *>(v_stream); + checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream)); + #endif + return ptr; +} + /// Checks that allocation is valid /// 0: valid /// -1: invalid, not enough memory in device @@ -140,6 +153,13 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { return 0; } +/// Drop a cuda array asynchronously on the given stream +int cuda_drop_async(void *ptr, void *v_stream) { + auto stream = static_cast<cudaStream_t *>(v_stream); + checkCudaErrors(cudaFreeAsync(ptr, *stream)); + return 0; +} + /// Get the maximum size for the shared memory int cuda_get_max_shared_memory(uint32_t gpu_index) { if (gpu_index >= cuda_get_number_of_gpus()) {