From 25f103f62d2df69cbcda8540b9712a869281c7e8 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 7 Nov 2022 12:45:25 -0300 Subject: [PATCH] feat(cuda): Refactor the low latency PBS to use asynchronous allocation. --- include/device.h | 4 ++-- src/bootstrap_amortized.cuh | 8 ++++---- src/bootstrap_low_latency.cuh | 11 +++++------ src/bootstrap_wop.cuh | 16 ++++++++-------- src/device.cu | 19 +++++++++++-------- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/include/device.h b/include/device.h index 4192815d4..f292a5c3e 100644 --- a/include/device.h +++ b/include/device.h @@ -7,7 +7,7 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index); void *cuda_malloc(uint64_t size, uint32_t gpu_index); -void *cuda_malloc_async(uint64_t size, void *v_stream); +void *cuda_malloc_async(uint64_t size, cudaStream_t stream); int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); @@ -28,7 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index); int cuda_drop(void *ptr, uint32_t gpu_index); -int cuda_drop_async(void *ptr, void *v_stream); +int cuda_drop_async(void *ptr, cudaStream_t stream); int cuda_get_max_shared_memory(uint32_t gpu_index); } diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index f67f44d12..641870f87 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -338,7 +338,7 @@ __host__ void host_bootstrap_amortized( // from one of three templates (no use, partial use or full use // of shared memory) if (max_shared_memory < SM_PART) { - d_mem = (char*) cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count, v_stream); + d_mem = (char*) cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count, *stream); device_bootstrap_amortized<<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, @@ -348,7 +348,7 @@ __host__ void host_bootstrap_amortized( cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART); 
cudaFuncSetCacheConfig(device_bootstrap_amortized, cudaFuncCachePreferShared); - d_mem = (char*) cuda_malloc_async(DM_PART * input_lwe_ciphertext_count, v_stream); + d_mem = (char*) cuda_malloc_async(DM_PART * input_lwe_ciphertext_count, *stream); device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, @@ -366,7 +366,7 @@ __host__ void host_bootstrap_amortized( checkCudaErrors(cudaFuncSetCacheConfig( device_bootstrap_amortized, cudaFuncCachePreferShared)); - d_mem = (char*) cuda_malloc_async(0, v_stream); + d_mem = (char*) cuda_malloc_async(0, *stream); device_bootstrap_amortized <<>>( @@ -377,7 +377,7 @@ __host__ void host_bootstrap_amortized( // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); - cuda_drop_async(d_mem, v_stream); + cuda_drop_async(d_mem, *stream); } template diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 9c6206079..f7c127853 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -10,6 +10,7 @@ #include "cooperative_groups.h" #include "../include/helper_cuda.h" +#include "device.h" #include "bootstrap.h" #include "complex/operations.cuh" #include "crypto/gadget.cuh" @@ -261,10 +262,8 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out, int buffer_size_per_gpu = level_count * num_samples * polynomial_size / 2 * sizeof(double2); - double2 *mask_buffer_fft; - double2 *body_buffer_fft; - checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu)); - checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu)); + double2 *mask_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream); + double2 *body_buffer_fft = (double2*) cuda_malloc_async(buffer_size_per_gpu, *stream); int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp sizeof(Torus) * polynomial_size + // accumulator @@ -298,8 +297,8 @@ 
host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out, // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); - cudaFree(mask_buffer_fft); - cudaFree(body_buffer_fft); + cuda_drop_async(mask_buffer_fft, *stream); + cuda_drop_async(body_buffer_fft, *stream); } #endif // LOWLAT_PBS_H diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 159cc5d70..bca42d583 100644 --- a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -300,7 +300,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count; - double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * sizeof(double), v_stream); + double2 *d_ggsw_fft_in = (double2*) cuda_malloc_async(ggsw_size * sizeof(double), *stream); batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, @@ -311,7 +311,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate global memory in case parameters are too large char *d_mem; if (max_shared_memory < memory_needed_per_block) { - d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), v_stream); + d_mem = (char*) cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)), *stream); } else { checkCudaErrors(cudaFuncSetAttribute( device_batch_cmux, @@ -324,8 +324,8 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, // Allocate buffers int glwe_size = (glwe_dimension + 1) * polynomial_size; - Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); - Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream); + Torus *d_buffer1 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream); + Torus *d_buffer2 = (Torus*) cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream); 
checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector, num_lut * glwe_size * sizeof(Torus), @@ -370,11 +370,11 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in, checkCudaErrors(cudaStreamSynchronize(*stream)); // Free memory - cuda_drop_async(d_ggsw_fft_in, v_stream); - cuda_drop_async(d_buffer1, v_stream); - cuda_drop_async(d_buffer2, v_stream); + cuda_drop_async(d_ggsw_fft_in, *stream); + cuda_drop_async(d_buffer1, *stream); + cuda_drop_async(d_buffer2, *stream); if (max_shared_memory < memory_needed_per_block) - cuda_drop_async(d_mem, v_stream); + cuda_drop_async(d_mem, *stream); } // only works for big lwe for ks+bs case diff --git a/src/device.cu b/src/device.cu index 3e67e3cde..97fd70651 100644 --- a/src/device.cu +++ b/src/device.cu @@ -31,15 +31,14 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) { return ptr; } -/// -void *cuda_malloc_async(uint64_t size, void *v_stream) { +/// Allocates a size-byte array in device memory. Tries to do it asynchronously. +void *cuda_malloc_async(uint64_t size, cudaStream_t stream) { void *ptr; #if (CUDART_VERSION < 11020) checkCudaErrors(cudaMalloc((void **)&ptr, size)); #else - auto stream = static_cast(v_stream); - checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream)); + checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream)); #endif return ptr; } @@ -153,10 +152,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) { return 0; } -/// Drop a cuda array -int cuda_drop_async(void *ptr, void *v_stream) { - auto stream = static_cast(v_stream); - checkCudaErrors(cudaFreeAsync(ptr, *stream)); +/// Drop a CUDA array. Tries to do it asynchronously. +int cuda_drop_async(void *ptr, cudaStream_t stream) { + + #if (CUDART_VERSION < 11020) + checkCudaErrors(cudaFree(ptr)); + #else + checkCudaErrors(cudaFreeAsync(ptr, stream)); + #endif return 0; }