From 80f4ca7338ce07a7ab07e467641cd194e2c95a2b Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Thu, 10 Nov 2022 09:47:09 -0300
Subject: [PATCH] fix(cuda): Checks the cudaDevAttrMemoryPoolsSupported
 property to ensure that asynchronous allocation is supported

---
 include/device.h              |  4 ++--
 src/bootstrap_amortized.cuh   | 10 ++++++----
 src/bootstrap_low_latency.cuh | 10 ++++++----
 src/bootstrap_wop.cuh         | 18 ++++++++++--------
 src/device.cu                 | 30 ++++++++++++++++++------------
 5 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/include/device.h b/include/device.h
index f292a5c3e..3dd11ebd6 100644
--- a/include/device.h
+++ b/include/device.h
@@ -7,7 +7,7 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream);
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 
@@ -28,7 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index);
 
 int cuda_drop(void *ptr, uint32_t gpu_index);
 
-int cuda_drop_async(void *ptr, cudaStream_t stream);
+int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index 40d6414a3..714ed760d 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -304,6 +304,8 @@ __host__ void host_bootstrap_amortized(
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  uint32_t gpu_index = 0;
+
   int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
                 sizeof(Torus) * polynomial_size + // accumulator body
                 sizeof(Torus) * polynomial_size + // accumulator mask rotated
@@ -339,7 +341,7 @@
   // of shared memory)
   if (max_shared_memory < SM_PART) {
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
-                                      *stream);
+                                      *stream, gpu_index);
     device_bootstrap_amortized<<>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -350,7 +352,7 @@
     cudaFuncSetCacheConfig(device_bootstrap_amortized,
                            cudaFuncCachePreferShared);
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
-                                      *stream);
+                                      *stream, gpu_index);
     device_bootstrap_amortized
         <<>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -368,7 +370,7 @@
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized,
         cudaFuncCachePreferShared));
-    d_mem = (char *)cuda_malloc_async(0, *stream);
+    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
 
     device_bootstrap_amortized
         <<>>(
@@ -379,7 +381,7 @@
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(d_mem, *stream);
+  cuda_drop_async(d_mem, *stream, gpu_index);
 }
 
 template
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 0610c1ab1..e9335d593 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -258,14 +258,16 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_lut_vectors) {
 
+  uint32_t gpu_index = 0;
+
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
   int buffer_size_per_gpu =
       level_count * num_samples * polynomial_size / 2 * sizeof(double2);
   double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
   double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
 
   int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp
                      sizeof(Torus) * polynomial_size +   // accumulator
@@ -299,8 +301,8 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out,
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(mask_buffer_fft, *stream);
-  cuda_drop_async(body_buffer_fft, *stream);
+  cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
+  cuda_drop_async(body_buffer_fft, *stream, gpu_index);
 }
 
 #endif // LOWLAT_PBS_H
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index fcd8ea3dc..6e7e27640 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -280,6 +280,8 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
                     uint32_t level_count, uint32_t r, uint32_t max_shared_memory) {
 
+  uint32_t gpu_index = 0;
+
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
   int num_lut = (1 << r);
@@ -301,7 +303,7 @@
                   (glwe_dimension + 1) * level_count;
 
   double2 *d_ggsw_fft_in =
-      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), *stream);
+      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), *stream, gpu_index);
 
   batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension,
@@ -313,7 +315,7 @@
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
     d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
-                                      *stream);
+                                      *stream, gpu_index);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux, cudaFuncAttributeMaxDynamicSharedMemorySize,
@@ -327,9 +329,9 @@
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
 
   Torus *d_buffer1 =
-      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
   Torus *d_buffer2 =
-      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
 
   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -374,11 +376,11 @@
   checkCudaErrors(cudaStreamSynchronize(*stream));
 
   // Free memory
-  cuda_drop_async(d_ggsw_fft_in, *stream);
-  cuda_drop_async(d_buffer1, *stream);
-  cuda_drop_async(d_buffer2, *stream);
+  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
+  cuda_drop_async(d_buffer1, *stream, gpu_index);
+  cuda_drop_async(d_buffer2, *stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream);
+    cuda_drop_async(d_mem, *stream, gpu_index);
 }
 
 // only works for big lwe for ks+bs case
diff --git a/src/device.cu b/src/device.cu
index 886bcbd61..974d3bd31 100644
--- a/src/device.cu
+++ b/src/device.cu
@@ -33,14 +33,17 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream) {
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index) {
   void *ptr;
 
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(cudaMalloc((void **)&ptr, size));
-#else
-  checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
-#endif
+  int support_async_alloc;
+  checkCudaErrors(cudaDeviceGetAttribute(&support_async_alloc, cudaDevAttrMemoryPoolsSupported,
+                                         gpu_index));
+
+  if(support_async_alloc)
+    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
+  else
+    checkCudaErrors(cudaMalloc((void **)&ptr, size));
 
   return ptr;
 }
@@ -154,13 +157,16 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
 }
 
 /// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cudaStream_t stream) {
+int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
 
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(cudaFree(ptr));
-#else
-  checkCudaErrors(cudaFreeAsync(ptr, stream));
-#endif
+  int support_async_alloc;
+  checkCudaErrors(cudaDeviceGetAttribute(&support_async_alloc, cudaDevAttrMemoryPoolsSupported,
+                                         gpu_index));
+
+  if(support_async_alloc)
+    checkCudaErrors(cudaFreeAsync(ptr, stream));
+  else
+    checkCudaErrors(cudaFree(ptr));
 
   return 0;
 }
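
A minimal standalone sketch of the runtime fallback this patch introduces, for trying the check in isolation. It is illustrative only and not part of the patch: the file name, allocation size, and main() driver are made up, and it assumes a CUDA 11.2+ toolkit so that cudaMallocAsync/cudaFreeAsync are declared.

// pool_check.cu (hypothetical file name): query cudaDevAttrMemoryPoolsSupported
// and choose between stream-ordered allocation and the synchronous fallback,
// mirroring what cuda_malloc_async/cuda_drop_async now do on every call.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int gpu_index = 0;
  cudaSetDevice(gpu_index);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Reports 1 when the device supports stream-ordered memory pools, 0 otherwise.
  int support_async_alloc = 0;
  cudaDeviceGetAttribute(&support_async_alloc, cudaDevAttrMemoryPoolsSupported,
                         gpu_index);

  void *ptr = nullptr;
  if (support_async_alloc)
    cudaMallocAsync(&ptr, 1024, stream); // stream-ordered allocation
  else
    cudaMalloc(&ptr, 1024);              // synchronous fallback

  if (support_async_alloc)
    cudaFreeAsync(ptr, stream);
  else
    cudaFree(ptr);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  printf("memory pools supported: %d\n", support_async_alloc);
  return 0;
}

Compared to the previous CUDART_VERSION < 11020 guard, the decision is now made per call at runtime, so a binary built against CUDA 11.2+ falls back to cudaMalloc/cudaFree on devices that report no memory-pool support instead of erroring inside cudaMallocAsync.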