feat(cuda): encapsulate asynchronous allocation methods.

Pedro Alves
2022-11-07 12:37:48 -03:00
committed by Agnès Leroy
parent 13e77b2d8c
commit cf222e9176
3 changed files with 34 additions and 39 deletions


@@ -7,6 +7,8 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
+void *cuda_malloc_async(uint64_t size, void *v_stream);
 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
@@ -26,5 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index);
 int cuda_drop(void *ptr, uint32_t gpu_index);
+int cuda_drop_async(void *ptr, void *v_stream);
 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
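
Together, the two new declarations give callers a single stream-ordered allocate/free pair that hides the CUDART_VERSION dispatch. A minimal usage sketch (hypothetical caller code; n is a placeholder element count):

    // v_stream is an opaque cudaStream_t *, as elsewhere in this API
    void *d_data = cuda_malloc_async(n * sizeof(uint64_t), v_stream);
    // ... enqueue kernels and copies on the same stream ...
    cuda_drop_async(d_data, v_stream);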


@@ -4,6 +4,7 @@
#include "cooperative_groups.h"
#include "../include/helper_cuda.h"
#include "device.h"
#include "bootstrap.h"
#include "bootstrap_low_latency.cuh"
#include "complex/operations.cuh"
@@ -296,17 +297,10 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   dim3 thds(polynomial_size / params::opt, 1, 1);
   //////////////////////
-  double2 *d_ggsw_fft_in;
   int ggsw_size = r * polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(
-      cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
-#else
-  checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in,
-                                  ggsw_size * sizeof(double), *stream));
-#endif
+  double2 *d_ggsw_fft_in =
+      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), v_stream);
   batch_fft_ggsw_vector<Torus, STorus, params>(v_stream, d_ggsw_fft_in, ggsw_in,
                                                r, glwe_dimension,
@@ -317,13 +311,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   // Allocate global memory in case parameters are too large
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
-#if (CUDART_VERSION < 11020)
-    checkCudaErrors(
-        cudaMalloc((void **)&d_mem, memory_needed_per_block * (1 << (r - 1))));
-#else
-    checkCudaErrors(cudaMallocAsync(
-        (void **)&d_mem, memory_needed_per_block * (1 << (r - 1)), *stream));
-#endif
+    d_mem = (char *)cuda_malloc_async(
+        memory_needed_per_block * (1 << (r - 1)), v_stream);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -335,19 +323,10 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   // Allocate buffers
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
-  Torus *d_buffer1, *d_buffer2;
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(
-      cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
-  checkCudaErrors(
-      cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
-#else
-  checkCudaErrors(cudaMallocAsync(
-      (void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream));
-  checkCudaErrors(cudaMallocAsync(
-      (void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream));
-#endif
+  Torus *d_buffer1 =
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream);
+  Torus *d_buffer2 =
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), v_stream);
   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
                                   cudaMemcpyDeviceToDevice, *stream));
@@ -391,19 +370,11 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   checkCudaErrors(cudaStreamSynchronize(*stream));
   // Free memory
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(cudaFree(d_ggsw_fft_in));
-  checkCudaErrors(cudaFree(d_buffer1));
-  checkCudaErrors(cudaFree(d_buffer2));
+  cuda_drop_async(d_ggsw_fft_in, v_stream);
+  cuda_drop_async(d_buffer1, v_stream);
+  cuda_drop_async(d_buffer2, v_stream);
   if (max_shared_memory < memory_needed_per_block)
-    checkCudaErrors(cudaFree(d_mem));
-#else
-  checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream));
-  checkCudaErrors(cudaFreeAsync(d_buffer1, *stream));
-  checkCudaErrors(cudaFreeAsync(d_buffer2, *stream));
-  if (max_shared_memory < memory_needed_per_block)
-    checkCudaErrors(cudaFreeAsync(d_mem, *stream));
-#endif
+    cuda_drop_async(d_mem, v_stream);
 }
 // only works for big lwe for ks+bs case
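
The cudaMemcpyAsync calls right after the buffer allocations need no extra synchronization: stream-ordered allocations complete before any later work enqueued on the same stream. A standalone illustration of that ordering, assuming only the wrappers introduced by this commit (the 1024-byte size is a placeholder):

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreate(&stream));
    void *buf = cuda_malloc_async(1024, &stream);           // enqueued first
    checkCudaErrors(cudaMemsetAsync(buf, 0, 1024, stream)); // runs after the alloc
    cuda_drop_async(buf, &stream);                          // runs after the memset
    checkCudaErrors(cudaStreamSynchronize(stream));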


@@ -31,6 +31,19 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
   return ptr;
 }
+/// Allocates a size-byte buffer in device memory, asynchronously when the
+/// toolkit supports it
+void *cuda_malloc_async(uint64_t size, void *v_stream) {
+  void *ptr;
+#if (CUDART_VERSION < 11020)
+  // Stream-ordered allocation requires CUDA >= 11.2; fall back to a
+  // synchronous cudaMalloc on older toolkits
+  checkCudaErrors(cudaMalloc((void **)&ptr, size));
+#else
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
+#endif
+  return ptr;
+}
 /// Checks that allocation is valid
 /// 0: valid
 /// -1: invalid, not enough memory in device
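
Callers can validate a size before allocating; a hedged sketch combining the two entry points (n, gpu_index, and v_stream stand in for the caller's own values):

    uint64_t size = n * sizeof(uint64_t);
    if (cuda_check_valid_malloc(size, gpu_index) == 0) {
      void *ptr = cuda_malloc_async(size, v_stream);
      // ... use ptr on the stream ...
      cuda_drop_async(ptr, v_stream);
    }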
@@ -140,6 +153,13 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
   return 0;
 }
+/// Drop a cuda array asynchronously
+int cuda_drop_async(void *ptr, void *v_stream) {
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  checkCudaErrors(cudaFreeAsync(ptr, *stream));
+  return 0;
+}
 /// Get the maximum size for the shared memory
 int cuda_get_max_shared_memory(uint32_t gpu_index) {
   if (gpu_index >= cuda_get_number_of_gpus()) {
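
One asymmetry worth noting: cuda_malloc_async guards on CUDART_VERSION, but cuda_drop_async calls cudaFreeAsync unconditionally, and cudaFreeAsync only exists from CUDA 11.2. A version-guarded variant would look like the following sketch (an illustration only, not part of this commit):

    int cuda_drop_async(void *ptr, void *v_stream) {
    #if (CUDART_VERSION < 11020)
      (void)v_stream; // no stream-ordered free before CUDA 11.2
      checkCudaErrors(cudaFree(ptr));
    #else
      auto stream = static_cast<cudaStream_t *>(v_stream);
      checkCudaErrors(cudaFreeAsync(ptr, *stream));
    #endif
      return 0;
    }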