Mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-08 11:35:02 -05:00)
fix(cuda): Checks the cudaDevAttrMemoryPoolsSupported property to ensure that asynchronous allocation is supported
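Runtime support for cudaMallocAsync/cudaFreeAsync depends on the device, not only on the toolkit version: a device that does not report cudaDevAttrMemoryPoolsSupported cannot use stream-ordered allocation even when the code is built against CUDA 11.2 or later. The standalone sketch below illustrates the check this commit applies inside cuda_malloc_async; the helper name and the reduced error handling are illustrative assumptions, not the repository's code, which wraps every call in checkCudaErrors.

    #include <cuda_runtime.h>
    #include <cstddef>

    // Hypothetical sketch of the runtime capability check introduced by this
    // commit (the helper name is illustrative): query whether the target device
    // supports memory pools before using the stream-ordered allocator, and fall
    // back to the synchronous cudaMalloc otherwise.
    void *malloc_async_with_fallback(std::size_t size, cudaStream_t stream,
                                     int gpu_index) {
      void *ptr = nullptr;
      int pools_supported = 0;
      cudaDeviceGetAttribute(&pools_supported, cudaDevAttrMemoryPoolsSupported,
                             gpu_index);
      if (pools_supported)
        cudaMallocAsync(&ptr, size, stream); // stream-ordered allocation (CUDA >= 11.2)
      else
        cudaMalloc(&ptr, size); // synchronous fallback for devices without pool support
      return ptr;
    }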
@@ -7,7 +7,7 @@ int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream);
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
@@ -28,7 +28,7 @@ int cuda_synchronize_device(uint32_t gpu_index);
 
 int cuda_drop(void *ptr, uint32_t gpu_index);
 
-int cuda_drop_async(void *ptr, cudaStream_t stream);
+int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
@@ -304,6 +304,8 @@ __host__ void host_bootstrap_amortized(
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t lwe_idx, uint32_t max_shared_memory) {
 
+  uint32_t gpu_index = 0;
+
   int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
                 sizeof(Torus) * polynomial_size + // accumulator body
                 sizeof(Torus) * polynomial_size + // accumulator mask rotated
@@ -339,7 +341,7 @@ __host__ void host_bootstrap_amortized(
   // of shared memory)
   if (max_shared_memory < SM_PART) {
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
-                                      *stream);
+                                      *stream, gpu_index);
     device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -350,7 +352,7 @@ __host__ void host_bootstrap_amortized(
     cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                            cudaFuncCachePreferShared);
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
-                                      *stream);
+                                      *stream, gpu_index);
     device_bootstrap_amortized<Torus, params, PARTIALSM>
         <<<grid, thds, SM_PART, *stream>>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -368,7 +370,7 @@ __host__ void host_bootstrap_amortized(
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized<Torus, params, FULLSM>,
         cudaFuncCachePreferShared));
-    d_mem = (char *)cuda_malloc_async(0, *stream);
+    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
 
     device_bootstrap_amortized<Torus, params, FULLSM>
         <<<grid, thds, SM_FULL, *stream>>>(
@@ -379,7 +381,7 @@ __host__ void host_bootstrap_amortized(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(d_mem, *stream);
+  cuda_drop_async(d_mem, *stream, gpu_index);
 }
 
 template <typename Torus, class params>
@@ -258,14 +258,16 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out,
                            uint32_t base_log, uint32_t level_count,
                            uint32_t num_samples, uint32_t num_lut_vectors) {
 
+  uint32_t gpu_index = 0;
+
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
   int buffer_size_per_gpu =
       level_count * num_samples * polynomial_size / 2 * sizeof(double2);
   double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
   double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
 
   int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp
                      sizeof(Torus) * polynomial_size +   // accumulator
@@ -299,8 +301,8 @@ host_bootstrap_low_latency(void *v_stream, Torus *lwe_array_out,
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(mask_buffer_fft, *stream);
-  cuda_drop_async(body_buffer_fft, *stream);
+  cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
+  cuda_drop_async(body_buffer_fft, *stream, gpu_index);
 }
 
 #endif // LOWLAT_PBS_H
@@ -280,6 +280,8 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
                     uint32_t level_count, uint32_t r,
                     uint32_t max_shared_memory) {
 
+  uint32_t gpu_index = 0;
+
   auto stream = static_cast<cudaStream_t *>(v_stream);
   int num_lut = (1 << r);
 
@@ -301,7 +303,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
                   (glwe_dimension + 1) * level_count;
 
   double2 *d_ggsw_fft_in =
-      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), *stream);
+      (double2 *)cuda_malloc_async(ggsw_size * sizeof(double), *stream, gpu_index);
 
   batch_fft_ggsw_vector<Torus, STorus, params>(v_stream, d_ggsw_fft_in, ggsw_in,
                                                r, glwe_dimension,
@@ -313,7 +315,7 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
     d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
-                                      *stream);
+                                      *stream, gpu_index);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -327,9 +329,9 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
 
   Torus *d_buffer1 =
-      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
   Torus *d_buffer2 =
-      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream);
+      (Torus *)cuda_malloc_async(num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
 
   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -374,11 +376,11 @@ void host_cmux_tree(void *v_stream, Torus *glwe_array_out, Torus *ggsw_in,
   checkCudaErrors(cudaStreamSynchronize(*stream));
 
   // Free memory
-  cuda_drop_async(d_ggsw_fft_in, *stream);
-  cuda_drop_async(d_buffer1, *stream);
-  cuda_drop_async(d_buffer2, *stream);
+  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
+  cuda_drop_async(d_buffer1, *stream, gpu_index);
+  cuda_drop_async(d_buffer2, *stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream);
+    cuda_drop_async(d_mem, *stream, gpu_index);
 }
 
 // only works for big lwe for ks+bs case
@@ -33,14 +33,17 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream) {
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index) {
   void *ptr;
 
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(cudaMalloc((void **)&ptr, size));
-#else
-  checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
-#endif
+  int support_async_alloc;
+  checkCudaErrors(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+  if (support_async_alloc)
+    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
+  else
+    checkCudaErrors(cudaMalloc((void **)&ptr, size));
   return ptr;
 }
@@ -154,13 +157,16 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
 }
 
 /// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cudaStream_t stream) {
+int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
 
-#if (CUDART_VERSION < 11020)
-  checkCudaErrors(cudaFree(ptr));
-#else
-  checkCudaErrors(cudaFreeAsync(ptr, stream));
-#endif
+  int support_async_alloc;
+  checkCudaErrors(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
+
+  if (support_async_alloc)
+    checkCudaErrors(cudaFreeAsync(ptr, stream));
+  else
+    checkCudaErrors(cudaFree(ptr));
   return 0;
 }
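For context, a caller of the updated entry points might look like the sketch below. The cuda_malloc_async and cuda_drop_async signatures are taken from the header hunk in this diff; the stream setup uses the raw CUDA runtime API because the project's own stream helpers are not part of this commit, and the buffer size is an arbitrary example.

    #include <cuda_runtime.h>
    #include <cstdint>

    // Declarations copied from the header hunk above (normally pulled in via
    // the project's header, whose path is not shown in this diff).
    void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
    int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

    // Hypothetical caller of the gpu_index-aware helpers added in this commit.
    void example_caller() {
      cudaStream_t stream;
      cudaSetDevice(0);
      cudaStreamCreate(&stream);

      uint32_t gpu_index = 0;
      void *scratch = cuda_malloc_async(1024, stream, gpu_index); // size is arbitrary

      // ... enqueue kernels that use `scratch` on `stream` ...

      cuda_drop_async(scratch, stream, gpu_index);
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
    }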